Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * xlogrecovery.c
4 : * Functions for WAL recovery, standby mode
5 : *
6 : * This source file contains functions controlling WAL recovery.
7 : * InitWalRecovery() initializes the system for crash or archive recovery,
8 : * or standby mode, depending on configuration options and the state of
9 : * the control file and possible backup label file. PerformWalRecovery()
10 : * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 : * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 : * and prepares information needed to initialize the WAL for writes. In
13 : * addition to these three main functions, there are a bunch of functions
14 : * for interrogating recovery state and controlling the recovery process.
15 : *
16 : *
17 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
18 : * Portions Copyright (c) 1994, Regents of the University of California
19 : *
20 : * src/backend/access/transam/xlogrecovery.c
21 : *
22 : *-------------------------------------------------------------------------
23 : */
24 :
25 : #include "postgres.h"
26 :
27 : #include <ctype.h>
28 : #include <math.h>
29 : #include <time.h>
30 : #include <sys/stat.h>
31 : #include <sys/time.h>
32 : #include <unistd.h>
33 :
34 : #include "access/timeline.h"
35 : #include "access/transam.h"
36 : #include "access/xact.h"
37 : #include "access/xlog_internal.h"
38 : #include "access/xlogarchive.h"
39 : #include "access/xlogprefetcher.h"
40 : #include "access/xlogreader.h"
41 : #include "access/xlogrecovery.h"
42 : #include "access/xlogutils.h"
43 : #include "backup/basebackup.h"
44 : #include "catalog/pg_control.h"
45 : #include "commands/tablespace.h"
46 : #include "common/file_utils.h"
47 : #include "miscadmin.h"
48 : #include "pgstat.h"
49 : #include "postmaster/bgwriter.h"
50 : #include "postmaster/startup.h"
51 : #include "replication/slot.h"
52 : #include "replication/slotsync.h"
53 : #include "replication/walreceiver.h"
54 : #include "storage/fd.h"
55 : #include "storage/ipc.h"
56 : #include "storage/latch.h"
57 : #include "storage/pmsignal.h"
58 : #include "storage/procarray.h"
59 : #include "storage/spin.h"
60 : #include "utils/datetime.h"
61 : #include "utils/fmgrprotos.h"
62 : #include "utils/guc_hooks.h"
63 : #include "utils/pgstat_internal.h"
64 : #include "utils/pg_lsn.h"
65 : #include "utils/ps_status.h"
66 : #include "utils/pg_rusage.h"
67 :
68 : /* Unsupported old recovery command file names (relative to $PGDATA) */
69 : #define RECOVERY_COMMAND_FILE "recovery.conf"
70 : #define RECOVERY_COMMAND_DONE "recovery.done"
71 :
72 : /*
73 : * GUC support
74 : */
75 : const struct config_enum_entry recovery_target_action_options[] = {
76 : {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
77 : {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
78 : {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
79 : {NULL, 0, false}
80 : };
81 :
82 : /* options formerly taken from recovery.conf for archive recovery */
83 : char *recoveryRestoreCommand = NULL;
84 : char *recoveryEndCommand = NULL;
85 : char *archiveCleanupCommand = NULL;
86 : RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
87 : bool recoveryTargetInclusive = true;
88 : int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
89 : TransactionId recoveryTargetXid;
90 : char *recovery_target_time_string;
91 : TimestampTz recoveryTargetTime;
92 : const char *recoveryTargetName;
93 : XLogRecPtr recoveryTargetLSN;
94 : int recovery_min_apply_delay = 0;
95 :
96 : /* options formerly taken from recovery.conf for XLOG streaming */
97 : char *PrimaryConnInfo = NULL;
98 : char *PrimarySlotName = NULL;
99 : bool wal_receiver_create_temp_slot = false;
100 :
101 : /*
102 : * recoveryTargetTimeLineGoal: what the user requested, if any
103 : *
104 : * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
105 : *
106 : * recoveryTargetTLI: the currently understood target timeline; may change during recovery
107 : *
108 : * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
109 : * the timelines of its known parents, newest first (so recoveryTargetTLI is
110 : * always the first list member). Only these TLIs are expected to be seen in
111 : * the WAL segments we read, and indeed only these TLIs will be considered as
112 : * candidate WAL files to open at all.
113 : *
114 : * curFileTLI: the TLI appearing in the name of the current input WAL file.
115 : * (This is not necessarily the same as the timeline from which we are
116 : * replaying WAL, which StartupXLOG calls replayTLI, because we could be
117 : * scanning data that was copied from an ancestor timeline when the current
118 : * file was created.) During a sequential scan we do not allow this value
119 : * to decrease.
120 : */
121 : RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
122 : TimeLineID recoveryTargetTLIRequested = 0;
123 : TimeLineID recoveryTargetTLI = 0;
124 : static List *expectedTLEs;
125 : static TimeLineID curFileTLI;
126 :
127 : /*
128 : * When ArchiveRecoveryRequested is set, archive recovery was requested,
129 : * ie. signal files were present. When InArchiveRecovery is set, we are
130 : * currently recovering using offline XLOG archives. These variables are only
131 : * valid in the startup process.
132 : *
133 : * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
134 : * currently performing crash recovery using only XLOG files in pg_wal, but
135 : * will switch to using offline XLOG archives as soon as we reach the end of
136 : * WAL in pg_wal.
137 : */
138 : bool ArchiveRecoveryRequested = false;
139 : bool InArchiveRecovery = false;
140 :
141 : /*
142 : * When StandbyModeRequested is set, standby mode was requested, i.e.
143 : * standby.signal file was present. When StandbyMode is set, we are currently
144 : * in standby mode. These variables are only valid in the startup process.
145 : * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
146 : */
147 : static bool StandbyModeRequested = false;
148 : bool StandbyMode = false;
149 :
150 : /* was a signal file present at startup? */
151 : static bool standby_signal_file_found = false;
152 : static bool recovery_signal_file_found = false;
153 :
154 : /*
155 : * CheckPointLoc is the position of the checkpoint record that determines
156 : * where to start the replay. It comes from the backup label file or the
157 : * control file.
158 : *
159 : * RedoStartLSN is the checkpoint's REDO location, also from the backup label
160 : * file or the control file. In standby mode, XLOG streaming usually starts
161 : * from the position where an invalid record was found. But if we fail to
162 : * read even the initial checkpoint record, we use the REDO location instead
163 : * of the checkpoint location as the start position of XLOG streaming.
164 : * Otherwise we would have to jump backwards to the REDO location after
165 : * reading the checkpoint record, because the REDO record can precede the
166 : * checkpoint record.
167 : */
168 : static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
169 : static TimeLineID CheckPointTLI = 0;
170 : static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
171 : static TimeLineID RedoStartTLI = 0;
172 :
173 : /*
174 : * Local copy of SharedHotStandbyActive variable. False actually means "not
175 : * known, need to check the shared state".
176 : */
177 : static bool LocalHotStandbyActive = false;
178 :
179 : /*
180 : * Local copy of SharedPromoteIsTriggered variable. False actually means "not
181 : * known, need to check the shared state".
182 : */
183 : static bool LocalPromoteIsTriggered = false;
184 :
185 : /* Has the recovery code requested a walreceiver wakeup? */
186 : static bool doRequestWalReceiverReply;
187 :
188 : /* XLogReader object used to parse the WAL records */
189 : static XLogReaderState *xlogreader = NULL;
190 :
191 : /* XLogPrefetcher object used to consume WAL records with read-ahead */
192 : static XLogPrefetcher *xlogprefetcher = NULL;
193 :
194 : /* Parameters passed down from ReadRecord to the XLogPageRead callback; allocated in InitWalRecovery and given to XLogReaderAllocate. */
195 : typedef struct XLogPageReadPrivate
196 : {
197 : int emode; /* message level from ReadRecord's emode argument (e.g. LOG) */
198 : bool fetching_ckpt; /* are we fetching a checkpoint record? */
199 : bool randAccess; /* NOTE(review): presumably set for a non-sequential read -- confirm in ReadRecord */
200 : TimeLineID replayTLI; /* replayTLI argument given to ReadRecord */
201 : } XLogPageReadPrivate;
202 :
203 : /* flag to tell XLogPageRead that we have started replaying */
204 : static bool InRedo = false;
205 :
206 : /*
207 : * Codes indicating where we got a WAL file from during recovery, or where
208 : * to attempt to get one. Keep the order in sync with xlogSourceNames[].
209 : */
210 : typedef enum
211 : {
212 : XLOG_FROM_ANY = 0, /* request to read WAL from any source */
213 : XLOG_FROM_ARCHIVE, /* restored using restore_command */
214 : XLOG_FROM_PG_WAL, /* existing file in pg_wal */
215 : XLOG_FROM_STREAM, /* streamed from primary */
216 : } XLogSource;
217 :
218 : /* human-readable names for XLogSources, for debugging output; listed in enumerator order (presumably indexed by an XLogSource value -- confirm at use sites) */
219 : static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
220 :
221 : /*
222 : * readFile is -1 or a kernel FD for the log file segment that's currently
223 : * open for reading. readSegNo identifies the segment. readOff is the offset
224 : * of the page just read, readLen indicates how much of it has been read into
225 : * readBuf, and readSource indicates where we got the currently open file from.
226 : *
227 : * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
228 : * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
229 : * worthwhile, since the XLOG is not read by general-purpose sessions.
230 : */
231 : static int readFile = -1;
232 : static XLogSegNo readSegNo = 0;
233 : static uint32 readOff = 0;
234 : static uint32 readLen = 0;
235 : static XLogSource readSource = XLOG_FROM_ANY;
236 :
237 : /*
238 : * Keeps track of which source we're currently reading from. This is
239 : * different from readSource in that this is always set, even when we don't
240 : * currently have a WAL file open. If lastSourceFailed is set, our last
241 : * attempt to read from currentSource failed, and we should try another source
242 : * next.
243 : *
244 : * pendingWalRcvRestart is set when a config change occurs that requires a
245 : * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
246 : */
247 : static XLogSource currentSource = XLOG_FROM_ANY;
248 : static bool lastSourceFailed = false;
249 : static bool pendingWalRcvRestart = false;
250 :
251 : /*
252 : * These variables track when we last obtained some WAL data to process,
253 : * and where we got it from. (XLogReceiptSource is initially the same as
254 : * readSource, but readSource gets reset to zero when we don't have data
255 : * to process right now. It is also different from currentSource, which
256 : * also changes when we try to read from a source and fail, while
257 : * XLogReceiptSource tracks where we last successfully read some WAL.)
258 : */
259 : static TimestampTz XLogReceiptTime = 0;
260 : static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
261 :
262 : /* Local copy of WalRcv->flushedUpto */
263 : static XLogRecPtr flushedUpto = 0;
264 : static TimeLineID receiveTLI = 0;
265 :
266 : /*
267 : * Copy of minRecoveryPoint and backupEndPoint from the control file.
268 : *
269 : * In order to reach consistency, we must replay the WAL up to
270 : * minRecoveryPoint. If backupEndRequired is true, we must also reach
271 : * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
272 : * to backupStartPoint.
273 : *
274 : * Note: In archive recovery, after consistency has been reached, the
275 : * functions in xlog.c will start updating minRecoveryPoint in the control
276 : * file. But this copy of minRecoveryPoint variable reflects the value at the
277 : * beginning of recovery, and is *not* updated after consistency is reached.
278 : */
279 : static XLogRecPtr minRecoveryPoint;
280 : static TimeLineID minRecoveryPointTLI;
281 :
282 : static XLogRecPtr backupStartPoint;
283 : static XLogRecPtr backupEndPoint;
284 : static bool backupEndRequired = false;
285 :
286 : /*
287 : * Have we reached a consistent database state? In crash recovery, we have
288 : * to replay all the WAL, so reachedConsistency is never set. During archive
289 : * recovery, the database is consistent once minRecoveryPoint is reached.
290 : *
291 : * Consistent state means that the system is internally consistent, all
292 : * the WAL has been replayed up to a certain point, and importantly, there
293 : * is no trace of later actions on disk.
294 : *
295 : * This flag is used only by the startup process and postmaster. When
296 : * minRecoveryPoint is reached, the startup process sets it to true and
297 : * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
298 : * which then sets it to true upon receiving the signal.
299 : */
300 : bool reachedConsistency = false;
301 :
302 : /* Buffers dedicated to consistency checks of size BLCKSZ */
303 : static char *replay_image_masked = NULL;
304 : static char *primary_image_masked = NULL;
305 :
306 :
307 : /*
308 : * Shared-memory state for WAL recovery (set up by XLogRecoveryShmemInit).
309 : */
310 : typedef struct XLogRecoveryCtlData
311 : {
312 : /*
313 : * SharedHotStandbyActive indicates if we allow hot standby queries to be
314 : * run. Protected by info_lck.
315 : */
316 : bool SharedHotStandbyActive;
317 :
318 : /*
319 : * SharedPromoteIsTriggered indicates if a standby promotion has been
320 : * triggered. Protected by info_lck.
321 : */
322 : bool SharedPromoteIsTriggered;
323 :
324 : /*
325 : * recoveryWakeupLatch is used to wake up the startup process to continue
326 : * WAL replay, if it is waiting for WAL to arrive or promotion to be
327 : * requested.
328 : *
329 : * Note that the startup process also uses another latch, its procLatch,
330 : * to wait for recovery conflict. Replacing recoveryWakeupLatch with the
331 : * procLatch for signaling the startup process would comport better with
332 : * possible generic signal handlers using that latch. However, we should
333 : * not do that, because the startup process does not expect to be woken
334 : * up by the walreceiver process or the SIGHUP signal handler while it is
335 : * waiting for a recovery conflict. The separate latches,
336 : * recoveryWakeupLatch and procLatch, should be used for inter-process
337 : * communication for WAL replay and recovery conflict, respectively.
338 : */
339 : Latch recoveryWakeupLatch;
340 :
341 : /*
342 : * Last record successfully replayed.
343 : */
344 : XLogRecPtr lastReplayedReadRecPtr; /* start position */
345 : XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
346 : TimeLineID lastReplayedTLI; /* timeline */
347 :
348 : /*
349 : * When we're currently replaying a record, i.e. in a redo function,
350 : * replayEndRecPtr points to the end+1 of the record being replayed,
351 : * otherwise it's equal to lastReplayedEndRecPtr.
352 : */
353 : XLogRecPtr replayEndRecPtr;
354 : TimeLineID replayEndTLI;
355 : /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
356 : TimestampTz recoveryLastXTime;
357 :
358 : /*
359 : * timestamp of when we started replaying the current chunk of WAL data,
360 : * only relevant for replication or archive recovery
361 : */
362 : TimestampTz currentChunkStartTime;
363 : /* Recovery pause state */
364 : RecoveryPauseState recoveryPauseState;
365 : ConditionVariable recoveryNotPausedCV; /* initialized in XLogRecoveryShmemInit */
366 :
367 : slock_t info_lck; /* spinlock; locks shared variables shown above */
368 : } XLogRecoveryCtlData;
369 :
370 : static XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
371 :
372 : /*
373 : * abortedRecPtr is the start pointer of a broken record at end of WAL when
374 : * recovery completes; missingContrecPtr is the location of the first
375 : * contrecord that went missing. See CreateOverwriteContrecordRecord for
376 : * details.
377 : */
378 : static XLogRecPtr abortedRecPtr;
379 : static XLogRecPtr missingContrecPtr;
380 :
381 : /*
382 : * if recoveryStopsBefore/After returns true, it saves information of the stop
383 : * point here
384 : */
385 : static TransactionId recoveryStopXid;
386 : static TimestampTz recoveryStopTime;
387 : static XLogRecPtr recoveryStopLSN;
388 : static char recoveryStopName[MAXFNAMELEN];
389 : static bool recoveryStopAfter;
390 :
391 : /* prototypes for local functions */
392 : static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
393 :
394 : static void EnableStandbyMode(void);
395 : static void readRecoverySignalFile(void);
396 : static void validateRecoveryParameters(void);
397 : static bool read_backup_label(XLogRecPtr *checkPointLoc,
398 : TimeLineID *backupLabelTLI,
399 : bool *backupEndRequired, bool *backupFromStandby);
400 : static bool read_tablespace_map(List **tablespaces);
401 :
402 : static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
403 : static void CheckRecoveryConsistency(void);
404 : static void rm_redo_error_callback(void *arg);
405 : #ifdef WAL_DEBUG
406 : static void xlog_outrec(StringInfo buf, XLogReaderState *record);
407 : #endif
408 : static void xlog_block_info(StringInfo buf, XLogReaderState *record);
409 : static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
410 : TimeLineID prevTLI, TimeLineID replayTLI);
411 : static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
412 : static void verifyBackupPageConsistency(XLogReaderState *record);
413 :
414 : static bool recoveryStopsBefore(XLogReaderState *record);
415 : static bool recoveryStopsAfter(XLogReaderState *record);
416 : static char *getRecoveryStopReason(void);
417 : static void recoveryPausesHere(bool endOfRecovery);
418 : static bool recoveryApplyDelay(XLogReaderState *record);
419 : static void ConfirmRecoveryPaused(void);
420 :
421 : static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher,
422 : int emode, bool fetching_ckpt,
423 : TimeLineID replayTLI);
424 :
425 : static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
426 : int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
427 : static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
428 : bool randAccess,
429 : bool fetching_ckpt,
430 : XLogRecPtr tliRecPtr,
431 : TimeLineID replayTLI,
432 : XLogRecPtr replayLSN,
433 : bool nonblocking);
434 : static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
435 : static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
436 : XLogRecPtr RecPtr, TimeLineID replayTLI);
437 : static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
438 : static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
439 : XLogSource source, bool notfoundOk);
440 : static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source);
441 :
442 : static bool CheckForStandbyTrigger(void);
443 : static void SetPromoteIsTriggered(void);
444 : static bool HotStandbyActiveInReplay(void);
445 :
446 : static void SetCurrentChunkStartTime(TimestampTz xtime);
447 : static void SetLatestXTime(TimestampTz xtime);
448 :
449 : /*
450 : * Initialization of shared memory for WAL recovery
451 : */
452 : Size
453 5994 : XLogRecoveryShmemSize(void)
454 : {
455 : Size size;
456 :
457 : /* XLogRecoveryCtl */
458 5994 : size = sizeof(XLogRecoveryCtlData);
459 :
460 5994 : return size;
461 : }
462 :
463 : void
464 2096 : XLogRecoveryShmemInit(void)
465 : {
466 : bool found;
467 :
468 2096 : XLogRecoveryCtl = (XLogRecoveryCtlData *)
469 2096 : ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
470 2096 : if (found)
471 0 : return;
472 2096 : memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
473 :
474 2096 : SpinLockInit(&XLogRecoveryCtl->info_lck);
475 2096 : InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
476 2096 : ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
477 : }
478 :
479 : /*
480 : * A thin wrapper to enable StandbyMode and do other preparatory work as
481 : * needed.
482 : */
483 : static void
484 204 : EnableStandbyMode(void)
485 : {
486 204 : StandbyMode = true;
487 :
488 : /*
489 : * To avoid server log bloat, we don't report recovery progress in a
490 : * standby as it will always be in recovery unless promoted. We disable
491 : * startup progress timeout in standby mode to avoid calling
492 : * startup_progress_timeout_handler() unnecessarily.
493 : */
494 204 : disable_startup_progress_timeout();
495 204 : }
496 :
497 : /*
498 : * Prepare the system for WAL recovery, if needed.
499 : *
500 : * This is called by StartupXLOG() which coordinates the server startup
501 : * sequence. This function analyzes the control file and the backup label
502 : * file, if any, and figures out whether we need to perform crash recovery or
503 : * archive recovery, and how far we need to replay the WAL to reach a
504 : * consistent state.
505 : *
506 : * This doesn't yet change the on-disk state, except for creating the symlinks
507 : * from table space map file if any, and for fetching WAL files needed to find
508 : * the checkpoint record. On entry, the caller has already read the control
509 : * file into memory, and passes it as argument. This function updates it to
510 : * reflect the recovery state, and the caller is expected to write it back to
511 : * disk after initializing other subsystems, but before calling
512 : * PerformWalRecovery().
513 : *
514 : * This initializes some global variables like ArchiveRecoveryRequested, and
515 : * StandbyModeRequested and InRecovery.
516 : */
517 : void
518 1810 : InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
519 : bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
520 : {
521 : XLogPageReadPrivate *private;
522 : struct stat st;
523 : bool wasShutdown;
524 : XLogRecord *record;
525 : DBState dbstate_at_startup;
526 1810 : bool haveTblspcMap = false;
527 1810 : bool haveBackupLabel = false;
528 : CheckPoint checkPoint;
529 1810 : bool backupFromStandby = false;
530 :
531 1810 : dbstate_at_startup = ControlFile->state;
532 :
533 : /*
534 : * Initialize on the assumption we want to recover to the latest timeline
535 : * that's active according to pg_control.
536 : */
537 1810 : if (ControlFile->minRecoveryPointTLI >
538 1810 : ControlFile->checkPointCopy.ThisTimeLineID)
539 4 : recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
540 : else
541 1806 : recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
542 :
543 : /*
544 : * Check for signal files, and if so set up state for offline recovery
545 : */
546 1810 : readRecoverySignalFile();
547 1810 : validateRecoveryParameters();
548 :
549 : /*
550 : * Take ownership of the wakeup latch if we're going to sleep during
551 : * recovery, if required.
552 : */
553 1810 : if (ArchiveRecoveryRequested)
554 214 : OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
555 :
556 : /*
557 : * Set the WAL reading processor now, as it will be needed when reading
558 : * the checkpoint record required (backup_label or not).
559 : */
560 1810 : private = palloc0(sizeof(XLogPageReadPrivate));
561 1810 : xlogreader =
562 1810 : XLogReaderAllocate(wal_segment_size, NULL,
563 1810 : XL_ROUTINE(.page_read = &XLogPageRead,
564 : .segment_open = NULL,
565 : .segment_close = wal_segment_close),
566 : private);
567 1810 : if (!xlogreader)
568 0 : ereport(ERROR,
569 : (errcode(ERRCODE_OUT_OF_MEMORY),
570 : errmsg("out of memory"),
571 : errdetail("Failed while allocating a WAL reading processor.")));
572 1810 : xlogreader->system_identifier = ControlFile->system_identifier;
573 :
574 : /*
575 : * Set the WAL decode buffer size. This limits how far ahead we can read
576 : * in the WAL.
577 : */
578 1810 : XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size);
579 :
580 : /* Create a WAL prefetcher. */
581 1810 : xlogprefetcher = XLogPrefetcherAllocate(xlogreader);
582 :
583 : /*
584 : * Allocate two page buffers dedicated to WAL consistency checks. We do
585 : * it this way, rather than just making static arrays, for two reasons:
586 : * (1) no need to waste the storage in most instantiations of the backend;
587 : * (2) a static char array isn't guaranteed to have any particular
588 : * alignment, whereas palloc() will provide MAXALIGN'd storage.
589 : */
590 1810 : replay_image_masked = (char *) palloc(BLCKSZ);
591 1810 : primary_image_masked = (char *) palloc(BLCKSZ);
592 :
593 : /*
594 : * Read the backup_label file. We want to run this part of the recovery
595 : * process after checking for signal files and after performing validation
596 : * of the recovery parameters.
597 : */
598 1810 : if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
599 : &backupFromStandby))
600 : {
601 140 : List *tablespaces = NIL;
602 :
603 : /*
604 : * Archive recovery was requested, and thanks to the backup label
605 : * file, we know how far we need to replay to reach consistency. Enter
606 : * archive recovery directly.
607 : */
608 140 : InArchiveRecovery = true;
609 140 : if (StandbyModeRequested)
610 118 : EnableStandbyMode();
611 :
612 : /*
613 : * Omitting backup_label when creating a new replica, PITR node etc.
614 : * unfortunately is a common cause of corruption. Logging that
615 : * backup_label was used makes it a bit easier to exclude that as the
616 : * cause of observed corruption.
617 : *
618 : * Do so before we try to read the checkpoint record (which can fail),
619 : * as otherwise it can be hard to understand why a checkpoint other
620 : * than ControlFile->checkPoint is used.
621 : */
622 140 : ereport(LOG,
623 : (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
624 : LSN_FORMAT_ARGS(RedoStartLSN),
625 : LSN_FORMAT_ARGS(CheckPointLoc),
626 : CheckPointTLI)));
627 :
628 : /*
629 : * When a backup_label file is present, we want to roll forward from
630 : * the checkpoint it identifies, rather than using pg_control.
631 : */
632 140 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
633 : CheckPointTLI);
634 140 : if (record != NULL)
635 : {
636 140 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
637 140 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
638 140 : ereport(DEBUG1,
639 : (errmsg_internal("checkpoint record is at %X/%X",
640 : LSN_FORMAT_ARGS(CheckPointLoc))));
641 140 : InRecovery = true; /* force recovery even if SHUTDOWNED */
642 :
643 : /*
644 : * Make sure that REDO location exists. This may not be the case
645 : * if there was a crash during an online backup, which left a
646 : * backup_label around that references a WAL segment that's
647 : * already been archived.
648 : */
649 140 : if (checkPoint.redo < CheckPointLoc)
650 : {
651 140 : XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
652 140 : if (!ReadRecord(xlogprefetcher, LOG, false,
653 : checkPoint.ThisTimeLineID))
654 0 : ereport(FATAL,
655 : (errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X",
656 : LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
657 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
658 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
659 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
660 : DataDir, DataDir, DataDir, DataDir)));
661 : }
662 : }
663 : else
664 : {
665 0 : ereport(FATAL,
666 : (errmsg("could not locate required checkpoint record at %X/%X",
667 : LSN_FORMAT_ARGS(CheckPointLoc)),
668 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
669 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
670 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
671 : DataDir, DataDir, DataDir, DataDir)));
672 : wasShutdown = false; /* keep compiler quiet */
673 : }
674 :
675 : /* Read the tablespace_map file if present and create symlinks. */
676 140 : if (read_tablespace_map(&tablespaces))
677 : {
678 : ListCell *lc;
679 :
680 8 : foreach(lc, tablespaces)
681 : {
682 4 : tablespaceinfo *ti = lfirst(lc);
683 : char *linkloc;
684 :
685 4 : linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
686 :
687 : /*
688 : * Remove the existing symlink if any and Create the symlink
689 : * under PGDATA.
690 : */
691 4 : remove_tablespace_symlink(linkloc);
692 :
693 4 : if (symlink(ti->path, linkloc) < 0)
694 0 : ereport(ERROR,
695 : (errcode_for_file_access(),
696 : errmsg("could not create symbolic link \"%s\": %m",
697 : linkloc)));
698 :
699 4 : pfree(ti->path);
700 4 : pfree(ti);
701 : }
702 :
703 : /* tell the caller to delete it later */
704 4 : haveTblspcMap = true;
705 : }
706 :
707 : /* tell the caller to delete it later */
708 140 : haveBackupLabel = true;
709 : }
710 : else
711 : {
712 : /* No backup_label file has been found if we are here. */
713 :
714 : /*
715 : * If tablespace_map file is present without backup_label file, there
716 : * is no use of such file. There is no harm in retaining it, but it
717 : * is better to get rid of the map file so that we don't have any
718 : * redundant file in data directory and it will avoid any sort of
719 : * confusion. It seems prudent though to just rename the file out of
720 : * the way rather than delete it completely, also we ignore any error
721 : * that occurs in rename operation as even if map file is present
722 : * without backup_label file, it is harmless.
723 : */
724 1670 : if (stat(TABLESPACE_MAP, &st) == 0)
725 : {
726 2 : unlink(TABLESPACE_MAP_OLD);
727 2 : if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
728 2 : ereport(LOG,
729 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
730 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
731 : errdetail("File \"%s\" was renamed to \"%s\".",
732 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
733 : else
734 0 : ereport(LOG,
735 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
736 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
737 : errdetail("Could not rename file \"%s\" to \"%s\": %m.",
738 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
739 : }
740 :
741 : /*
742 : * It's possible that archive recovery was requested, but we don't
743 : * know how far we need to replay the WAL before we reach consistency.
744 : * This can happen for example if a base backup is taken from a
745 : * running server using an atomic filesystem snapshot, without calling
746 : * pg_backup_start/stop. Or if you just kill a running primary server
747 : * and put it into archive recovery by creating a recovery signal
748 : * file.
749 : *
750 : * Our strategy in that case is to perform crash recovery first,
751 : * replaying all the WAL present in pg_wal, and only enter archive
752 : * recovery after that.
753 : *
754 : * But usually we already know how far we need to replay the WAL (up
755 : * to minRecoveryPoint, up to backupEndPoint, or until we see an
756 : * end-of-backup record), and we can enter archive recovery directly.
757 : */
758 1670 : if (ArchiveRecoveryRequested &&
759 86 : (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
760 18 : ControlFile->backupEndRequired ||
761 18 : ControlFile->backupEndPoint != InvalidXLogRecPtr ||
762 18 : ControlFile->state == DB_SHUTDOWNED))
763 : {
764 82 : InArchiveRecovery = true;
765 82 : if (StandbyModeRequested)
766 82 : EnableStandbyMode();
767 : }
768 :
769 : /*
770 : * For the same reason as when starting up with backup_label present,
771 : * emit a log message when we continue initializing from a base
772 : * backup.
773 : */
774 1670 : if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
775 0 : ereport(LOG,
776 : (errmsg("restarting backup recovery with redo LSN %X/%X",
777 : LSN_FORMAT_ARGS(ControlFile->backupStartPoint))));
778 :
779 : /* Get the last valid checkpoint record. */
780 1670 : CheckPointLoc = ControlFile->checkPoint;
781 1670 : CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
782 1670 : RedoStartLSN = ControlFile->checkPointCopy.redo;
783 1670 : RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
784 1670 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
785 : CheckPointTLI);
786 1670 : if (record != NULL)
787 : {
788 1670 : ereport(DEBUG1,
789 : (errmsg_internal("checkpoint record is at %X/%X",
790 : LSN_FORMAT_ARGS(CheckPointLoc))));
791 : }
792 : else
793 : {
794 : /*
795 : * We used to attempt to go back to a secondary checkpoint record
796 : * here, but only when not in standby mode. We now just fail if we
797 : * can't read the last checkpoint because this allows us to
798 : * simplify processing around checkpoints.
799 : */
800 0 : ereport(PANIC,
801 : (errmsg("could not locate a valid checkpoint record at %X/%X",
802 : LSN_FORMAT_ARGS(CheckPointLoc))));
803 : }
804 1670 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
805 1670 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
806 : }
807 :
808 1810 : if (ArchiveRecoveryRequested)
809 : {
810 214 : if (StandbyModeRequested)
811 204 : ereport(LOG,
812 : (errmsg("entering standby mode")));
813 10 : else if (recoveryTarget == RECOVERY_TARGET_XID)
814 0 : ereport(LOG,
815 : (errmsg("starting point-in-time recovery to XID %u",
816 : recoveryTargetXid)));
817 10 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
818 0 : ereport(LOG,
819 : (errmsg("starting point-in-time recovery to %s",
820 : timestamptz_to_str(recoveryTargetTime))));
821 10 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
822 6 : ereport(LOG,
823 : (errmsg("starting point-in-time recovery to \"%s\"",
824 : recoveryTargetName)));
825 4 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
826 0 : ereport(LOG,
827 : (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
828 : LSN_FORMAT_ARGS(recoveryTargetLSN))));
829 4 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
830 0 : ereport(LOG,
831 : (errmsg("starting point-in-time recovery to earliest consistent point")));
832 : else
833 4 : ereport(LOG,
834 : (errmsg("starting archive recovery")));
835 : }
836 :
837 : /*
838 : * If the location of the checkpoint record is not on the expected
839 : * timeline in the history of the requested timeline, we cannot proceed:
840 : * the backup is not part of the history of the requested timeline.
841 : */
842 : Assert(expectedTLEs); /* was initialized by reading checkpoint
843 : * record */
844 1810 : if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
845 : CheckPointTLI)
846 : {
847 : XLogRecPtr switchpoint;
848 :
849 : /*
850 : * tliSwitchPoint will throw an error if the checkpoint's timeline is
851 : * not in expectedTLEs at all.
852 : */
853 0 : switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
854 0 : ereport(FATAL,
855 : (errmsg("requested timeline %u is not a child of this server's history",
856 : recoveryTargetTLI),
857 : /* translator: %s is a backup_label file or a pg_control file */
858 : errdetail("Latest checkpoint in file \"%s\" is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
859 : haveBackupLabel ? "backup_label" : "pg_control",
860 : LSN_FORMAT_ARGS(CheckPointLoc),
861 : CheckPointTLI,
862 : LSN_FORMAT_ARGS(switchpoint))));
863 : }
864 :
865 : /*
866 : * The min recovery point should be part of the requested timeline's
867 : * history, too.
868 : */
869 1810 : if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
870 80 : tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
871 80 : ControlFile->minRecoveryPointTLI)
872 0 : ereport(FATAL,
873 : (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
874 : recoveryTargetTLI,
875 : LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
876 : ControlFile->minRecoveryPointTLI)));
877 :
878 1810 : ereport(DEBUG1,
879 : (errmsg_internal("redo record is at %X/%X; shutdown %s",
880 : LSN_FORMAT_ARGS(checkPoint.redo),
881 : wasShutdown ? "true" : "false")));
882 1810 : ereport(DEBUG1,
883 : (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
884 : U64FromFullTransactionId(checkPoint.nextXid),
885 : checkPoint.nextOid)));
886 1810 : ereport(DEBUG1,
887 : (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
888 : checkPoint.nextMulti, checkPoint.nextMultiOffset)));
889 1810 : ereport(DEBUG1,
890 : (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
891 : checkPoint.oldestXid, checkPoint.oldestXidDB)));
892 1810 : ereport(DEBUG1,
893 : (errmsg_internal("oldest MultiXactId: %u, in database %u",
894 : checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
895 1810 : ereport(DEBUG1,
896 : (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
897 : checkPoint.oldestCommitTsXid,
898 : checkPoint.newestCommitTsXid)));
899 1810 : if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
900 0 : ereport(PANIC,
901 : (errmsg("invalid next transaction ID")));
902 :
903 : /* sanity check */
904 1810 : if (checkPoint.redo > CheckPointLoc)
905 0 : ereport(PANIC,
906 : (errmsg("invalid redo in checkpoint record")));
907 :
908 : /*
909 : * Check whether we need to force recovery from WAL. If it appears to
910 : * have been a clean shutdown and we did not have a recovery signal file,
911 : * then assume no recovery needed.
912 : */
913 1810 : if (checkPoint.redo < CheckPointLoc)
914 : {
915 220 : if (wasShutdown)
916 0 : ereport(PANIC,
917 : (errmsg("invalid redo record in shutdown checkpoint")));
918 220 : InRecovery = true;
919 : }
920 1590 : else if (ControlFile->state != DB_SHUTDOWNED)
921 188 : InRecovery = true;
922 1402 : else if (ArchiveRecoveryRequested)
923 : {
924 : /* force recovery due to presence of recovery signal file */
925 14 : InRecovery = true;
926 : }
927 :
928 : /*
929 : * If recovery is needed, update our in-memory copy of pg_control to show
930 : * that we are recovering and to show the selected checkpoint as the place
931 : * we are starting from. We also mark pg_control with any minimum recovery
932 : * stop point obtained from a backup history file.
933 : *
934 : * We don't write the changes to disk yet, though. Only do that after
935 : * initializing various subsystems.
936 : */
937 1810 : if (InRecovery)
938 : {
939 422 : if (InArchiveRecovery)
940 : {
941 222 : ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
942 : }
943 : else
944 : {
945 200 : ereport(LOG,
946 : (errmsg("database system was not properly shut down; "
947 : "automatic recovery in progress")));
948 200 : if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
949 4 : ereport(LOG,
950 : (errmsg("crash recovery starts in timeline %u "
951 : "and has target timeline %u",
952 : ControlFile->checkPointCopy.ThisTimeLineID,
953 : recoveryTargetTLI)));
954 200 : ControlFile->state = DB_IN_CRASH_RECOVERY;
955 : }
956 422 : ControlFile->checkPoint = CheckPointLoc;
957 422 : ControlFile->checkPointCopy = checkPoint;
958 422 : if (InArchiveRecovery)
959 : {
960 : /* initialize minRecoveryPoint if not set yet */
961 222 : if (ControlFile->minRecoveryPoint < checkPoint.redo)
962 : {
963 146 : ControlFile->minRecoveryPoint = checkPoint.redo;
964 146 : ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
965 : }
966 : }
967 :
968 : /*
969 : * Set backupStartPoint if we're starting recovery from a base backup.
970 : *
971 : * Also set backupEndPoint and use minRecoveryPoint as the backup end
972 : * location if we're starting recovery from a base backup which was
973 : * taken from a standby. In this case, the database system status in
974 : * pg_control must indicate that the database was already in recovery.
975 : * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
976 : * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
977 : * before reaching this point; e.g. because restore_command or
978 : * primary_conninfo were faulty.
979 : *
980 : * Any other state indicates that the backup somehow became corrupted
981 : * and we can't sensibly continue with recovery.
982 : */
983 422 : if (haveBackupLabel)
984 : {
985 140 : ControlFile->backupStartPoint = checkPoint.redo;
986 140 : ControlFile->backupEndRequired = backupEndRequired;
987 :
988 140 : if (backupFromStandby)
989 : {
990 8 : if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
991 : dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
992 0 : ereport(FATAL,
993 : (errmsg("backup_label contains data inconsistent with control file"),
994 : errhint("This means that the backup is corrupted and you will "
995 : "have to use another backup for recovery.")));
996 8 : ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
997 : }
998 : }
999 : }
1000 :
1001 : /* remember these, so that we know when we have reached consistency */
1002 1810 : backupStartPoint = ControlFile->backupStartPoint;
1003 1810 : backupEndRequired = ControlFile->backupEndRequired;
1004 1810 : backupEndPoint = ControlFile->backupEndPoint;
1005 1810 : if (InArchiveRecovery)
1006 : {
1007 222 : minRecoveryPoint = ControlFile->minRecoveryPoint;
1008 222 : minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
1009 : }
1010 : else
1011 : {
1012 1588 : minRecoveryPoint = InvalidXLogRecPtr;
1013 1588 : minRecoveryPointTLI = 0;
1014 : }
1015 :
1016 : /*
1017 : * Start recovery assuming that the final record isn't lost.
1018 : */
1019 1810 : abortedRecPtr = InvalidXLogRecPtr;
1020 1810 : missingContrecPtr = InvalidXLogRecPtr;
1021 :
1022 1810 : *wasShutdown_ptr = wasShutdown;
1023 1810 : *haveBackupLabel_ptr = haveBackupLabel;
1024 1810 : *haveTblspcMap_ptr = haveTblspcMap;
1025 1810 : }
1026 :
1027 : /*
1028 : * See if there are any recovery signal files and if so, set state for
1029 : * recovery.
1030 : *
1031 : * See if there is a recovery command file (recovery.conf), and if so
1032 : * throw an ERROR since as of PG12 we no longer recognize that.
1033 : */
1034 : static void
1035 1810 : readRecoverySignalFile(void)
1036 : {
1037 : struct stat stat_buf;
1038 :
1039 1810 : if (IsBootstrapProcessingMode())
1040 1596 : return;
1041 :
1042 : /*
1043 : * Check for old recovery API file: recovery.conf
1044 : */
1045 1712 : if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1046 0 : ereport(FATAL,
1047 : (errcode_for_file_access(),
1048 : errmsg("using recovery command file \"%s\" is not supported",
1049 : RECOVERY_COMMAND_FILE)));
1050 :
1051 : /*
1052 : * Remove unused .done file, if present. Ignore if absent.
1053 : */
1054 1712 : unlink(RECOVERY_COMMAND_DONE);
1055 :
1056 : /*
1057 : * Check for recovery signal files and if found, fsync them since they
1058 : * represent server state information. We don't sweat too much about the
1059 : * possibility of fsync failure, however.
1060 : *
1061 : * If present, standby signal file takes precedence. If neither is present
1062 : * then we won't enter archive recovery.
1063 : */
1064 1712 : if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1065 : {
1066 : int fd;
1067 :
1068 204 : fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1069 : S_IRUSR | S_IWUSR);
1070 204 : if (fd >= 0)
1071 : {
1072 204 : (void) pg_fsync(fd);
1073 204 : close(fd);
1074 : }
1075 204 : standby_signal_file_found = true;
1076 : }
1077 1508 : else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1078 : {
1079 : int fd;
1080 :
1081 10 : fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1082 : S_IRUSR | S_IWUSR);
1083 10 : if (fd >= 0)
1084 : {
1085 10 : (void) pg_fsync(fd);
1086 10 : close(fd);
1087 : }
1088 10 : recovery_signal_file_found = true;
1089 : }
1090 :
1091 1712 : StandbyModeRequested = false;
1092 1712 : ArchiveRecoveryRequested = false;
1093 1712 : if (standby_signal_file_found)
1094 : {
1095 204 : StandbyModeRequested = true;
1096 204 : ArchiveRecoveryRequested = true;
1097 : }
1098 1508 : else if (recovery_signal_file_found)
1099 : {
1100 10 : StandbyModeRequested = false;
1101 10 : ArchiveRecoveryRequested = true;
1102 : }
1103 : else
1104 1498 : return;
1105 :
1106 : /*
1107 : * We don't support standby mode in standalone backends; that requires
1108 : * other processes such as the WAL receiver to be alive.
1109 : */
1110 214 : if (StandbyModeRequested && !IsUnderPostmaster)
1111 0 : ereport(FATAL,
1112 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1113 : errmsg("standby mode is not supported by single-user servers")));
1114 : }
1115 :
1116 : static void
1117 1810 : validateRecoveryParameters(void)
1118 : {
1119 1810 : if (!ArchiveRecoveryRequested)
1120 1596 : return;
1121 :
1122 : /*
1123 : * Check for compulsory parameters
1124 : */
1125 214 : if (StandbyModeRequested)
1126 : {
1127 204 : if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1128 22 : (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1129 4 : ereport(WARNING,
1130 : (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1131 : errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1132 : }
1133 : else
1134 : {
1135 10 : if (recoveryRestoreCommand == NULL ||
1136 10 : strcmp(recoveryRestoreCommand, "") == 0)
1137 0 : ereport(FATAL,
1138 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1139 : errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1140 : }
1141 :
1142 : /*
1143 : * Override any inconsistent requests. Note that this is a change of
1144 : * behaviour in 9.5; prior to this we simply ignored a request to pause if
1145 : * hot_standby = off, which was surprising behaviour.
1146 : */
1147 214 : if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
1148 200 : !EnableHotStandby)
1149 6 : recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
1150 :
1151 : /*
1152 : * Final parsing of recovery_target_time string; see also
1153 : * check_recovery_target_time().
1154 : */
1155 214 : if (recoveryTarget == RECOVERY_TARGET_TIME)
1156 : {
1157 0 : recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
1158 : CStringGetDatum(recovery_target_time_string),
1159 : ObjectIdGetDatum(InvalidOid),
1160 : Int32GetDatum(-1)));
1161 : }
1162 :
1163 : /*
1164 : * If user specified recovery_target_timeline, validate it or compute the
1165 : * "latest" value. We can't do this until after we've gotten the restore
1166 : * command and set InArchiveRecovery, because we need to fetch timeline
1167 : * history files from the archive.
1168 : */
1169 214 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
1170 : {
1171 0 : TimeLineID rtli = recoveryTargetTLIRequested;
1172 :
1173 : /* Timeline 1 does not have a history file, all else should */
1174 0 : if (rtli != 1 && !existsTimeLineHistory(rtli))
1175 0 : ereport(FATAL,
1176 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1177 : errmsg("recovery target timeline %u does not exist",
1178 : rtli)));
1179 0 : recoveryTargetTLI = rtli;
1180 : }
1181 214 : else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
1182 : {
1183 : /* We start the "latest" search from pg_control's timeline */
1184 214 : recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
1185 : }
1186 : else
1187 : {
1188 : /*
1189 : * else we just use the recoveryTargetTLI as already read from
1190 : * ControlFile
1191 : */
1192 : Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
1193 : }
1194 : }
1195 :
1196 : /*
1197 : * read_backup_label: check to see if a backup_label file is present
1198 : *
1199 : * If we see a backup_label during recovery, we assume that we are recovering
1200 : * from a backup dump file, and we therefore roll forward from the checkpoint
1201 : * identified by the label file, NOT what pg_control says. This avoids the
1202 : * problem that pg_control might have been archived one or more checkpoints
1203 : * later than the start of the dump, and so if we rely on it as the start
1204 : * point, we will fail to restore a consistent database state.
1205 : *
1206 : * Returns true if a backup_label was found (and fills the checkpoint
1207 : * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1208 : * returns false if not. If this backup_label came from a streamed backup,
1209 : * *backupEndRequired is set to true. If this backup_label was created during
1210 : * recovery, *backupFromStandby is set to true.
1211 : *
1212 : * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1213 : * and TLI read from the backup file.
1214 : */
1215 : static bool
1216 1810 : read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1217 : bool *backupEndRequired, bool *backupFromStandby)
1218 : {
1219 : char startxlogfilename[MAXFNAMELEN];
1220 : TimeLineID tli_from_walseg,
1221 : tli_from_file;
1222 : FILE *lfp;
1223 : char ch;
1224 : char backuptype[20];
1225 : char backupfrom[20];
1226 : char backuplabel[MAXPGPATH];
1227 : char backuptime[128];
1228 : uint32 hi,
1229 : lo;
1230 :
1231 : /* suppress possible uninitialized-variable warnings */
1232 1810 : *checkPointLoc = InvalidXLogRecPtr;
1233 1810 : *backupLabelTLI = 0;
1234 1810 : *backupEndRequired = false;
1235 1810 : *backupFromStandby = false;
1236 :
1237 : /*
1238 : * See if label file is present
1239 : */
1240 1810 : lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1241 1810 : if (!lfp)
1242 : {
1243 1670 : if (errno != ENOENT)
1244 0 : ereport(FATAL,
1245 : (errcode_for_file_access(),
1246 : errmsg("could not read file \"%s\": %m",
1247 : BACKUP_LABEL_FILE)));
1248 1670 : return false; /* it's not there, all is fine */
1249 : }
1250 :
1251 : /*
1252 : * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1253 : * is pretty crude, but we are not expecting any variability in the file
1254 : * format).
1255 : */
1256 140 : if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1257 140 : &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1258 0 : ereport(FATAL,
1259 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1260 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1261 140 : RedoStartLSN = ((uint64) hi) << 32 | lo;
1262 140 : RedoStartTLI = tli_from_walseg;
1263 140 : if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
1264 140 : &hi, &lo, &ch) != 3 || ch != '\n')
1265 0 : ereport(FATAL,
1266 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1267 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1268 140 : *checkPointLoc = ((uint64) hi) << 32 | lo;
1269 140 : *backupLabelTLI = tli_from_walseg;
1270 :
1271 : /*
1272 : * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1273 : * which could mean either pg_basebackup or the pg_backup_start/stop
1274 : * method was used) or if this label came from somewhere else (the only
1275 : * other option today being from pg_rewind). If this was a streamed
1276 : * backup then we know that we need to play through until we get to the
1277 : * end of the WAL which was generated during the backup (at which point we
1278 : * will have reached consistency and backupEndRequired will be reset to be
1279 : * false).
1280 : */
1281 140 : if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1282 : {
1283 140 : if (strcmp(backuptype, "streamed") == 0)
1284 138 : *backupEndRequired = true;
1285 : }
1286 :
1287 : /*
1288 : * BACKUP FROM lets us know if this was from a primary or a standby. If
1289 : * it was from a standby, we'll double-check that the control file state
1290 : * matches that of a standby.
1291 : */
1292 140 : if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1293 : {
1294 140 : if (strcmp(backupfrom, "standby") == 0)
1295 8 : *backupFromStandby = true;
1296 : }
1297 :
1298 : /*
1299 : * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1300 : * but checking for their presence is useful for debugging and the next
1301 : * sanity checks. Cope also with the fact that the result buffers have a
1302 : * pre-allocated size, hence if the backup_label file has been generated
1303 : * with strings longer than the maximum assumed here an incorrect parsing
1304 : * happens. That's fine as only minor consistency checks are done
1305 : * afterwards.
1306 : */
1307 140 : if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1308 140 : ereport(DEBUG1,
1309 : (errmsg_internal("backup time %s in file \"%s\"",
1310 : backuptime, BACKUP_LABEL_FILE)));
1311 :
1312 140 : if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1313 138 : ereport(DEBUG1,
1314 : (errmsg_internal("backup label %s in file \"%s\"",
1315 : backuplabel, BACKUP_LABEL_FILE)));
1316 :
1317 : /*
1318 : * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1319 : * it as a sanity check if present.
1320 : */
1321 140 : if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1322 : {
1323 138 : if (tli_from_walseg != tli_from_file)
1324 0 : ereport(FATAL,
1325 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1326 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1327 : errdetail("Timeline ID parsed is %u, but expected %u.",
1328 : tli_from_file, tli_from_walseg)));
1329 :
1330 138 : ereport(DEBUG1,
1331 : (errmsg_internal("backup timeline %u in file \"%s\"",
1332 : tli_from_file, BACKUP_LABEL_FILE)));
1333 : }
1334 :
1335 140 : if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0)
1336 0 : ereport(FATAL,
1337 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1338 : errmsg("this is an incremental backup, not a data directory"),
1339 : errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1340 :
1341 140 : if (ferror(lfp) || FreeFile(lfp))
1342 0 : ereport(FATAL,
1343 : (errcode_for_file_access(),
1344 : errmsg("could not read file \"%s\": %m",
1345 : BACKUP_LABEL_FILE)));
1346 :
1347 140 : return true;
1348 : }
1349 :
1350 : /*
1351 : * read_tablespace_map: check to see if a tablespace_map file is present
1352 : *
1353 : * If we see a tablespace_map file during recovery, we assume that we are
1354 : * recovering from a backup dump file, and we therefore need to create symlinks
1355 : * as per the information present in tablespace_map file.
1356 : *
1357 : * Returns true if a tablespace_map file was found (and fills *tablespaces
1358 : * with a tablespaceinfo struct for each tablespace listed in the file);
1359 : * returns false if not.
1360 : */
1361 : static bool
1362 140 : read_tablespace_map(List **tablespaces)
1363 : {
1364 : tablespaceinfo *ti;
1365 : FILE *lfp;
1366 : char str[MAXPGPATH];
1367 : int ch,
1368 : i,
1369 : n;
1370 : bool was_backslash;
1371 :
1372 : /*
1373 : * See if tablespace_map file is present
1374 : */
1375 140 : lfp = AllocateFile(TABLESPACE_MAP, "r");
1376 140 : if (!lfp)
1377 : {
1378 136 : if (errno != ENOENT)
1379 0 : ereport(FATAL,
1380 : (errcode_for_file_access(),
1381 : errmsg("could not read file \"%s\": %m",
1382 : TABLESPACE_MAP)));
1383 136 : return false; /* it's not there, all is fine */
1384 : }
1385 :
1386 : /*
1387 : * Read and parse the link name and path lines from tablespace_map file
1388 : * (this code is pretty crude, but we are not expecting any variability in
1389 : * the file format). De-escape any backslashes that were inserted.
1390 : */
1391 4 : i = 0;
1392 4 : was_backslash = false;
1393 154 : while ((ch = fgetc(lfp)) != EOF)
1394 : {
1395 150 : if (!was_backslash && (ch == '\n' || ch == '\r'))
1396 : {
1397 : char *endp;
1398 :
1399 4 : if (i == 0)
1400 0 : continue; /* \r immediately followed by \n */
1401 :
1402 : /*
1403 : * The de-escaped line should contain an OID followed by exactly
1404 : * one space followed by a path. The path might start with
1405 : * spaces, so don't be too liberal about parsing.
1406 : */
1407 4 : str[i] = '\0';
1408 4 : n = 0;
1409 24 : while (str[n] && str[n] != ' ')
1410 20 : n++;
1411 4 : if (n < 1 || n >= i - 1)
1412 0 : ereport(FATAL,
1413 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1414 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1415 4 : str[n++] = '\0';
1416 :
1417 4 : ti = palloc0(sizeof(tablespaceinfo));
1418 4 : errno = 0;
1419 4 : ti->oid = strtoul(str, &endp, 10);
1420 4 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1421 0 : ereport(FATAL,
1422 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1423 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1424 4 : ti->path = pstrdup(str + n);
1425 4 : *tablespaces = lappend(*tablespaces, ti);
1426 :
1427 4 : i = 0;
1428 4 : continue;
1429 : }
1430 146 : else if (!was_backslash && ch == '\\')
1431 0 : was_backslash = true;
1432 : else
1433 : {
1434 146 : if (i < sizeof(str) - 1)
1435 146 : str[i++] = ch;
1436 146 : was_backslash = false;
1437 : }
1438 : }
1439 :
1440 4 : if (i != 0 || was_backslash) /* last line not terminated? */
1441 0 : ereport(FATAL,
1442 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1443 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1444 :
1445 4 : if (ferror(lfp) || FreeFile(lfp))
1446 0 : ereport(FATAL,
1447 : (errcode_for_file_access(),
1448 : errmsg("could not read file \"%s\": %m",
1449 : TABLESPACE_MAP)));
1450 :
1451 4 : return true;
1452 : }
1453 :
1454 : /*
1455 : * Finish WAL recovery.
1456 : *
1457 : * This does not close the 'xlogreader' yet, because in some cases the caller
1458 : * still wants to re-read the last checkpoint record by calling
1459 : * ReadCheckpointRecord().
1460 : *
1461 : * Returns the position of the last valid or applied record, after which new
1462 : * WAL should be appended, information about why recovery was ended, and some
1463 : * other things. See the EndOfWalRecoveryInfo struct for details.
1464 : */
1465 : EndOfWalRecoveryInfo *
1466 1694 : FinishWalRecovery(void)
1467 : {
1468 1694 : EndOfWalRecoveryInfo *result = palloc(sizeof(EndOfWalRecoveryInfo));
1469 : XLogRecPtr lastRec;
1470 : TimeLineID lastRecTLI;
1471 : XLogRecPtr endOfLog;
1472 :
1473 : /*
1474 : * Kill WAL receiver, if it's still running, before we continue to write
1475 : * the startup checkpoint and aborted-contrecord records. It will trump
1476 : * over these records and subsequent ones if it's still alive when we
1477 : * start writing WAL.
1478 : */
1479 1694 : XLogShutdownWalRcv();
1480 :
1481 : /*
1482 : * Shutdown the slot sync worker to drop any temporary slots acquired by
1483 : * it and to prevent it from keep trying to fetch the failover slots.
1484 : *
1485 : * We do not update the 'synced' column in 'pg_replication_slots' system
1486 : * view from true to false here, as any failed update could leave 'synced'
1487 : * column false for some slots. This could cause issues during slot sync
1488 : * after restarting the server as a standby. While updating the 'synced'
1489 : * column after switching to the new timeline is an option, it does not
1490 : * simplify the handling for the 'synced' column. Therefore, we retain the
1491 : * 'synced' column as true after promotion as it may provide useful
1492 : * information about the slot origin.
1493 : */
1494 1694 : ShutDownSlotSync();
1495 :
1496 : /*
1497 : * We are now done reading the xlog from stream. Turn off streaming
1498 : * recovery to force fetching the files (which would be required at end of
1499 : * recovery, e.g., timeline history file) from archive or pg_wal.
1500 : *
1501 : * Note that standby mode must be turned off after killing WAL receiver,
1502 : * i.e., calling XLogShutdownWalRcv().
1503 : */
1504 : Assert(!WalRcvStreaming());
1505 1694 : StandbyMode = false;
1506 :
1507 : /*
1508 : * Determine where to start writing WAL next.
1509 : *
1510 : * Re-fetch the last valid or last applied record, so we can identify the
1511 : * exact endpoint of what we consider the valid portion of WAL. There may
1512 : * be an incomplete continuation record after that, in which case
1513 : * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1514 : * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1515 : * it is intentionally missing. See CreateOverwriteContrecordRecord().
1516 : *
1517 : * An important side-effect of this is to load the last page into
1518 : * xlogreader. The caller uses it to initialize the WAL for writing.
1519 : */
1520 1694 : if (!InRecovery)
1521 : {
1522 1388 : lastRec = CheckPointLoc;
1523 1388 : lastRecTLI = CheckPointTLI;
1524 : }
1525 : else
1526 : {
1527 306 : lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
1528 306 : lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1529 : }
1530 1694 : XLogPrefetcherBeginRead(xlogprefetcher, lastRec);
1531 1694 : (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1532 1694 : endOfLog = xlogreader->EndRecPtr;
1533 :
1534 : /*
1535 : * Remember the TLI in the filename of the XLOG segment containing the
1536 : * end-of-log. It could be different from the timeline that endOfLog
1537 : * nominally belongs to, if there was a timeline switch in that segment,
1538 : * and we were reading the old WAL from a segment belonging to a higher
1539 : * timeline.
1540 : */
1541 1694 : result->endOfLogTLI = xlogreader->seg.ws_tli;
1542 :
1543 1694 : if (ArchiveRecoveryRequested)
1544 : {
1545 : /*
1546 : * We are no longer in archive recovery state.
1547 : *
1548 : * We are now done reading the old WAL. Turn off archive fetching if
1549 : * it was active.
1550 : */
1551 : Assert(InArchiveRecovery);
1552 98 : InArchiveRecovery = false;
1553 :
1554 : /*
1555 : * If the ending log segment is still open, close it (to avoid
1556 : * problems on Windows with trying to rename or delete an open file).
1557 : */
1558 98 : if (readFile >= 0)
1559 : {
1560 98 : close(readFile);
1561 98 : readFile = -1;
1562 : }
1563 : }
1564 :
1565 : /*
1566 : * Copy the last partial block to the caller, for initializing the WAL
1567 : * buffer for appending new WAL.
1568 : */
1569 1694 : if (endOfLog % XLOG_BLCKSZ != 0)
1570 : {
1571 : char *page;
1572 : int len;
1573 : XLogRecPtr pageBeginPtr;
1574 :
1575 1656 : pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1576 : Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1577 :
1578 : /* Copy the valid part of the last block */
1579 1656 : len = endOfLog % XLOG_BLCKSZ;
1580 1656 : page = palloc(len);
1581 1656 : memcpy(page, xlogreader->readBuf, len);
1582 :
1583 1656 : result->lastPageBeginPtr = pageBeginPtr;
1584 1656 : result->lastPage = page;
1585 : }
1586 : else
1587 : {
1588 : /* There is no partial block to copy. */
1589 38 : result->lastPageBeginPtr = endOfLog;
1590 38 : result->lastPage = NULL;
1591 : }
1592 :
1593 : /*
1594 : * Create a comment for the history file to explain why and where timeline
1595 : * changed.
1596 : */
1597 1694 : result->recoveryStopReason = getRecoveryStopReason();
1598 :
1599 1694 : result->lastRec = lastRec;
1600 1694 : result->lastRecTLI = lastRecTLI;
1601 1694 : result->endOfLog = endOfLog;
1602 :
1603 1694 : result->abortedRecPtr = abortedRecPtr;
1604 1694 : result->missingContrecPtr = missingContrecPtr;
1605 :
1606 1694 : result->standby_signal_file_found = standby_signal_file_found;
1607 1694 : result->recovery_signal_file_found = recovery_signal_file_found;
1608 :
1609 1694 : return result;
1610 : }
1611 :
1612 : /*
1613 : * Clean up the WAL reader and leftovers from restoring WAL from archive
1614 : */
1615 : void
1616 1694 : ShutdownWalRecovery(void)
1617 : {
1618 : char recoveryPath[MAXPGPATH];
1619 :
1620 : /* Final update of pg_stat_recovery_prefetch. */
1621 1694 : XLogPrefetcherComputeStats(xlogprefetcher);
1622 :
1623 : /* Shut down xlogreader */
1624 1694 : if (readFile >= 0)
1625 : {
1626 1596 : close(readFile);
1627 1596 : readFile = -1;
1628 : }
1629 1694 : XLogReaderFree(xlogreader);
1630 1694 : XLogPrefetcherFree(xlogprefetcher);
1631 :
1632 1694 : if (ArchiveRecoveryRequested)
1633 : {
1634 : /*
1635 : * Since there might be a partial WAL segment named RECOVERYXLOG, get
1636 : * rid of it.
1637 : */
1638 98 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1639 98 : unlink(recoveryPath); /* ignore any error */
1640 :
1641 : /* Get rid of any remaining recovered timeline-history file, too */
1642 98 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1643 98 : unlink(recoveryPath); /* ignore any error */
1644 : }
1645 :
1646 : /*
1647 : * We don't need the latch anymore. It's not strictly necessary to disown
1648 : * it, but let's do it for the sake of tidiness.
1649 : */
1650 1694 : if (ArchiveRecoveryRequested)
1651 98 : DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
1652 1694 : }
1653 :
1654 : /*
1655 : * Perform WAL recovery.
1656 : *
1657 : * If the system was shut down cleanly, this is never called.
1658 : */
1659 : void
1660 420 : PerformWalRecovery(void)
1661 : {
1662 : XLogRecord *record;
1663 420 : bool reachedRecoveryTarget = false;
1664 : TimeLineID replayTLI;
1665 :
1666 : /*
1667 : * Initialize shared variables for tracking progress of WAL replay, as if
1668 : * we had just replayed the record before the REDO location (or the
1669 : * checkpoint record itself, if it's a shutdown checkpoint).
1670 : */
1671 420 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1672 420 : if (RedoStartLSN < CheckPointLoc)
1673 : {
1674 218 : XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
1675 218 : XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
1676 218 : XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
1677 : }
1678 : else
1679 : {
1680 202 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
1681 202 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
1682 202 : XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
1683 : }
1684 420 : XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
1685 420 : XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
1686 420 : XLogRecoveryCtl->recoveryLastXTime = 0;
1687 420 : XLogRecoveryCtl->currentChunkStartTime = 0;
1688 420 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
1689 420 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1690 :
1691 : /* Also ensure XLogReceiptTime has a sane value */
1692 420 : XLogReceiptTime = GetCurrentTimestamp();
1693 :
1694 : /*
1695 : * Let postmaster know we've started redo now, so that it can launch the
1696 : * archiver if necessary.
1697 : */
1698 420 : if (IsUnderPostmaster)
1699 402 : SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
1700 :
1701 : /*
1702 : * Allow read-only connections immediately if we're consistent already.
1703 : */
1704 420 : CheckRecoveryConsistency();
1705 :
1706 : /*
1707 : * Find the first record that logically follows the checkpoint --- it
1708 : * might physically precede it, though.
1709 : */
1710 420 : if (RedoStartLSN < CheckPointLoc)
1711 : {
1712 : /* back up to find the record */
1713 218 : replayTLI = RedoStartTLI;
1714 218 : XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
1715 218 : record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1716 :
1717 : /*
1718 : * If a checkpoint record's redo pointer points back to an earlier
1719 : * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1720 : * record.
1721 : */
1722 218 : if (record->xl_rmid != RM_XLOG_ID ||
1723 218 : (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1724 0 : ereport(FATAL,
1725 : (errmsg("unexpected record type found at redo point %X/%X",
1726 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
1727 : }
1728 : else
1729 : {
1730 : /* just have to read next record after CheckPoint */
1731 : Assert(xlogreader->ReadRecPtr == CheckPointLoc);
1732 202 : replayTLI = CheckPointTLI;
1733 202 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1734 : }
1735 :
1736 420 : if (record != NULL)
1737 : {
1738 : TimestampTz xtime;
1739 : PGRUsage ru0;
1740 :
1741 402 : pg_rusage_init(&ru0);
1742 :
1743 402 : InRedo = true;
1744 :
1745 402 : RmgrStartup();
1746 :
1747 402 : ereport(LOG,
1748 : (errmsg("redo starts at %X/%X",
1749 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
1750 :
1751 : /* Prepare to report progress of the redo phase. */
1752 402 : if (!StandbyMode)
1753 210 : begin_startup_progress_phase();
1754 :
1755 : /*
1756 : * main redo apply loop
1757 : */
1758 : do
1759 : {
1760 5430568 : if (!StandbyMode)
1761 527472 : ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
1762 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
1763 :
1764 : #ifdef WAL_DEBUG
1765 : if (XLOG_DEBUG)
1766 : {
1767 : StringInfoData buf;
1768 :
1769 : initStringInfo(&buf);
1770 : appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
1771 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1772 : LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
1773 : xlog_outrec(&buf, xlogreader);
1774 : appendStringInfoString(&buf, " - ");
1775 : xlog_outdesc(&buf, xlogreader);
1776 : elog(LOG, "%s", buf.data);
1777 : pfree(buf.data);
1778 : }
1779 : #endif
1780 :
1781 : /* Handle interrupt signals of startup process */
1782 5430568 : ProcessStartupProcInterrupts();
1783 :
1784 : /*
1785 : * Pause WAL replay, if requested by a hot-standby session via
1786 : * SetRecoveryPause().
1787 : *
1788 : * Note that we intentionally don't take the info_lck spinlock
1789 : * here. We might therefore read a slightly stale value of the
1790 : * recoveryPause flag, but it can't be very stale (no worse than
1791 : * the last spinlock we did acquire). Since a pause request is a
1792 : * pretty asynchronous thing anyway, possibly responding to it one
1793 : * WAL record later than we otherwise would is a minor issue, so
1794 : * it doesn't seem worth adding another spinlock cycle to prevent
1795 : * that.
1796 : */
1797 5430568 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1798 : RECOVERY_NOT_PAUSED)
1799 0 : recoveryPausesHere(false);
1800 :
1801 : /*
1802 : * Have we reached our recovery target?
1803 : */
1804 5430568 : if (recoveryStopsBefore(xlogreader))
1805 : {
1806 2 : reachedRecoveryTarget = true;
1807 2 : break;
1808 : }
1809 :
1810 : /*
1811 : * If we've been asked to lag the primary, wait on latch until
1812 : * enough time has passed.
1813 : */
1814 5430566 : if (recoveryApplyDelay(xlogreader))
1815 : {
1816 : /*
1817 : * We test for paused recovery again here. If user sets
1818 : * delayed apply, it may be because they expect to pause
1819 : * recovery in case of problems, so we must test again here
1820 : * otherwise pausing during the delay-wait wouldn't work.
1821 : */
1822 0 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1823 : RECOVERY_NOT_PAUSED)
1824 0 : recoveryPausesHere(false);
1825 : }
1826 :
1827 : /*
1828 : * Apply the record
1829 : */
1830 5430566 : ApplyWalRecord(xlogreader, record, &replayTLI);
1831 :
1832 : /* Exit loop if we reached inclusive recovery target */
1833 5430562 : if (recoveryStopsAfter(xlogreader))
1834 : {
1835 12 : reachedRecoveryTarget = true;
1836 12 : break;
1837 : }
1838 :
1839 : /* Else, try to fetch the next WAL record */
1840 5430550 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1841 5430442 : } while (record != NULL);
1842 :
1843 : /*
1844 : * end of main redo apply loop
1845 : */
1846 :
1847 290 : if (reachedRecoveryTarget)
1848 : {
1849 14 : if (!reachedConsistency)
1850 0 : ereport(FATAL,
1851 : (errmsg("requested recovery stop point is before consistent recovery point")));
1852 :
1853 : /*
1854 : * This is the last point where we can restart recovery with a new
1855 : * recovery target, if we shutdown and begin again. After this,
1856 : * Resource Managers may choose to do permanent corrective actions
1857 : * at end of recovery.
1858 : */
1859 14 : switch (recoveryTargetAction)
1860 : {
1861 0 : case RECOVERY_TARGET_ACTION_SHUTDOWN:
1862 :
1863 : /*
1864 : * exit with special return code to request shutdown of
1865 : * postmaster. Log messages issued from postmaster.
1866 : */
1867 0 : proc_exit(3);
1868 :
1869 2 : case RECOVERY_TARGET_ACTION_PAUSE:
1870 2 : SetRecoveryPause(true);
1871 2 : recoveryPausesHere(true);
1872 :
1873 : /* drop into promote */
1874 :
1875 14 : case RECOVERY_TARGET_ACTION_PROMOTE:
1876 14 : break;
1877 : }
1878 276 : }
1879 :
1880 290 : RmgrCleanup();
1881 :
1882 290 : ereport(LOG,
1883 : (errmsg("redo done at %X/%X system usage: %s",
1884 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1885 : pg_rusage_show(&ru0))));
1886 290 : xtime = GetLatestXTime();
1887 290 : if (xtime)
1888 70 : ereport(LOG,
1889 : (errmsg("last completed transaction was at log time %s",
1890 : timestamptz_to_str(xtime))));
1891 :
1892 290 : InRedo = false;
1893 : }
1894 : else
1895 : {
1896 : /* there are no WAL records following the checkpoint */
1897 18 : ereport(LOG,
1898 : (errmsg("redo is not required")));
1899 : }
1900 :
1901 : /*
1902 : * This check is intentionally after the above log messages that indicate
1903 : * how far recovery went.
1904 : */
1905 308 : if (ArchiveRecoveryRequested &&
1906 100 : recoveryTarget != RECOVERY_TARGET_UNSET &&
1907 16 : !reachedRecoveryTarget)
1908 2 : ereport(FATAL,
1909 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
1910 : errmsg("recovery ended before configured recovery target was reached")));
1911 306 : }
1912 :
1913 : /*
1914 : * Subroutine of PerformWalRecovery, to apply one WAL record.
1915 : */
1916 : static void
1917 5430566 : ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
1918 : {
1919 : ErrorContextCallback errcallback;
1920 5430566 : bool switchedTLI = false;
1921 :
1922 : /* Setup error traceback support for ereport() */
1923 5430566 : errcallback.callback = rm_redo_error_callback;
1924 5430566 : errcallback.arg = xlogreader;
1925 5430566 : errcallback.previous = error_context_stack;
1926 5430566 : error_context_stack = &errcallback;
1927 :
1928 : /*
1929 : * TransamVariables->nextXid must be beyond record's xid.
1930 : */
1931 5430566 : AdvanceNextFullTransactionIdPastXid(record->xl_xid);
1932 :
1933 : /*
1934 : * Before replaying this record, check if this record causes the current
1935 : * timeline to change. The record is already considered to be part of the
1936 : * new timeline, so we update replayTLI before replaying it. That's
1937 : * important so that replayEndTLI, which is recorded as the minimum
1938 : * recovery point's TLI if recovery stops after this record, is set
1939 : * correctly.
1940 : */
1941 5430566 : if (record->xl_rmid == RM_XLOG_ID)
1942 : {
1943 82542 : TimeLineID newReplayTLI = *replayTLI;
1944 82542 : TimeLineID prevReplayTLI = *replayTLI;
1945 82542 : uint8 info = record->xl_info & ~XLR_INFO_MASK;
1946 :
1947 82542 : if (info == XLOG_CHECKPOINT_SHUTDOWN)
1948 : {
1949 : CheckPoint checkPoint;
1950 :
1951 68 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1952 68 : newReplayTLI = checkPoint.ThisTimeLineID;
1953 68 : prevReplayTLI = checkPoint.PrevTimeLineID;
1954 : }
1955 82474 : else if (info == XLOG_END_OF_RECOVERY)
1956 : {
1957 : xl_end_of_recovery xlrec;
1958 :
1959 20 : memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1960 20 : newReplayTLI = xlrec.ThisTimeLineID;
1961 20 : prevReplayTLI = xlrec.PrevTimeLineID;
1962 : }
1963 :
1964 82542 : if (newReplayTLI != *replayTLI)
1965 : {
1966 : /* Check that it's OK to switch to this TLI */
1967 22 : checkTimeLineSwitch(xlogreader->EndRecPtr,
1968 : newReplayTLI, prevReplayTLI, *replayTLI);
1969 :
1970 : /* Following WAL records should be run with new TLI */
1971 22 : *replayTLI = newReplayTLI;
1972 22 : switchedTLI = true;
1973 : }
1974 : }
1975 :
1976 : /*
1977 : * Update shared replayEndRecPtr before replaying this record, so that
1978 : * XLogFlush will update minRecoveryPoint correctly.
1979 : */
1980 5430566 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1981 5430566 : XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
1982 5430566 : XLogRecoveryCtl->replayEndTLI = *replayTLI;
1983 5430566 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1984 :
1985 : /*
1986 : * If we are attempting to enter Hot Standby mode, process XIDs we see
1987 : */
1988 5430566 : if (standbyState >= STANDBY_INITIALIZED &&
1989 4942914 : TransactionIdIsValid(record->xl_xid))
1990 4849000 : RecordKnownAssignedTransactionIds(record->xl_xid);
1991 :
1992 : /*
1993 : * Some XLOG record types that are related to recovery are processed
1994 : * directly here, rather than in xlog_redo()
1995 : */
1996 5430566 : if (record->xl_rmid == RM_XLOG_ID)
1997 82542 : xlogrecovery_redo(xlogreader, *replayTLI);
1998 :
1999 : /* Now apply the WAL record itself */
2000 5430566 : GetRmgr(record->xl_rmid).rm_redo(xlogreader);
2001 :
2002 : /*
2003 : * After redo, check whether the backup pages associated with the WAL
2004 : * record are consistent with the existing pages. This check is done only
2005 : * if consistency check is enabled for this record.
2006 : */
2007 5430562 : if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2008 4281306 : verifyBackupPageConsistency(xlogreader);
2009 :
2010 : /* Pop the error context stack */
2011 5430562 : error_context_stack = errcallback.previous;
2012 :
2013 : /*
2014 : * Update lastReplayedEndRecPtr after this record has been successfully
2015 : * replayed.
2016 : */
2017 5430562 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2018 5430562 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
2019 5430562 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
2020 5430562 : XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2021 5430562 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2022 :
2023 : /* ------
2024 : * Wakeup walsenders:
2025 : *
2026 : * On the standby, the WAL is flushed first (which will only wake up
2027 : * physical walsenders) and then applied, which will only wake up logical
2028 : * walsenders.
2029 : *
2030 : * Indeed, logical walsenders on standby can't decode and send data until
2031 : * it's been applied.
2032 : *
2033 : * Physical walsenders don't need to be woken up during replay unless
2034 : * cascading replication is allowed and time line change occurred (so that
2035 : * they can notice that they are on a new time line).
2036 : *
2037 : * That's why the wake up conditions are for:
2038 : *
2039 : * - physical walsenders in case of new time line and cascade
2040 : * replication is allowed
2041 : * - logical walsenders in case cascade replication is allowed (could not
2042 : * be created otherwise)
2043 : * ------
2044 : */
2045 5430562 : if (AllowCascadeReplication())
2046 5052238 : WalSndWakeup(switchedTLI, true);
2047 :
2048 : /*
2049 : * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2050 : * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2051 : * a reply to the primary.
2052 : */
2053 5430562 : if (doRequestWalReceiverReply)
2054 : {
2055 4 : doRequestWalReceiverReply = false;
2056 4 : WalRcvForceReply();
2057 : }
2058 :
2059 : /* Allow read-only connections if we're consistent now */
2060 5430562 : CheckRecoveryConsistency();
2061 :
2062 : /* Is this a timeline switch? */
2063 5430562 : if (switchedTLI)
2064 : {
2065 : /*
2066 : * Before we continue on the new timeline, clean up any (possibly
2067 : * bogus) future WAL segments on the old timeline.
2068 : */
2069 22 : RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
2070 :
2071 : /* Reset the prefetcher. */
2072 22 : XLogPrefetchReconfigure();
2073 : }
2074 5430562 : }
2075 :
2076 : /*
2077 : * Some XLOG RM record types that are directly related to WAL recovery are
2078 : * handled here rather than in the xlog_redo()
2079 : */
2080 : static void
2081 82542 : xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
2082 : {
2083 82542 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2084 82542 : XLogRecPtr lsn = record->EndRecPtr;
2085 :
2086 : Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2087 :
2088 82542 : if (info == XLOG_OVERWRITE_CONTRECORD)
2089 : {
2090 : /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2091 : xl_overwrite_contrecord xlrec;
2092 :
2093 2 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2094 2 : if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2095 0 : elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
2096 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2097 : LSN_FORMAT_ARGS(record->overwrittenRecPtr));
2098 :
2099 : /* We have safely skipped the aborted record */
2100 2 : abortedRecPtr = InvalidXLogRecPtr;
2101 2 : missingContrecPtr = InvalidXLogRecPtr;
2102 :
2103 2 : ereport(LOG,
2104 : (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
2105 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2106 : timestamptz_to_str(xlrec.overwrite_time))));
2107 :
2108 : /* Verifying the record should only happen once */
2109 2 : record->overwrittenRecPtr = InvalidXLogRecPtr;
2110 : }
2111 82540 : else if (info == XLOG_BACKUP_END)
2112 : {
2113 : XLogRecPtr startpoint;
2114 :
2115 168 : memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2116 :
2117 168 : if (backupStartPoint == startpoint)
2118 : {
2119 : /*
2120 : * We have reached the end of base backup, the point where
2121 : * pg_backup_stop() was done. The data on disk is now consistent
2122 : * (assuming we have also reached minRecoveryPoint). Set
2123 : * backupEndPoint to the current LSN, so that the next call to
2124 : * CheckRecoveryConsistency() will notice it and do the
2125 : * end-of-backup processing.
2126 : */
2127 136 : elog(DEBUG1, "end of backup record reached");
2128 :
2129 136 : backupEndPoint = lsn;
2130 : }
2131 : else
2132 32 : elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
2133 : LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
2134 : }
2135 82542 : }
2136 :
2137 : /*
2138 : * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2139 : * directories.
2140 : *
2141 : * Replay of database creation XLOG records for databases that were later
2142 : * dropped can create fake directories in pg_tblspc. By the time consistency
2143 : * is reached these directories should have been removed; here we verify
2144 : * that this did indeed happen. This is to be called at the point where
2145 : * consistent state is reached.
2146 : *
2147 : * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2148 : * useful for testing purposes, and also allows for an escape hatch in case
2149 : * things go south.
2150 : */
2151 : static void
2152 224 : CheckTablespaceDirectory(void)
2153 : {
2154 : DIR *dir;
2155 : struct dirent *de;
2156 :
2157 224 : dir = AllocateDir(PG_TBLSPC_DIR);
2158 686 : while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2159 : {
2160 : char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2161 :
2162 : /* Skip entries of non-oid names */
2163 462 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2164 448 : continue;
2165 :
2166 14 : snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2167 :
2168 14 : if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2169 8 : ereport(allow_in_place_tablespaces ? WARNING : PANIC,
2170 : (errcode(ERRCODE_DATA_CORRUPTED),
2171 : errmsg("unexpected directory entry \"%s\" found in %s",
2172 : de->d_name, PG_TBLSPC_DIR),
2173 : errdetail("All directory entries in %s/ should be symbolic links.",
2174 : PG_TBLSPC_DIR),
2175 : errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2176 : }
2177 224 : }
2178 :
2179 : /*
2180 : * Checks if recovery has reached a consistent state. When consistency is
2181 : * reached and we have a valid starting standby snapshot, tell postmaster
2182 : * that it can start accepting read-only connections.
2183 : */
2184 : static void
2185 5430986 : CheckRecoveryConsistency(void)
2186 : {
2187 : XLogRecPtr lastReplayedEndRecPtr;
2188 : TimeLineID lastReplayedTLI;
2189 :
2190 : /*
2191 : * During crash recovery, we don't reach a consistent state until we've
2192 : * replayed all the WAL.
2193 : */
2194 5430986 : if (XLogRecPtrIsInvalid(minRecoveryPoint))
2195 517244 : return;
2196 :
2197 : Assert(InArchiveRecovery);
2198 :
2199 : /*
2200 : * assume that we are called in the startup process, and hence don't need
2201 : * a lock to read lastReplayedEndRecPtr
2202 : */
2203 4913742 : lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2204 4913742 : lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2205 :
2206 : /*
2207 : * Have we reached the point where our base backup was completed?
2208 : */
2209 4913742 : if (!XLogRecPtrIsInvalid(backupEndPoint) &&
2210 200 : backupEndPoint <= lastReplayedEndRecPtr)
2211 : {
2212 140 : XLogRecPtr saveBackupStartPoint = backupStartPoint;
2213 140 : XLogRecPtr saveBackupEndPoint = backupEndPoint;
2214 :
2215 140 : elog(DEBUG1, "end of backup reached");
2216 :
2217 : /*
2218 : * We have reached the end of base backup, as indicated by pg_control.
2219 : * Update the control file accordingly.
2220 : */
2221 140 : ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2222 140 : backupStartPoint = InvalidXLogRecPtr;
2223 140 : backupEndPoint = InvalidXLogRecPtr;
2224 140 : backupEndRequired = false;
2225 :
2226 140 : ereport(LOG,
2227 : (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
2228 : LSN_FORMAT_ARGS(saveBackupStartPoint),
2229 : LSN_FORMAT_ARGS(saveBackupEndPoint))));
2230 : }
2231 :
2232 : /*
2233 : * Have we passed our safe starting point? Note that minRecoveryPoint is
2234 : * known to be incorrectly set if recovering from a backup, until the
2235 : * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2236 : * All we know prior to that is that we're not consistent yet.
2237 : */
2238 4913742 : if (!reachedConsistency && !backupEndRequired &&
2239 15188 : minRecoveryPoint <= lastReplayedEndRecPtr)
2240 : {
2241 : /*
2242 : * Check to see if the XLOG sequence contained any unresolved
2243 : * references to uninitialized pages.
2244 : */
2245 224 : XLogCheckInvalidPages();
2246 :
2247 : /*
2248 : * Check that pg_tblspc doesn't contain any real directories. Replay
2249 : * of Database/CREATE_* records may have created fictitious tablespace
2250 : * directories that should have been removed by the time consistency
2251 : * was reached.
2252 : */
2253 224 : CheckTablespaceDirectory();
2254 :
2255 224 : reachedConsistency = true;
2256 224 : SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
2257 224 : ereport(LOG,
2258 : (errmsg("consistent recovery state reached at %X/%X",
2259 : LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
2260 : }
2261 :
2262 : /*
2263 : * Have we got a valid starting snapshot that will allow queries to be
2264 : * run? If so, we can tell postmaster that the database is consistent now,
2265 : * enabling connections.
2266 : */
2267 4913742 : if (standbyState == STANDBY_SNAPSHOT_READY &&
2268 4913304 : !LocalHotStandbyActive &&
2269 208 : reachedConsistency &&
2270 : IsUnderPostmaster)
2271 : {
2272 208 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2273 208 : XLogRecoveryCtl->SharedHotStandbyActive = true;
2274 208 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2275 :
2276 208 : LocalHotStandbyActive = true;
2277 :
2278 208 : SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
2279 : }
2280 : }
2281 :
2282 : /*
2283 : * Error context callback for errors occurring during rm_redo().
2284 : */
2285 : static void
2286 200 : rm_redo_error_callback(void *arg)
2287 : {
2288 200 : XLogReaderState *record = (XLogReaderState *) arg;
2289 : StringInfoData buf;
2290 :
2291 200 : initStringInfo(&buf);
2292 200 : xlog_outdesc(&buf, record);
2293 200 : xlog_block_info(&buf, record);
2294 :
2295 : /* translator: %s is a WAL record description */
2296 200 : errcontext("WAL redo at %X/%X for %s",
2297 200 : LSN_FORMAT_ARGS(record->ReadRecPtr),
2298 : buf.data);
2299 :
2300 200 : pfree(buf.data);
2301 200 : }
2302 :
2303 : /*
2304 : * Returns a string describing an XLogRecord, consisting of its identity
2305 : * optionally followed by a colon, a space, and a further description.
2306 : */
2307 : void
2308 200 : xlog_outdesc(StringInfo buf, XLogReaderState *record)
2309 : {
2310 200 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2311 200 : uint8 info = XLogRecGetInfo(record);
2312 : const char *id;
2313 :
2314 200 : appendStringInfoString(buf, rmgr.rm_name);
2315 200 : appendStringInfoChar(buf, '/');
2316 :
2317 200 : id = rmgr.rm_identify(info);
2318 200 : if (id == NULL)
2319 0 : appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2320 : else
2321 200 : appendStringInfo(buf, "%s: ", id);
2322 :
2323 200 : rmgr.rm_desc(buf, record);
2324 200 : }
2325 :
#ifdef WAL_DEBUG

/*
 * xlog_outrec
 *
 * Append the record's header details (previous-record LSN, xid and main
 * data length) to 'buf', followed by its block references.  Only compiled
 * when WAL_DEBUG is defined.
 */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	/* link to the previous record, and the owning transaction */
	appendStringInfo(buf, "prev %X/%X; xid %u",
					 LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
					 XLogRecGetXid(record));

	/* length of the record's main data */
	appendStringInfo(buf, "; len %u",
					 XLogRecGetDataLen(record));

	/* one entry per block reference */
	xlog_block_info(buf, record);
}
#endif							/* WAL_DEBUG */
2341 :
2342 : /*
2343 : * Returns a string giving information about all the blocks in an
2344 : * XLogRecord.
2345 : */
2346 : static void
2347 200 : xlog_block_info(StringInfo buf, XLogReaderState *record)
2348 : {
2349 : int block_id;
2350 :
2351 : /* decode block references */
2352 312 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2353 : {
2354 : RelFileLocator rlocator;
2355 : ForkNumber forknum;
2356 : BlockNumber blk;
2357 :
2358 112 : if (!XLogRecGetBlockTagExtended(record, block_id,
2359 : &rlocator, &forknum, &blk, NULL))
2360 0 : continue;
2361 :
2362 112 : if (forknum != MAIN_FORKNUM)
2363 4 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2364 : block_id,
2365 : rlocator.spcOid, rlocator.dbOid,
2366 : rlocator.relNumber,
2367 : forknum,
2368 : blk);
2369 : else
2370 108 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2371 : block_id,
2372 : rlocator.spcOid, rlocator.dbOid,
2373 : rlocator.relNumber,
2374 : blk);
2375 112 : if (XLogRecHasBlockImage(record, block_id))
2376 70 : appendStringInfoString(buf, " FPW");
2377 : }
2378 200 : }
2379 :
2380 :
2381 : /*
2382 : * Check that it's OK to switch to new timeline during recovery.
2383 : *
2384 : * 'lsn' is the address of the shutdown checkpoint record we're about to
2385 : * replay. (Currently, timeline can only change at a shutdown checkpoint).
2386 : */
2387 : static void
2388 22 : checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
2389 : TimeLineID replayTLI)
2390 : {
2391 : /* Check that the record agrees on what the current (old) timeline is */
2392 22 : if (prevTLI != replayTLI)
2393 0 : ereport(PANIC,
2394 : (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2395 : prevTLI, replayTLI)));
2396 :
2397 : /*
2398 : * The new timeline better be in the list of timelines we expect to see,
2399 : * according to the timeline history. It should also not decrease.
2400 : */
2401 22 : if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2402 0 : ereport(PANIC,
2403 : (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2404 : newTLI, replayTLI)));
2405 :
2406 : /*
2407 : * If we have not yet reached min recovery point, and we're about to
2408 : * switch to a timeline greater than the timeline of the min recovery
2409 : * point: trouble. After switching to the new timeline, we could not
2410 : * possibly visit the min recovery point on the correct timeline anymore.
2411 : * This can happen if there is a newer timeline in the archive that
2412 : * branched before the timeline the min recovery point is on, and you
2413 : * attempt to do PITR to the new timeline.
2414 : */
2415 22 : if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
2416 18 : lsn < minRecoveryPoint &&
2417 2 : newTLI > minRecoveryPointTLI)
2418 0 : ereport(PANIC,
2419 : (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2420 : newTLI,
2421 : LSN_FORMAT_ARGS(minRecoveryPoint),
2422 : minRecoveryPointTLI)));
2423 :
2424 : /* Looks good */
2425 22 : }
2426 :
2427 :
2428 : /*
2429 : * Extract timestamp from WAL record.
2430 : *
2431 : * If the record contains a timestamp, returns true, and saves the timestamp
2432 : * in *recordXtime. If the record type has no timestamp, returns false.
2433 : * Currently, only transaction commit/abort records and restore points contain
2434 : * timestamps.
2435 : */
2436 : static bool
2437 84020 : getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
2438 : {
2439 84020 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2440 84020 : uint8 xact_info = info & XLOG_XACT_OPMASK;
2441 84020 : uint8 rmid = XLogRecGetRmid(record);
2442 :
2443 84020 : if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2444 : {
2445 4 : *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2446 4 : return true;
2447 : }
2448 84016 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2449 : xact_info == XLOG_XACT_COMMIT_PREPARED))
2450 : {
2451 76984 : *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2452 76984 : return true;
2453 : }
2454 7032 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2455 : xact_info == XLOG_XACT_ABORT_PREPARED))
2456 : {
2457 7032 : *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2458 7032 : return true;
2459 : }
2460 0 : return false;
2461 : }
2462 :
2463 : /*
2464 : * Checks whether the current buffer page and backup page stored in the
2465 : * WAL record are consistent or not. Before comparing the two pages, a
2466 : * masking can be applied to the pages to ignore certain areas like hint bits,
2467 : * unused space between pd_lower and pd_upper among other things. This
2468 : * function should be called once WAL replay has been completed for a
2469 : * given record.
2470 : */
2471 : static void
2472 4281306 : verifyBackupPageConsistency(XLogReaderState *record)
2473 : {
2474 4281306 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2475 : RelFileLocator rlocator;
2476 : ForkNumber forknum;
2477 : BlockNumber blkno;
2478 : int block_id;
2479 :
2480 : /* Records with no backup blocks have no need for consistency checks. */
2481 4281306 : if (!XLogRecHasAnyBlockRefs(record))
2482 0 : return;
2483 :
2484 : Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2485 :
2486 8892172 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2487 : {
2488 : Buffer buf;
2489 : Page page;
2490 :
2491 4610866 : if (!XLogRecGetBlockTagExtended(record, block_id,
2492 : &rlocator, &forknum, &blkno, NULL))
2493 : {
2494 : /*
2495 : * WAL record doesn't contain a block reference with the given id.
2496 : * Do nothing.
2497 : */
2498 3904 : continue;
2499 : }
2500 :
2501 : Assert(XLogRecHasBlockImage(record, block_id));
2502 :
2503 4606962 : if (XLogRecBlockImageApply(record, block_id))
2504 : {
2505 : /*
2506 : * WAL record has already applied the page, so bypass the
2507 : * consistency check as that would result in comparing the full
2508 : * page stored in the record with itself.
2509 : */
2510 41882 : continue;
2511 : }
2512 :
2513 : /*
2514 : * Read the contents from the current buffer and store it in a
2515 : * temporary page.
2516 : */
2517 4565080 : buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2518 : RBM_NORMAL_NO_LOG,
2519 : InvalidBuffer);
2520 4565080 : if (!BufferIsValid(buf))
2521 0 : continue;
2522 :
2523 4565080 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2524 4565080 : page = BufferGetPage(buf);
2525 :
2526 : /*
2527 : * Take a copy of the local page where WAL has been applied to have a
2528 : * comparison base before masking it...
2529 : */
2530 4565080 : memcpy(replay_image_masked, page, BLCKSZ);
2531 :
2532 : /* No need for this page anymore now that a copy is in. */
2533 4565080 : UnlockReleaseBuffer(buf);
2534 :
2535 : /*
2536 : * If the block LSN is already ahead of this WAL record, we can't
2537 : * expect contents to match. This can happen if recovery is
2538 : * restarted.
2539 : */
2540 4565080 : if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2541 0 : continue;
2542 :
2543 : /*
2544 : * Read the contents from the backup copy, stored in WAL record and
2545 : * store it in a temporary page. There is no need to allocate a new
2546 : * page here, a local buffer is fine to hold its contents and a mask
2547 : * can be directly applied on it.
2548 : */
2549 4565080 : if (!RestoreBlockImage(record, block_id, primary_image_masked))
2550 0 : ereport(ERROR,
2551 : (errcode(ERRCODE_INTERNAL_ERROR),
2552 : errmsg_internal("%s", record->errormsg_buf)));
2553 :
2554 : /*
2555 : * If masking function is defined, mask both the primary and replay
2556 : * images
2557 : */
2558 4565080 : if (rmgr.rm_mask != NULL)
2559 : {
2560 4565080 : rmgr.rm_mask(replay_image_masked, blkno);
2561 4565080 : rmgr.rm_mask(primary_image_masked, blkno);
2562 : }
2563 :
2564 : /* Time to compare the primary and replay images. */
2565 4565080 : if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2566 : {
2567 0 : elog(FATAL,
2568 : "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2569 : rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2570 : forknum, blkno);
2571 : }
2572 : }
2573 : }
2574 :
2575 : /*
2576 : * For point-in-time recovery, this function decides whether we want to
2577 : * stop applying the XLOG before the current record.
2578 : *
2579 : * Returns true if we are stopping, false otherwise. If stopping, some
2580 : * information is saved in recoveryStopXid et al for use in annotating the
2581 : * new timeline's history file.
2582 : */
2583 : static bool
2584 5430568 : recoveryStopsBefore(XLogReaderState *record)
2585 : {
2586 5430568 : bool stopsHere = false;
2587 : uint8 xact_info;
2588 : bool isCommit;
2589 5430568 : TimestampTz recordXtime = 0;
2590 : TransactionId recordXid;
2591 :
2592 : /*
2593 : * Ignore recovery target settings when not in archive recovery (meaning
2594 : * we are in crash recovery).
2595 : */
2596 5430568 : if (!ArchiveRecoveryRequested)
2597 487624 : return false;
2598 :
2599 : /* Check if we should stop as soon as reaching consistency */
2600 4942944 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2601 : {
2602 0 : ereport(LOG,
2603 : (errmsg("recovery stopping after reaching consistency")));
2604 :
2605 0 : recoveryStopAfter = false;
2606 0 : recoveryStopXid = InvalidTransactionId;
2607 0 : recoveryStopLSN = InvalidXLogRecPtr;
2608 0 : recoveryStopTime = 0;
2609 0 : recoveryStopName[0] = '\0';
2610 0 : return true;
2611 : }
2612 :
2613 : /* Check if target LSN has been reached */
2614 4942944 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2615 16904 : !recoveryTargetInclusive &&
2616 732 : record->ReadRecPtr >= recoveryTargetLSN)
2617 : {
2618 2 : recoveryStopAfter = false;
2619 2 : recoveryStopXid = InvalidTransactionId;
2620 2 : recoveryStopLSN = record->ReadRecPtr;
2621 2 : recoveryStopTime = 0;
2622 2 : recoveryStopName[0] = '\0';
2623 2 : ereport(LOG,
2624 : (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2625 : LSN_FORMAT_ARGS(recoveryStopLSN))));
2626 2 : return true;
2627 : }
2628 :
2629 : /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2630 4942942 : if (XLogRecGetRmid(record) != RM_XACT_ID)
2631 4900366 : return false;
2632 :
2633 42576 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2634 :
2635 42576 : if (xact_info == XLOG_XACT_COMMIT)
2636 : {
2637 38446 : isCommit = true;
2638 38446 : recordXid = XLogRecGetXid(record);
2639 : }
2640 4130 : else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2641 : {
2642 48 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2643 : xl_xact_parsed_commit parsed;
2644 :
2645 48 : isCommit = true;
2646 48 : ParseCommitRecord(XLogRecGetInfo(record),
2647 : xlrec,
2648 : &parsed);
2649 48 : recordXid = parsed.twophase_xid;
2650 : }
2651 4082 : else if (xact_info == XLOG_XACT_ABORT)
2652 : {
2653 3494 : isCommit = false;
2654 3494 : recordXid = XLogRecGetXid(record);
2655 : }
2656 588 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2657 : {
2658 22 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2659 : xl_xact_parsed_abort parsed;
2660 :
2661 22 : isCommit = false;
2662 22 : ParseAbortRecord(XLogRecGetInfo(record),
2663 : xlrec,
2664 : &parsed);
2665 22 : recordXid = parsed.twophase_xid;
2666 : }
2667 : else
2668 566 : return false;
2669 :
2670 42010 : if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
2671 : {
2672 : /*
2673 : * There can be only one transaction end record with this exact
2674 : * transactionid
2675 : *
2676 : * when testing for an xid, we MUST test for equality only, since
2677 : * transactions are numbered in the order they start, not the order
2678 : * they complete. A higher numbered xid will complete before you about
2679 : * 50% of the time...
2680 : */
2681 0 : stopsHere = (recordXid == recoveryTargetXid);
2682 : }
2683 :
2684 : /*
2685 : * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2686 : * We don't expect getRecordTimestamp ever to fail, since we already know
2687 : * this is a commit or abort record; but test its result anyway.
2688 : */
2689 42010 : if (getRecordTimestamp(record, &recordXtime) &&
2690 42010 : recoveryTarget == RECOVERY_TARGET_TIME)
2691 : {
2692 : /*
2693 : * There can be many transactions that share the same commit time, so
2694 : * we stop after the last one, if we are inclusive, or stop at the
2695 : * first one if we are exclusive
2696 : */
2697 0 : if (recoveryTargetInclusive)
2698 0 : stopsHere = (recordXtime > recoveryTargetTime);
2699 : else
2700 0 : stopsHere = (recordXtime >= recoveryTargetTime);
2701 : }
2702 :
2703 42010 : if (stopsHere)
2704 : {
2705 0 : recoveryStopAfter = false;
2706 0 : recoveryStopXid = recordXid;
2707 0 : recoveryStopTime = recordXtime;
2708 0 : recoveryStopLSN = InvalidXLogRecPtr;
2709 0 : recoveryStopName[0] = '\0';
2710 :
2711 0 : if (isCommit)
2712 : {
2713 0 : ereport(LOG,
2714 : (errmsg("recovery stopping before commit of transaction %u, time %s",
2715 : recoveryStopXid,
2716 : timestamptz_to_str(recoveryStopTime))));
2717 : }
2718 : else
2719 : {
2720 0 : ereport(LOG,
2721 : (errmsg("recovery stopping before abort of transaction %u, time %s",
2722 : recoveryStopXid,
2723 : timestamptz_to_str(recoveryStopTime))));
2724 : }
2725 : }
2726 :
2727 42010 : return stopsHere;
2728 : }
2729 :
2730 : /*
2731 : * Same as recoveryStopsBefore, but called after applying the record.
2732 : *
2733 : * We also track the timestamp of the latest applied COMMIT/ABORT
2734 : * record in XLogRecoveryCtl->recoveryLastXTime.
2735 : */
2736 : static bool
2737 5430562 : recoveryStopsAfter(XLogReaderState *record)
2738 : {
2739 : uint8 info;
2740 : uint8 xact_info;
2741 : uint8 rmid;
2742 5430562 : TimestampTz recordXtime = 0;
2743 :
2744 : /*
2745 : * Ignore recovery target settings when not in archive recovery (meaning
2746 : * we are in crash recovery).
2747 : */
2748 5430562 : if (!ArchiveRecoveryRequested)
2749 487624 : return false;
2750 :
2751 4942938 : info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2752 4942938 : rmid = XLogRecGetRmid(record);
2753 :
2754 : /*
2755 : * There can be many restore points that share the same name; we stop at
2756 : * the first one.
2757 : */
2758 4942938 : if (recoveryTarget == RECOVERY_TARGET_NAME &&
2759 44 : rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2760 : {
2761 : xl_restore_point *recordRestorePointData;
2762 :
2763 6 : recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2764 :
2765 6 : if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2766 : {
2767 4 : recoveryStopAfter = true;
2768 4 : recoveryStopXid = InvalidTransactionId;
2769 4 : recoveryStopLSN = InvalidXLogRecPtr;
2770 4 : (void) getRecordTimestamp(record, &recoveryStopTime);
2771 4 : strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2772 :
2773 4 : ereport(LOG,
2774 : (errmsg("recovery stopping at restore point \"%s\", time %s",
2775 : recoveryStopName,
2776 : timestamptz_to_str(recoveryStopTime))));
2777 4 : return true;
2778 : }
2779 : }
2780 :
2781 : /* Check if the target LSN has been reached */
2782 4942934 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2783 16172 : recoveryTargetInclusive &&
2784 16172 : record->ReadRecPtr >= recoveryTargetLSN)
2785 : {
2786 8 : recoveryStopAfter = true;
2787 8 : recoveryStopXid = InvalidTransactionId;
2788 8 : recoveryStopLSN = record->ReadRecPtr;
2789 8 : recoveryStopTime = 0;
2790 8 : recoveryStopName[0] = '\0';
2791 8 : ereport(LOG,
2792 : (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2793 : LSN_FORMAT_ARGS(recoveryStopLSN))));
2794 8 : return true;
2795 : }
2796 :
2797 4942926 : if (rmid != RM_XACT_ID)
2798 4900354 : return false;
2799 :
2800 42572 : xact_info = info & XLOG_XACT_OPMASK;
2801 :
2802 42572 : if (xact_info == XLOG_XACT_COMMIT ||
2803 4082 : xact_info == XLOG_XACT_COMMIT_PREPARED ||
2804 588 : xact_info == XLOG_XACT_ABORT ||
2805 : xact_info == XLOG_XACT_ABORT_PREPARED)
2806 : {
2807 : TransactionId recordXid;
2808 :
2809 : /* Update the last applied transaction timestamp */
2810 42006 : if (getRecordTimestamp(record, &recordXtime))
2811 42006 : SetLatestXTime(recordXtime);
2812 :
2813 : /* Extract the XID of the committed/aborted transaction */
2814 42006 : if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2815 : {
2816 48 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2817 : xl_xact_parsed_commit parsed;
2818 :
2819 48 : ParseCommitRecord(XLogRecGetInfo(record),
2820 : xlrec,
2821 : &parsed);
2822 48 : recordXid = parsed.twophase_xid;
2823 : }
2824 41958 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2825 : {
2826 22 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2827 : xl_xact_parsed_abort parsed;
2828 :
2829 22 : ParseAbortRecord(XLogRecGetInfo(record),
2830 : xlrec,
2831 : &parsed);
2832 22 : recordXid = parsed.twophase_xid;
2833 : }
2834 : else
2835 41936 : recordXid = XLogRecGetXid(record);
2836 :
2837 : /*
2838 : * There can be only one transaction end record with this exact
2839 : * transactionid
2840 : *
2841 : * when testing for an xid, we MUST test for equality only, since
2842 : * transactions are numbered in the order they start, not the order
2843 : * they complete. A higher numbered xid will complete before you about
2844 : * 50% of the time...
2845 : */
2846 42006 : if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
2847 0 : recordXid == recoveryTargetXid)
2848 : {
2849 0 : recoveryStopAfter = true;
2850 0 : recoveryStopXid = recordXid;
2851 0 : recoveryStopTime = recordXtime;
2852 0 : recoveryStopLSN = InvalidXLogRecPtr;
2853 0 : recoveryStopName[0] = '\0';
2854 :
2855 0 : if (xact_info == XLOG_XACT_COMMIT ||
2856 : xact_info == XLOG_XACT_COMMIT_PREPARED)
2857 : {
2858 0 : ereport(LOG,
2859 : (errmsg("recovery stopping after commit of transaction %u, time %s",
2860 : recoveryStopXid,
2861 : timestamptz_to_str(recoveryStopTime))));
2862 : }
2863 0 : else if (xact_info == XLOG_XACT_ABORT ||
2864 : xact_info == XLOG_XACT_ABORT_PREPARED)
2865 : {
2866 0 : ereport(LOG,
2867 : (errmsg("recovery stopping after abort of transaction %u, time %s",
2868 : recoveryStopXid,
2869 : timestamptz_to_str(recoveryStopTime))));
2870 : }
2871 0 : return true;
2872 : }
2873 : }
2874 :
2875 : /* Check if we should stop as soon as reaching consistency */
2876 42572 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2877 : {
2878 0 : ereport(LOG,
2879 : (errmsg("recovery stopping after reaching consistency")));
2880 :
2881 0 : recoveryStopAfter = true;
2882 0 : recoveryStopXid = InvalidTransactionId;
2883 0 : recoveryStopTime = 0;
2884 0 : recoveryStopLSN = InvalidXLogRecPtr;
2885 0 : recoveryStopName[0] = '\0';
2886 0 : return true;
2887 : }
2888 :
2889 42572 : return false;
2890 : }
2891 :
2892 : /*
2893 : * Create a comment for the history file to explain why and where
2894 : * timeline changed.
2895 : */
2896 : static char *
2897 1694 : getRecoveryStopReason(void)
2898 : {
2899 : char reason[200];
2900 :
2901 1694 : if (recoveryTarget == RECOVERY_TARGET_XID)
2902 0 : snprintf(reason, sizeof(reason),
2903 : "%s transaction %u",
2904 0 : recoveryStopAfter ? "after" : "before",
2905 : recoveryStopXid);
2906 1694 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
2907 0 : snprintf(reason, sizeof(reason),
2908 : "%s %s\n",
2909 0 : recoveryStopAfter ? "after" : "before",
2910 : timestamptz_to_str(recoveryStopTime));
2911 1694 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
2912 14 : snprintf(reason, sizeof(reason),
2913 : "%s LSN %X/%X\n",
2914 14 : recoveryStopAfter ? "after" : "before",
2915 14 : LSN_FORMAT_ARGS(recoveryStopLSN));
2916 1680 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
2917 6 : snprintf(reason, sizeof(reason),
2918 : "at restore point \"%s\"",
2919 : recoveryStopName);
2920 1674 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
2921 0 : snprintf(reason, sizeof(reason), "reached consistency");
2922 : else
2923 1674 : snprintf(reason, sizeof(reason), "no recovery target specified");
2924 :
2925 1694 : return pstrdup(reason);
2926 : }
2927 :
2928 : /*
2929 : * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2930 : *
2931 : * endOfRecovery is true if the recovery target is reached and
2932 : * the paused state starts at the end of recovery because of
2933 : * recovery_target_action=pause, and false otherwise.
2934 : */
2935 : static void
2936 6 : recoveryPausesHere(bool endOfRecovery)
2937 : {
2938 : /* Don't pause unless users can connect! */
2939 6 : if (!LocalHotStandbyActive)
2940 0 : return;
2941 :
2942 : /* Don't pause after standby promotion has been triggered */
2943 6 : if (LocalPromoteIsTriggered)
2944 0 : return;
2945 :
2946 6 : if (endOfRecovery)
2947 2 : ereport(LOG,
2948 : (errmsg("pausing at the end of recovery"),
2949 : errhint("Execute pg_wal_replay_resume() to promote.")));
2950 : else
2951 4 : ereport(LOG,
2952 : (errmsg("recovery has paused"),
2953 : errhint("Execute pg_wal_replay_resume() to continue.")));
2954 :
2955 : /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2956 18 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
2957 : {
2958 16 : ProcessStartupProcInterrupts();
2959 16 : if (CheckForStandbyTrigger())
2960 4 : return;
2961 :
2962 : /*
2963 : * If recovery pause is requested then set it paused. While we are in
2964 : * the loop, user might resume and pause again so set this every time.
2965 : */
2966 12 : ConfirmRecoveryPaused();
2967 :
2968 : /*
2969 : * We wait on a condition variable that will wake us as soon as the
2970 : * pause ends, but we use a timeout so we can check the above exit
2971 : * condition periodically too.
2972 : */
2973 12 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
2974 : WAIT_EVENT_RECOVERY_PAUSE);
2975 : }
2976 2 : ConditionVariableCancelSleep();
2977 : }
2978 :
2979 : /*
2980 : * When recovery_min_apply_delay is set, we wait long enough to make sure
2981 : * certain record types are applied at least that interval behind the primary.
2982 : *
2983 : * Returns true if we waited.
2984 : *
2985 : * Note that the delay is calculated between the WAL record log time and
2986 : * the current time on standby. We would prefer to keep track of when this
2987 : * standby received each WAL record, which would allow a more consistent
2988 : * approach and one not affected by time synchronisation issues, but that
2989 : * is significantly more effort and complexity for little actual gain in
2990 : * usability.
2991 : */
2992 : static bool
2993 5430566 : recoveryApplyDelay(XLogReaderState *record)
2994 : {
2995 : uint8 xact_info;
2996 : TimestampTz xtime;
2997 : TimestampTz delayUntil;
2998 : long msecs;
2999 :
3000 : /* nothing to do if no delay configured */
3001 5430566 : if (recovery_min_apply_delay <= 0)
3002 5430566 : return false;
3003 :
3004 : /* no delay is applied on a database not yet consistent */
3005 0 : if (!reachedConsistency)
3006 0 : return false;
3007 :
3008 : /* nothing to do if crash recovery is requested */
3009 0 : if (!ArchiveRecoveryRequested)
3010 0 : return false;
3011 :
3012 : /*
3013 : * Is it a COMMIT record?
3014 : *
3015 : * We deliberately choose not to delay aborts since they have no effect on
3016 : * MVCC. We already allow replay of records that don't have a timestamp,
3017 : * so there is already opportunity for issues caused by early conflicts on
3018 : * standbys.
3019 : */
3020 0 : if (XLogRecGetRmid(record) != RM_XACT_ID)
3021 0 : return false;
3022 :
3023 0 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3024 :
3025 0 : if (xact_info != XLOG_XACT_COMMIT &&
3026 : xact_info != XLOG_XACT_COMMIT_PREPARED)
3027 0 : return false;
3028 :
3029 0 : if (!getRecordTimestamp(record, &xtime))
3030 0 : return false;
3031 :
3032 0 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3033 :
3034 : /*
3035 : * Exit without arming the latch if it's already past time to apply this
3036 : * record
3037 : */
3038 0 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
3039 0 : if (msecs <= 0)
3040 0 : return false;
3041 :
3042 : while (true)
3043 : {
3044 0 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3045 :
3046 : /* This might change recovery_min_apply_delay. */
3047 0 : ProcessStartupProcInterrupts();
3048 :
3049 0 : if (CheckForStandbyTrigger())
3050 0 : break;
3051 :
3052 : /*
3053 : * Recalculate delayUntil as recovery_min_apply_delay could have
3054 : * changed while waiting in this loop.
3055 : */
3056 0 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3057 :
3058 : /*
3059 : * Wait for difference between GetCurrentTimestamp() and delayUntil.
3060 : */
3061 0 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
3062 : delayUntil);
3063 :
3064 0 : if (msecs <= 0)
3065 0 : break;
3066 :
3067 0 : elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3068 :
3069 0 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3070 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
3071 : msecs,
3072 : WAIT_EVENT_RECOVERY_APPLY_DELAY);
3073 : }
3074 0 : return true;
3075 : }
3076 :
3077 : /*
3078 : * Get the current state of the recovery pause request.
3079 : */
3080 : RecoveryPauseState
3081 30 : GetRecoveryPauseState(void)
3082 : {
3083 : RecoveryPauseState state;
3084 :
3085 30 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3086 30 : state = XLogRecoveryCtl->recoveryPauseState;
3087 30 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3088 :
3089 30 : return state;
3090 : }
3091 :
3092 : /*
3093 : * Set the recovery pause state.
3094 : *
3095 : * If recovery pause is requested then sets the recovery pause state to
3096 : * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3097 : * to 'not paused' to resume the recovery. The recovery pause will be
3098 : * confirmed by the ConfirmRecoveryPaused.
3099 : */
3100 : void
3101 92 : SetRecoveryPause(bool recoveryPause)
3102 : {
3103 92 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3104 :
3105 92 : if (!recoveryPause)
3106 86 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
3107 6 : else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
3108 6 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
3109 :
3110 92 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3111 :
3112 92 : if (!recoveryPause)
3113 86 : ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
3114 92 : }
3115 :
3116 : /*
3117 : * Confirm the recovery pause by setting the recovery pause state to
3118 : * RECOVERY_PAUSED.
3119 : */
3120 : static void
3121 12 : ConfirmRecoveryPaused(void)
3122 : {
3123 : /* If recovery pause is requested then set it paused */
3124 12 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3125 12 : if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
3126 6 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
3127 12 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3128 12 : }
3129 :
3130 :
3131 : /*
3132 : * Attempt to read the next XLOG record.
3133 : *
3134 : * Before first call, the reader needs to be positioned to the first record
3135 : * by calling XLogPrefetcherBeginRead().
3136 : *
3137 : * If no valid record is available, returns NULL, or fails if emode is PANIC.
3138 : * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3139 : * record is available.
3140 : */
3141 : static XLogRecord *
3142 5434614 : ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
3143 : bool fetching_ckpt, TimeLineID replayTLI)
3144 : {
3145 : XLogRecord *record;
3146 5434614 : XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher);
3147 5434614 : XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3148 :
3149 : /* Pass through parameters to XLogPageRead */
3150 5434614 : private->fetching_ckpt = fetching_ckpt;
3151 5434614 : private->emode = emode;
3152 5434614 : private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3153 5434614 : private->replayTLI = replayTLI;
3154 :
3155 : /* This is the first attempt to read this page. */
3156 5434614 : lastSourceFailed = false;
3157 :
3158 : for (;;)
3159 222 : {
3160 : char *errormsg;
3161 :
3162 5434836 : record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3163 5434728 : if (record == NULL)
3164 : {
3165 : /*
3166 : * When we find that WAL ends in an incomplete record, keep track
3167 : * of that record. After recovery is done, we'll write a record
3168 : * to indicate to downstream WAL readers that that portion is to
3169 : * be ignored.
3170 : *
3171 : * However, when ArchiveRecoveryRequested = true, we're going to
3172 : * switch to a new timeline at the end of recovery. We will only
3173 : * copy WAL over to the new timeline up to the end of the last
3174 : * complete record, so if we did this, we would later create an
3175 : * overwrite contrecord in the wrong place, breaking everything.
3176 : */
3177 516 : if (!ArchiveRecoveryRequested &&
3178 208 : !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
3179 : {
3180 22 : abortedRecPtr = xlogreader->abortedRecPtr;
3181 22 : missingContrecPtr = xlogreader->missingContrecPtr;
3182 : }
3183 :
3184 516 : if (readFile >= 0)
3185 : {
3186 474 : close(readFile);
3187 474 : readFile = -1;
3188 : }
3189 :
3190 : /*
3191 : * We only end up here without a message when XLogPageRead()
3192 : * failed - in that case we already logged something. In
3193 : * StandbyMode that only happens if we have been triggered, so we
3194 : * shouldn't loop anymore in that case.
3195 : */
3196 516 : if (errormsg)
3197 474 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3198 : (errmsg_internal("%s", errormsg) /* already translated */ ));
3199 : }
3200 :
3201 : /*
3202 : * Check page TLI is one of the expected values.
3203 : */
3204 5434212 : else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3205 : {
3206 : char fname[MAXFNAMELEN];
3207 : XLogSegNo segno;
3208 : int32 offset;
3209 :
3210 0 : XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
3211 0 : offset = XLogSegmentOffset(xlogreader->latestPagePtr,
3212 : wal_segment_size);
3213 0 : XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3214 : wal_segment_size);
3215 0 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3216 : (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
3217 : xlogreader->latestPageTLI,
3218 : fname,
3219 : LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
3220 : offset)));
3221 0 : record = NULL;
3222 : }
3223 :
3224 5434728 : if (record)
3225 : {
3226 : /* Great, got a record */
3227 5434506 : return record;
3228 : }
3229 : else
3230 : {
3231 : /* No valid record available from this source */
3232 516 : lastSourceFailed = true;
3233 :
3234 : /*
3235 : * If archive recovery was requested, but we were still doing
3236 : * crash recovery, switch to archive recovery and retry using the
3237 : * offline archive. We have now replayed all the valid WAL in
3238 : * pg_wal, so we are presumably now consistent.
3239 : *
3240 : * We require that there's at least some valid WAL present in
3241 : * pg_wal, however (!fetching_ckpt). We could recover using the
3242 : * WAL from the archive, even if pg_wal is completely empty, but
3243 : * we'd have no idea how far we'd have to replay to reach
3244 : * consistency. So err on the safe side and give up.
3245 : */
3246 516 : if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3247 4 : !fetching_ckpt)
3248 : {
3249 4 : ereport(DEBUG1,
3250 : (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3251 4 : InArchiveRecovery = true;
3252 4 : if (StandbyModeRequested)
3253 4 : EnableStandbyMode();
3254 :
3255 4 : SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
3256 4 : minRecoveryPoint = xlogreader->EndRecPtr;
3257 4 : minRecoveryPointTLI = replayTLI;
3258 :
3259 4 : CheckRecoveryConsistency();
3260 :
3261 : /*
3262 : * Before we retry, reset lastSourceFailed and currentSource
3263 : * so that we will check the archive next.
3264 : */
3265 4 : lastSourceFailed = false;
3266 4 : currentSource = XLOG_FROM_ANY;
3267 :
3268 222 : continue;
3269 : }
3270 :
3271 : /* In standby mode, loop back to retry. Otherwise, give up. */
3272 512 : if (StandbyMode && !CheckForStandbyTrigger())
3273 218 : continue;
3274 : else
3275 294 : return NULL;
3276 : }
3277 : }
3278 : }
3279 :
3280 : /*
3281 : * Read the XLOG page containing targetPagePtr into readBuf (if not read
3282 : * already). Returns number of bytes read, if the page is read successfully,
3283 : * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3284 : * but only if they have not been previously reported.
3285 : *
3286 : * See XLogReaderRoutine.page_read for more details.
3287 : *
3288 : * While prefetching, xlogreader->nonblocking may be set. In that case,
3289 : * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3290 : *
3291 : * This is responsible for restoring files from archive as needed, as well
3292 : * as for waiting for the requested WAL record to arrive in standby mode.
3293 : *
3294 : * xlogreader->private_data->emode specifies the log level used for reporting
3295 : * "file not found" or "end of WAL" situations in archive recovery, or in
3296 : * standby mode when promotion is triggered. If set to WARNING or below,
3297 : * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3298 : * levels the ereport() won't return.
3299 : *
3300 : * In standby mode, if after a successful return of XLogPageRead() the
3301 : * caller finds the record it's interested in to be broken, it should
3302 : * ereport the error with the level determined by
3303 : * emode_for_corrupt_record(), and then set lastSourceFailed
3304 : * and call XLogPageRead() again with the same arguments. This lets
3305 : * XLogPageRead() to try fetching the record from another source, or to
3306 : * sleep and retry.
3307 : */
3308 : static int
3309 2796008 : XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
3310 : XLogRecPtr targetRecPtr, char *readBuf)
3311 : {
3312 2796008 : XLogPageReadPrivate *private =
3313 : (XLogPageReadPrivate *) xlogreader->private_data;
3314 2796008 : int emode = private->emode;
3315 : uint32 targetPageOff;
3316 : XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3317 : int r;
3318 : instr_time io_start;
3319 :
3320 2796008 : XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3321 2796008 : targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3322 :
3323 : /*
3324 : * See if we need to switch to a new segment because the requested record
3325 : * is not in the currently open one.
3326 : */
3327 2796008 : if (readFile >= 0 &&
3328 2792500 : !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3329 : {
3330 : /*
3331 : * Request a restartpoint if we've replayed too much xlog since the
3332 : * last one.
3333 : */
3334 3184 : if (ArchiveRecoveryRequested && IsUnderPostmaster)
3335 : {
3336 3154 : if (XLogCheckpointNeeded(readSegNo))
3337 : {
3338 2938 : (void) GetRedoRecPtr();
3339 2938 : if (XLogCheckpointNeeded(readSegNo))
3340 2924 : RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
3341 : }
3342 : }
3343 :
3344 3184 : close(readFile);
3345 3184 : readFile = -1;
3346 3184 : readSource = XLOG_FROM_ANY;
3347 : }
3348 :
3349 2796008 : XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3350 :
3351 2796014 : retry:
3352 : /* See if we need to retrieve more data */
3353 2796014 : if (readFile < 0 ||
3354 2789316 : (readSource == XLOG_FROM_STREAM &&
3355 2765438 : flushedUpto < targetPagePtr + reqLen))
3356 : {
3357 25342 : if (readFile >= 0 &&
3358 18644 : xlogreader->nonblocking &&
3359 9152 : readSource == XLOG_FROM_STREAM &&
3360 9152 : flushedUpto < targetPagePtr + reqLen)
3361 9152 : return XLREAD_WOULDBLOCK;
3362 :
3363 16082 : switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3364 16190 : private->randAccess,
3365 16190 : private->fetching_ckpt,
3366 : targetRecPtr,
3367 : private->replayTLI,
3368 : xlogreader->EndRecPtr,
3369 16190 : xlogreader->nonblocking))
3370 : {
3371 1146 : case XLREAD_WOULDBLOCK:
3372 1146 : return XLREAD_WOULDBLOCK;
3373 80 : case XLREAD_FAIL:
3374 80 : if (readFile >= 0)
3375 0 : close(readFile);
3376 80 : readFile = -1;
3377 80 : readLen = 0;
3378 80 : readSource = XLOG_FROM_ANY;
3379 80 : return XLREAD_FAIL;
3380 14856 : case XLREAD_SUCCESS:
3381 14856 : break;
3382 : }
3383 2770672 : }
3384 :
3385 : /*
3386 : * At this point, we have the right segment open and if we're streaming we
3387 : * know the requested record is in it.
3388 : */
3389 : Assert(readFile != -1);
3390 :
3391 : /*
3392 : * If the current segment is being streamed from the primary, calculate
3393 : * how much of the current page we have received already. We know the
3394 : * requested record has been received, but this is for the benefit of
3395 : * future calls, to allow quick exit at the top of this function.
3396 : */
3397 2785528 : if (readSource == XLOG_FROM_STREAM)
3398 : {
3399 2758636 : if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3400 2751130 : readLen = XLOG_BLCKSZ;
3401 : else
3402 7506 : readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
3403 : targetPageOff;
3404 : }
3405 : else
3406 26892 : readLen = XLOG_BLCKSZ;
3407 :
3408 : /* Read the requested page */
3409 2785528 : readOff = targetPageOff;
3410 :
3411 : /* Measure I/O timing when reading segment */
3412 2785528 : io_start = pgstat_prepare_io_time(track_wal_io_timing);
3413 :
3414 2785528 : pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3415 2785528 : r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3416 2785528 : if (r != XLOG_BLCKSZ)
3417 : {
3418 : char fname[MAXFNAMELEN];
3419 0 : int save_errno = errno;
3420 :
3421 0 : pgstat_report_wait_end();
3422 :
3423 0 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3424 : io_start, 1, r);
3425 :
3426 0 : XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
3427 0 : if (r < 0)
3428 : {
3429 0 : errno = save_errno;
3430 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3431 : (errcode_for_file_access(),
3432 : errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
3433 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3434 : readOff)));
3435 : }
3436 : else
3437 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3438 : (errcode(ERRCODE_DATA_CORRUPTED),
3439 : errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
3440 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3441 : readOff, r, (Size) XLOG_BLCKSZ)));
3442 0 : goto next_record_is_invalid;
3443 : }
3444 2785528 : pgstat_report_wait_end();
3445 :
3446 2785528 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3447 : io_start, 1, r);
3448 :
3449 : Assert(targetSegNo == readSegNo);
3450 : Assert(targetPageOff == readOff);
3451 : Assert(reqLen <= readLen);
3452 :
3453 2785528 : xlogreader->seg.ws_tli = curFileTLI;
3454 :
3455 : /*
3456 : * Check the page header immediately, so that we can retry immediately if
3457 : * it's not valid. This may seem unnecessary, because ReadPageInternal()
3458 : * validates the page header anyway, and would propagate the failure up to
3459 : * ReadRecord(), which would retry. However, there's a corner case with
3460 : * continuation records, if a record is split across two pages such that
3461 : * we would need to read the two pages from different sources across two
3462 : * WAL segments.
3463 : *
3464 : * The first page is only available locally, in pg_wal, because it's
3465 : * already been recycled on the primary. The second page, however, is not
3466 : * present in pg_wal, and we should stream it from the primary. There is a
3467 : * recycled WAL segment present in pg_wal, with garbage contents, however.
3468 : * We would read the first page from the local WAL segment, but when
3469 : * reading the second page, we would read the bogus, recycled, WAL
3470 : * segment. If we didn't catch that case here, we would never recover,
3471 : * because ReadRecord() would retry reading the whole record from the
3472 : * beginning.
3473 : *
3474 : * Of course, this only catches errors in the page header, which is what
3475 : * happens in the case of a recycled WAL segment. Other kinds of errors or
3476 : * corruption still has the same problem. But this at least fixes the
3477 : * common case, which can happen as part of normal operation.
3478 : *
3479 : * Validating the page header is cheap enough that doing it twice
3480 : * shouldn't be a big deal from a performance point of view.
3481 : *
3482 : * When not in standby mode, an invalid page header should cause recovery
3483 : * to end, not retry reading the page, so we don't need to validate the
3484 : * page header here for the retry. Instead, ReadPageInternal() is
3485 : * responsible for the validation.
3486 : */
3487 2785528 : if (StandbyMode &&
3488 2765968 : (targetPagePtr % wal_segment_size) == 0 &&
3489 2646 : !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3490 : {
3491 : /*
3492 : * Emit this error right now then retry this page immediately. Use
3493 : * errmsg_internal() because the message was already translated.
3494 : */
3495 8 : if (xlogreader->errormsg_buf[0])
3496 8 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3497 : (errmsg_internal("%s", xlogreader->errormsg_buf)));
3498 :
3499 : /* reset any error XLogReaderValidatePageHeader() might have set */
3500 8 : XLogReaderResetError(xlogreader);
3501 8 : goto next_record_is_invalid;
3502 : }
3503 :
3504 2785520 : return readLen;
3505 :
3506 8 : next_record_is_invalid:
3507 :
3508 : /*
3509 : * If we're reading ahead, give up fast. Retries and error reporting will
3510 : * be handled by a later read when recovery catches up to this point.
3511 : */
3512 8 : if (xlogreader->nonblocking)
3513 2 : return XLREAD_WOULDBLOCK;
3514 :
3515 6 : lastSourceFailed = true;
3516 :
3517 6 : if (readFile >= 0)
3518 6 : close(readFile);
3519 6 : readFile = -1;
3520 6 : readLen = 0;
3521 6 : readSource = XLOG_FROM_ANY;
3522 :
3523 : /* In standby-mode, keep trying */
3524 6 : if (StandbyMode)
3525 6 : goto retry;
3526 : else
3527 0 : return XLREAD_FAIL;
3528 : }
3529 :
/*
 * Open the WAL segment containing WAL location 'RecPtr'.
 *
 * The segment can be fetched via restore_command, or via walreceiver having
 * streamed the record, or it can already be present in pg_wal.  Checking
 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
 * too, in case someone copies a new segment directly to pg_wal.  That is not
 * documented or recommended, though.
 *
 * If 'randAccess' is true, curFileTLI is reset to 0 before each attempt to
 * read the segment from archive or pg_wal.
 *
 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
 * prepare to read WAL starting from RedoStartLSN after this.
 *
 * 'RecPtr' might not point to the beginning of the record we're interested
 * in, it might also point to the page or segment header.  In that case,
 * 'tliRecPtr' is the position of the WAL record we're interested in.  It is
 * used to decide which timeline to stream the requested WAL from.
 *
 * 'replayTLI' and 'replayLSN' are the current replay timeline and LSN; they
 * are passed to rescanLatestTimeLine() so that if we scan for new timelines,
 * we can reject a switch to a timeline that branched off before this point.
 *
 * If the record is not immediately available, the function returns
 * XLREAD_FAIL if we're not in standby mode.  In standby mode, waits for it
 * to become available.
 *
 * When the requested record becomes available, the function opens the file
 * containing it (if not open already), and returns XLREAD_SUCCESS.  When end
 * of standby mode is triggered by the user, and there is no more WAL
 * available, returns XLREAD_FAIL.
 *
 * If nonblocking is true, then give up immediately if we can't satisfy the
 * request, returning XLREAD_WOULDBLOCK instead of waiting.
 */
static XLogPageReadResult
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
							bool fetching_ckpt, XLogRecPtr tliRecPtr,
							TimeLineID replayTLI, XLogRecPtr replayLSN,
							bool nonblocking)
{
	/* Time of the last pass through the "all sources failed" state. */
	static TimestampTz last_fail_time = 0;
	TimestampTz now;
	/* Ensures WalRcvForceReply() is called at most once per invocation. */
	bool		streaming_reply_sent = false;

	/*-------
	 * Standby mode is implemented by a state machine:
	 *
	 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
	 *	  pg_wal (XLOG_FROM_PG_WAL)
	 * 2. Check for promotion trigger request
	 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
	 * 4. Rescan timelines
	 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
	 *
	 * Failure to read from the current source advances the state machine to
	 * the next state.
	 *
	 * 'currentSource' indicates the current state. There are no currentSource
	 * values for "check trigger", "rescan timelines", and "sleep" states,
	 * those actions are taken when reading from the previous source fails, as
	 * part of advancing to the next state.
	 *
	 * If standby mode is turned off while reading WAL from stream, we move
	 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
	 * the files (which would be required at end of recovery, e.g., timeline
	 * history file) from archive or pg_wal. We don't need to kill WAL receiver
	 * here because it's already stopped when standby mode is turned off at
	 * the end of recovery.
	 *-------
	 */
	if (!InArchiveRecovery)
		currentSource = XLOG_FROM_PG_WAL;
	else if (currentSource == XLOG_FROM_ANY ||
			 (!StandbyMode && currentSource == XLOG_FROM_STREAM))
	{
		lastSourceFailed = false;
		currentSource = XLOG_FROM_ARCHIVE;
	}

	for (;;)
	{
		/* Remembered only so that a source switch can be logged below. */
		XLogSource	oldSource = currentSource;
		bool		startWalReceiver = false;

		/*
		 * First check if we failed to read from the current source, and
		 * advance the state machine if so. The failure to read might've
		 * happened outside this function, e.g when a CRC check fails on a
		 * record, or within this loop.
		 */
		if (lastSourceFailed)
		{
			/*
			 * Don't allow any retry loops to occur during nonblocking
			 * readahead.  Let the caller process everything that has been
			 * decoded already first.
			 */
			if (nonblocking)
				return XLREAD_WOULDBLOCK;

			switch (currentSource)
			{
				case XLOG_FROM_ARCHIVE:
				case XLOG_FROM_PG_WAL:

					/*
					 * Check to see if promotion is requested. Note that we do
					 * this only after failure, so when you promote, we still
					 * finish replaying as much as we can from archive and
					 * pg_wal before failover.
					 */
					if (StandbyMode && CheckForStandbyTrigger())
					{
						XLogShutdownWalRcv();
						return XLREAD_FAIL;
					}

					/*
					 * Not in standby mode, and we've now tried the archive
					 * and pg_wal.
					 */
					if (!StandbyMode)
						return XLREAD_FAIL;

					/*
					 * Move to XLOG_FROM_STREAM state, and set to start a
					 * walreceiver if necessary.
					 */
					currentSource = XLOG_FROM_STREAM;
					startWalReceiver = true;
					break;

				case XLOG_FROM_STREAM:

					/*
					 * Failure while streaming. Most likely, we got here
					 * because streaming replication was terminated, or
					 * promotion was triggered. But we also get here if we
					 * find an invalid record in the WAL streamed from the
					 * primary, in which case something is seriously wrong.
					 * There's little chance that the problem will just go
					 * away, but PANIC is not good for availability either,
					 * especially in hot standby mode. So, we treat that the
					 * same as disconnection, and retry from archive/pg_wal
					 * again. The WAL in the archive should be identical to
					 * what was streamed, so it's unlikely that it helps, but
					 * one can hope...
					 */

					/*
					 * We should be able to move to XLOG_FROM_STREAM only in
					 * standby mode.
					 */
					Assert(StandbyMode);

					/*
					 * Before we leave XLOG_FROM_STREAM state, make sure that
					 * walreceiver is not active, so that it won't overwrite
					 * WAL that we restore from archive.
					 */
					XLogShutdownWalRcv();

					/*
					 * Before we sleep, re-scan for possible new timelines if
					 * we were requested to recover to the latest timeline.
					 * If a new target timeline was adopted, skip the sleep
					 * and go straight back to the archive.
					 */
					if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
					{
						if (rescanLatestTimeLine(replayTLI, replayLSN))
						{
							currentSource = XLOG_FROM_ARCHIVE;
							break;
						}
					}

					/*
					 * XLOG_FROM_STREAM is the last state in our state
					 * machine, so we've exhausted all the options for
					 * obtaining the requested WAL. We're going to loop back
					 * and retry from the archive, but if it hasn't been long
					 * since last attempt, sleep wal_retrieve_retry_interval
					 * milliseconds to avoid busy-waiting.
					 */
					now = GetCurrentTimestamp();
					if (!TimestampDifferenceExceeds(last_fail_time, now,
													wal_retrieve_retry_interval))
					{
						long		wait_time;

						/* Sleep only for the remainder of the interval. */
						wait_time = wal_retrieve_retry_interval -
							TimestampDifferenceMilliseconds(last_fail_time, now);

						elog(LOG, "waiting for WAL to become available at %X/%X",
							 LSN_FORMAT_ARGS(RecPtr));

						/* Do background tasks that might benefit us later. */
						KnownAssignedTransactionIdsIdleMaintenance();

						(void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
										 WL_LATCH_SET | WL_TIMEOUT |
										 WL_EXIT_ON_PM_DEATH,
										 wait_time,
										 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
						ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
						now = GetCurrentTimestamp();

						/* Handle interrupt signals of startup process */
						ProcessStartupProcInterrupts();
					}
					last_fail_time = now;
					currentSource = XLOG_FROM_ARCHIVE;
					break;

				default:
					elog(ERROR, "unexpected WAL source %d", currentSource);
			}
		}
		else if (currentSource == XLOG_FROM_PG_WAL)
		{
			/*
			 * We just successfully read a file in pg_wal. We prefer files in
			 * the archive over ones in pg_wal, so try the next file again
			 * from the archive first.
			 */
			if (InArchiveRecovery)
				currentSource = XLOG_FROM_ARCHIVE;
		}

		if (currentSource != oldSource)
			elog(DEBUG2, "switched WAL source from %s to %s after %s",
				 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
				 lastSourceFailed ? "failure" : "success");

		/*
		 * We've now handled possible failure. Try to read from the chosen
		 * source.
		 */
		lastSourceFailed = false;

		switch (currentSource)
		{
			case XLOG_FROM_ARCHIVE:
			case XLOG_FROM_PG_WAL:

				/*
				 * WAL receiver must not be running when reading WAL from
				 * archive or pg_wal.
				 */
				Assert(!WalRcvStreaming());

				/* Close any old file we might have open. */
				if (readFile >= 0)
				{
					close(readFile);
					readFile = -1;
				}
				/* Reset curFileTLI if random fetch. */
				if (randAccess)
					curFileTLI = 0;

				/*
				 * Try to restore the file from archive, or read an existing
				 * file from pg_wal.  In the XLOG_FROM_ARCHIVE state we pass
				 * XLOG_FROM_ANY, so that both the archive and pg_wal are
				 * tried.
				 */
				readFile = XLogFileReadAnyTLI(readSegNo,
											  currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
											  currentSource);
				if (readFile >= 0)
					return XLREAD_SUCCESS;	/* success! */

				/*
				 * Nope, not found in archive or pg_wal.
				 */
				lastSourceFailed = true;
				break;

			case XLOG_FROM_STREAM:
				{
					bool		havedata;

					/*
					 * We should be able to move to XLOG_FROM_STREAM only in
					 * standby mode.
					 */
					Assert(StandbyMode);

					/*
					 * First, shutdown walreceiver if its restart has been
					 * requested -- but no point if we're already slated for
					 * starting it.
					 */
					if (pendingWalRcvRestart && !startWalReceiver)
					{
						XLogShutdownWalRcv();

						/*
						 * Re-scan for possible new timelines if we were
						 * requested to recover to the latest timeline.
						 */
						if (recoveryTargetTimeLineGoal ==
							RECOVERY_TARGET_TIMELINE_LATEST)
							rescanLatestTimeLine(replayTLI, replayLSN);

						startWalReceiver = true;
					}
					pendingWalRcvRestart = false;

					/*
					 * Launch walreceiver if needed.
					 *
					 * If fetching_ckpt is true, RecPtr points to the initial
					 * checkpoint location. In that case, we use RedoStartLSN
					 * as the streaming start position instead of RecPtr, so
					 * that when we later jump backwards to start redo at
					 * RedoStartLSN, we will have the logs streamed already.
					 *
					 * Nothing is launched if primary_conninfo is unset or
					 * empty.
					 */
					if (startWalReceiver &&
						PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
					{
						XLogRecPtr	ptr;
						TimeLineID	tli;

						if (fetching_ckpt)
						{
							ptr = RedoStartLSN;
							tli = RedoStartTLI;
						}
						else
						{
							ptr = RecPtr;

							/*
							 * Use the record begin position to determine the
							 * TLI, rather than the position we're reading.
							 */
							tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);

							if (curFileTLI > 0 && tli < curFileTLI)
								elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
									 LSN_FORMAT_ARGS(tliRecPtr),
									 tli, curFileTLI);
						}
						curFileTLI = tli;
						SetInstallXLogFileSegmentActive();
						RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
											 PrimarySlotName,
											 wal_receiver_create_temp_slot);
						/* Forget the cached flush position of any earlier stream. */
						flushedUpto = 0;
					}

					/*
					 * Check if WAL receiver is active or wait to start up.
					 */
					if (!WalRcvStreaming())
					{
						lastSourceFailed = true;
						break;
					}

					/*
					 * Walreceiver is active, so see if new data has arrived.
					 *
					 * We only advance XLogReceiptTime when we obtain fresh
					 * WAL from walreceiver and observe that we had already
					 * processed everything before the most recent "chunk"
					 * that it flushed to disk.  In steady state where we are
					 * keeping up with the incoming data, XLogReceiptTime will
					 * be updated on each cycle. When we are behind,
					 * XLogReceiptTime will not advance, so the grace time
					 * allotted to conflicting queries will decrease.
					 */
					if (RecPtr < flushedUpto)
						havedata = true;
					else
					{
						XLogRecPtr	latestChunkStart;

						/* Cached position is too old; ask walreceiver again. */
						flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
						if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
						{
							havedata = true;
							if (latestChunkStart <= RecPtr)
							{
								XLogReceiptTime = GetCurrentTimestamp();
								SetCurrentChunkStartTime(XLogReceiptTime);
							}
						}
						else
							havedata = false;
					}
					if (havedata)
					{
						/*
						 * Great, streamed far enough.  Open the file if it's
						 * not open already.  Also read the timeline history
						 * file if we haven't initialized timeline history
						 * yet; it should be streamed over and present in
						 * pg_wal by now.  Use XLOG_FROM_STREAM so that source
						 * info is set correctly and XLogReceiptTime isn't
						 * changed.
						 *
						 * NB: We must set readTimeLineHistory based on
						 * recoveryTargetTLI, not receiveTLI. Normally they'll
						 * be the same, but if recovery_target_timeline is
						 * 'latest' and archiving is configured, then it's
						 * possible that we managed to retrieve one or more
						 * new timeline history files from the archive,
						 * updating recoveryTargetTLI.
						 *
						 * Note that after opening the file we do not return
						 * directly: we break out of the switch and go around
						 * the loop once more, at which point the file is
						 * already open and the else-branch below returns
						 * XLREAD_SUCCESS.
						 */
						if (readFile < 0)
						{
							if (!expectedTLEs)
								expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
							readFile = XLogFileRead(readSegNo, receiveTLI,
													XLOG_FROM_STREAM, false);
							Assert(readFile >= 0);
						}
						else
						{
							/* just make sure source info is correct... */
							readSource = XLOG_FROM_STREAM;
							XLogReceiptSource = XLOG_FROM_STREAM;
							return XLREAD_SUCCESS;
						}
						break;
					}

					/* In nonblocking mode, return rather than sleeping. */
					if (nonblocking)
						return XLREAD_WOULDBLOCK;

					/*
					 * Data not here yet. Check for trigger, then wait for
					 * walreceiver to wake us up when new WAL arrives.
					 */
					if (CheckForStandbyTrigger())
					{
						/*
						 * Note that we don't return XLREAD_FAIL immediately
						 * here. After being triggered, we still want to
						 * replay all the WAL that was already streamed. It's
						 * in pg_wal now, so we just treat this as a failure,
						 * and the state machine will move on to replay the
						 * streamed WAL from pg_wal, and then recheck the
						 * trigger and exit replay.
						 */
						lastSourceFailed = true;
						break;
					}

					/*
					 * Since we have replayed everything we have received so
					 * far and are about to start waiting for more WAL, let's
					 * tell the upstream server our replay location now so
					 * that pg_stat_replication doesn't show stale
					 * information.
					 */
					if (!streaming_reply_sent)
					{
						WalRcvForceReply();
						streaming_reply_sent = true;
					}

					/* Do any background tasks that might benefit us later. */
					KnownAssignedTransactionIdsIdleMaintenance();

					/* Update pg_stat_recovery_prefetch before sleeping. */
					XLogPrefetcherComputeStats(xlogprefetcher);

					/*
					 * Wait for more WAL to arrive, when we will be woken
					 * immediately by the WAL receiver.  There is no timeout
					 * on this wait (-1).
					 */
					(void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
									 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
									 -1L,
									 WAIT_EVENT_RECOVERY_WAL_STREAM);
					ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
					break;
				}

			default:
				elog(ERROR, "unexpected WAL source %d", currentSource);
		}

		/*
		 * Check for recovery pause here so that we can confirm more quickly
		 * that a requested pause has actually taken effect.
		 */
		if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
			RECOVERY_NOT_PAUSED)
			recoveryPausesHere(false);

		/*
		 * This possibly-long loop needs to handle interrupts of startup
		 * process.
		 */
		ProcessStartupProcInterrupts();
	}

	/* The loop above only exits via return. */
	return XLREAD_FAIL;			/* not reached */
}
4031 :
4032 :
4033 : /*
4034 : * Determine what log level should be used to report a corrupt WAL record
4035 : * in the current WAL page, previously read by XLogPageRead().
4036 : *
4037 : * 'emode' is the error mode that would be used to report a file-not-found
4038 : * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4039 : * we're retrying the exact same record that we've tried previously, only
4040 : * complain the first time to keep the noise down. However, we only do when
4041 : * reading from pg_wal, because we don't expect any invalid records in archive
4042 : * or in records streamed from the primary. Files in the archive should be complete,
4043 : * and we should never hit the end of WAL because we stop and wait for more WAL
4044 : * to arrive before replaying it.
4045 : *
4046 : * NOTE: This function remembers the RecPtr value it was last called with,
4047 : * to suppress repeated messages about the same record. Only call this when
4048 : * you are about to ereport(), or you might cause a later message to be
4049 : * erroneously suppressed.
4050 : */
4051 : static int
4052 482 : emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
4053 : {
4054 : static XLogRecPtr lastComplaint = 0;
4055 :
4056 482 : if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4057 : {
4058 476 : if (RecPtr == lastComplaint)
4059 104 : emode = DEBUG1;
4060 : else
4061 372 : lastComplaint = RecPtr;
4062 : }
4063 482 : return emode;
4064 : }
4065 :
4066 :
4067 : /*
4068 : * Subroutine to try to fetch and validate a prior checkpoint record.
4069 : */
4070 : static XLogRecord *
4071 1810 : ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
4072 : TimeLineID replayTLI)
4073 : {
4074 : XLogRecord *record;
4075 : uint8 info;
4076 :
4077 : Assert(xlogreader != NULL);
4078 :
4079 1810 : if (!XRecOffIsValid(RecPtr))
4080 : {
4081 0 : ereport(LOG,
4082 : (errmsg("invalid checkpoint location")));
4083 0 : return NULL;
4084 : }
4085 :
4086 1810 : XLogPrefetcherBeginRead(xlogprefetcher, RecPtr);
4087 1810 : record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4088 :
4089 1810 : if (record == NULL)
4090 : {
4091 0 : ereport(LOG,
4092 : (errmsg("invalid checkpoint record")));
4093 0 : return NULL;
4094 : }
4095 1810 : if (record->xl_rmid != RM_XLOG_ID)
4096 : {
4097 0 : ereport(LOG,
4098 : (errmsg("invalid resource manager ID in checkpoint record")));
4099 0 : return NULL;
4100 : }
4101 1810 : info = record->xl_info & ~XLR_INFO_MASK;
4102 1810 : if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4103 : info != XLOG_CHECKPOINT_ONLINE)
4104 : {
4105 0 : ereport(LOG,
4106 : (errmsg("invalid xl_info in checkpoint record")));
4107 0 : return NULL;
4108 : }
4109 1810 : if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
4110 : {
4111 0 : ereport(LOG,
4112 : (errmsg("invalid length of checkpoint record")));
4113 0 : return NULL;
4114 : }
4115 1810 : return record;
4116 : }
4117 :
4118 : /*
4119 : * Scan for new timelines that might have appeared in the archive since we
4120 : * started recovery.
4121 : *
4122 : * If there are any, the function changes recovery target TLI to the latest
4123 : * one and returns 'true'.
4124 : */
4125 : static bool
4126 294 : rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
4127 : {
4128 : List *newExpectedTLEs;
4129 : bool found;
4130 : ListCell *cell;
4131 : TimeLineID newtarget;
4132 294 : TimeLineID oldtarget = recoveryTargetTLI;
4133 294 : TimeLineHistoryEntry *currentTle = NULL;
4134 :
4135 294 : newtarget = findNewestTimeLine(recoveryTargetTLI);
4136 294 : if (newtarget == recoveryTargetTLI)
4137 : {
4138 : /* No new timelines found */
4139 282 : return false;
4140 : }
4141 :
4142 : /*
4143 : * Determine the list of expected TLIs for the new TLI
4144 : */
4145 :
4146 12 : newExpectedTLEs = readTimeLineHistory(newtarget);
4147 :
4148 : /*
4149 : * If the current timeline is not part of the history of the new timeline,
4150 : * we cannot proceed to it.
4151 : */
4152 12 : found = false;
4153 24 : foreach(cell, newExpectedTLEs)
4154 : {
4155 24 : currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4156 :
4157 24 : if (currentTle->tli == recoveryTargetTLI)
4158 : {
4159 12 : found = true;
4160 12 : break;
4161 : }
4162 : }
4163 12 : if (!found)
4164 : {
4165 0 : ereport(LOG,
4166 : (errmsg("new timeline %u is not a child of database system timeline %u",
4167 : newtarget,
4168 : replayTLI)));
4169 0 : return false;
4170 : }
4171 :
4172 : /*
4173 : * The current timeline was found in the history file, but check that the
4174 : * next timeline was forked off from it *after* the current recovery
4175 : * location.
4176 : */
4177 12 : if (currentTle->end < replayLSN)
4178 : {
4179 0 : ereport(LOG,
4180 : (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4181 : newtarget,
4182 : replayTLI,
4183 : LSN_FORMAT_ARGS(replayLSN))));
4184 0 : return false;
4185 : }
4186 :
4187 : /* The new timeline history seems valid. Switch target */
4188 12 : recoveryTargetTLI = newtarget;
4189 12 : list_free_deep(expectedTLEs);
4190 12 : expectedTLEs = newExpectedTLEs;
4191 :
4192 : /*
4193 : * As in StartupXLOG(), try to ensure we have all the history files
4194 : * between the old target and new target in pg_wal.
4195 : */
4196 12 : restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4197 :
4198 12 : ereport(LOG,
4199 : (errmsg("new target timeline is %u",
4200 : recoveryTargetTLI)));
4201 :
4202 12 : return true;
4203 : }
4204 :
4205 :
4206 : /*
4207 : * Open a logfile segment for reading (during recovery).
4208 : *
4209 : * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4210 : * Otherwise, it's assumed to be already available in pg_wal.
4211 : */
4212 : static int
4213 6740 : XLogFileRead(XLogSegNo segno, TimeLineID tli,
4214 : XLogSource source, bool notfoundOk)
4215 : {
4216 : char xlogfname[MAXFNAMELEN];
4217 : char activitymsg[MAXFNAMELEN + 16];
4218 : char path[MAXPGPATH];
4219 : int fd;
4220 :
4221 6740 : XLogFileName(xlogfname, tli, segno, wal_segment_size);
4222 :
4223 6740 : switch (source)
4224 : {
4225 1548 : case XLOG_FROM_ARCHIVE:
4226 : /* Report recovery progress in PS display */
4227 1548 : snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4228 : xlogfname);
4229 1548 : set_ps_display(activitymsg);
4230 :
4231 1548 : if (!RestoreArchivedFile(path, xlogfname,
4232 : "RECOVERYXLOG",
4233 : wal_segment_size,
4234 : InRedo))
4235 826 : return -1;
4236 722 : break;
4237 :
4238 5192 : case XLOG_FROM_PG_WAL:
4239 : case XLOG_FROM_STREAM:
4240 5192 : XLogFilePath(path, tli, segno, wal_segment_size);
4241 5192 : break;
4242 :
4243 0 : default:
4244 0 : elog(ERROR, "invalid XLogFileRead source %d", source);
4245 : }
4246 :
4247 : /*
4248 : * If the segment was fetched from archival storage, replace the existing
4249 : * xlog segment (if any) with the archival version.
4250 : */
4251 5914 : if (source == XLOG_FROM_ARCHIVE)
4252 : {
4253 : Assert(!IsInstallXLogFileSegmentActive());
4254 722 : KeepFileRestoredFromArchive(path, xlogfname);
4255 :
4256 : /*
4257 : * Set path to point at the new file in pg_wal.
4258 : */
4259 722 : snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4260 : }
4261 :
4262 5914 : fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4263 5914 : if (fd >= 0)
4264 : {
4265 : /* Success! */
4266 5568 : curFileTLI = tli;
4267 :
4268 : /* Report recovery progress in PS display */
4269 5568 : snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4270 : xlogfname);
4271 5568 : set_ps_display(activitymsg);
4272 :
4273 : /* Track source of data in assorted state variables */
4274 5568 : readSource = source;
4275 5568 : XLogReceiptSource = source;
4276 : /* In FROM_STREAM case, caller tracks receipt time, not me */
4277 5568 : if (source != XLOG_FROM_STREAM)
4278 3014 : XLogReceiptTime = GetCurrentTimestamp();
4279 :
4280 5568 : return fd;
4281 : }
4282 346 : if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4283 0 : ereport(PANIC,
4284 : (errcode_for_file_access(),
4285 : errmsg("could not open file \"%s\": %m", path)));
4286 346 : return -1;
4287 : }
4288 :
4289 : /*
4290 : * Open a logfile segment for reading (during recovery).
4291 : *
4292 : * This version searches for the segment with any TLI listed in expectedTLEs.
4293 : */
4294 : static int
4295 3344 : XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
4296 : {
4297 : char path[MAXPGPATH];
4298 : ListCell *cell;
4299 : int fd;
4300 : List *tles;
4301 :
4302 : /*
4303 : * Loop looking for a suitable timeline ID: we might need to read any of
4304 : * the timelines listed in expectedTLEs.
4305 : *
4306 : * We expect curFileTLI on entry to be the TLI of the preceding file in
4307 : * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4308 : * to go backwards; this prevents us from picking up the wrong file when a
4309 : * parent timeline extends to higher segment numbers than the child we
4310 : * want to read.
4311 : *
4312 : * If we haven't read the timeline history file yet, read it now, so that
4313 : * we know which TLIs to scan. We don't save the list in expectedTLEs,
4314 : * however, unless we actually find a valid segment. That way if there is
4315 : * neither a timeline history file nor a WAL segment in the archive, and
4316 : * streaming replication is set up, we'll read the timeline history file
4317 : * streamed from the primary when we start streaming, instead of
4318 : * recovering with a dummy history generated here.
4319 : */
4320 3344 : if (expectedTLEs)
4321 1534 : tles = expectedTLEs;
4322 : else
4323 1810 : tles = readTimeLineHistory(recoveryTargetTLI);
4324 :
4325 3704 : foreach(cell, tles)
4326 : {
4327 3382 : TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
4328 3382 : TimeLineID tli = hent->tli;
4329 :
4330 3382 : if (tli < curFileTLI)
4331 8 : break; /* don't bother looking at too-old TLIs */
4332 :
4333 : /*
4334 : * Skip scanning the timeline ID that the logfile segment to read
4335 : * doesn't belong to
4336 : */
4337 3374 : if (hent->begin != InvalidXLogRecPtr)
4338 : {
4339 140 : XLogSegNo beginseg = 0;
4340 :
4341 140 : XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4342 :
4343 : /*
4344 : * The logfile segment that doesn't belong to the timeline is
4345 : * older or newer than the segment that the timeline started or
4346 : * ended at, respectively. It's sufficient to check only the
4347 : * starting segment of the timeline here. Since the timelines are
4348 : * scanned in descending order in this loop, any segments newer
4349 : * than the ending segment should belong to newer timeline and
4350 : * have already been read before. So it's not necessary to check
4351 : * the ending segment of the timeline here.
4352 : */
4353 140 : if (segno < beginseg)
4354 14 : continue;
4355 : }
4356 :
4357 3360 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
4358 : {
4359 1548 : fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4360 1548 : if (fd != -1)
4361 : {
4362 722 : elog(DEBUG1, "got WAL segment from archive");
4363 722 : if (!expectedTLEs)
4364 36 : expectedTLEs = tles;
4365 3014 : return fd;
4366 : }
4367 : }
4368 :
4369 2638 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
4370 : {
4371 2638 : fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4372 2638 : if (fd != -1)
4373 : {
4374 2292 : if (!expectedTLEs)
4375 1774 : expectedTLEs = tles;
4376 2292 : return fd;
4377 : }
4378 : }
4379 : }
4380 :
4381 : /* Couldn't find it. For simplicity, complain about front timeline */
4382 330 : XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
4383 330 : errno = ENOENT;
4384 330 : ereport(DEBUG2,
4385 : (errcode_for_file_access(),
4386 : errmsg("could not open file \"%s\": %m", path)));
4387 330 : return -1;
4388 : }
4389 :
4390 : /*
4391 : * Set flag to signal the walreceiver to restart. (The startup process calls
4392 : * this on noticing a relevant configuration change.)
4393 : */
4394 : void
4395 6 : StartupRequestWalReceiverRestart(void)
4396 : {
4397 6 : if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
4398 : {
4399 6 : ereport(LOG,
4400 : (errmsg("WAL receiver process shutdown requested")));
4401 :
4402 6 : pendingWalRcvRestart = true;
4403 : }
4404 6 : }
4405 :
4406 :
4407 : /*
4408 : * Has a standby promotion already been triggered?
4409 : *
4410 : * Unlike CheckForStandbyTrigger(), this works in any process
4411 : * that's connected to shared memory.
4412 : */
4413 : bool
4414 104 : PromoteIsTriggered(void)
4415 : {
4416 : /*
4417 : * We check shared state each time only until a standby promotion is
4418 : * triggered. We can't trigger a promotion again, so there's no need to
4419 : * keep checking after the shared variable has once been seen true.
4420 : */
4421 104 : if (LocalPromoteIsTriggered)
4422 84 : return true;
4423 :
4424 20 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4425 20 : LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
4426 20 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4427 :
4428 20 : return LocalPromoteIsTriggered;
4429 : }
4430 :
4431 : static void
4432 84 : SetPromoteIsTriggered(void)
4433 : {
4434 84 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4435 84 : XLogRecoveryCtl->SharedPromoteIsTriggered = true;
4436 84 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4437 :
4438 : /*
4439 : * Mark the recovery pause state as 'not paused' because the paused state
4440 : * ends and promotion continues if a promotion is triggered while recovery
4441 : * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4442 : * return 'paused' while a promotion is ongoing.
4443 : */
4444 84 : SetRecoveryPause(false);
4445 :
4446 84 : LocalPromoteIsTriggered = true;
4447 84 : }
4448 :
4449 : /*
4450 : * Check whether a promote request has arrived.
4451 : */
4452 : static bool
4453 10822 : CheckForStandbyTrigger(void)
4454 : {
4455 10822 : if (LocalPromoteIsTriggered)
4456 94 : return true;
4457 :
4458 10728 : if (IsPromoteSignaled() && CheckPromoteSignal())
4459 : {
4460 84 : ereport(LOG, (errmsg("received promote request")));
4461 84 : RemovePromoteSignalFiles();
4462 84 : ResetPromoteSignaled();
4463 84 : SetPromoteIsTriggered();
4464 84 : return true;
4465 : }
4466 :
4467 10644 : return false;
4468 : }
4469 :
4470 : /*
4471 : * Remove the files signaling a standby promotion request.
4472 : */
4473 : void
4474 1760 : RemovePromoteSignalFiles(void)
4475 : {
4476 1760 : unlink(PROMOTE_SIGNAL_FILE);
4477 1760 : }
4478 :
4479 : /*
4480 : * Check to see if a promote request has arrived.
4481 : */
4482 : bool
4483 1546 : CheckPromoteSignal(void)
4484 : {
4485 : struct stat stat_buf;
4486 :
4487 1546 : if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4488 168 : return true;
4489 :
4490 1378 : return false;
4491 : }
4492 :
4493 : /*
4494 : * Wake up startup process to replay newly arrived WAL, or to notice that
4495 : * failover has been requested.
4496 : */
4497 : void
4498 22768 : WakeupRecovery(void)
4499 : {
4500 22768 : SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4501 22768 : }
4502 :
4503 : /*
4504 : * Schedule a walreceiver wakeup in the main recovery loop.
4505 : */
4506 : void
4507 4 : XLogRequestWalReceiverReply(void)
4508 : {
4509 4 : doRequestWalReceiverReply = true;
4510 4 : }
4511 :
4512 : /*
4513 : * Is HotStandby active yet? This is only important in special backends
4514 : * since normal backends won't ever be able to connect until this returns
4515 : * true. Postmaster knows this by way of signal, not via shared memory.
4516 : *
4517 : * Unlike testing standbyState, this works in any process that's connected to
4518 : * shared memory. (And note that standbyState alone doesn't tell the truth
4519 : * anyway.)
4520 : */
4521 : bool
4522 318 : HotStandbyActive(void)
4523 : {
4524 : /*
4525 : * We check shared state each time only until Hot Standby is active. We
4526 : * can't de-activate Hot Standby, so there's no need to keep checking
4527 : * after the shared variable has once been seen true.
4528 : */
4529 318 : if (LocalHotStandbyActive)
4530 46 : return true;
4531 : else
4532 : {
4533 : /* spinlock is essential on machines with weak memory ordering! */
4534 272 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4535 272 : LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
4536 272 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4537 :
4538 272 : return LocalHotStandbyActive;
4539 : }
4540 : }
4541 :
4542 : /*
4543 : * Like HotStandbyActive(), but to be used only in WAL replay code,
4544 : * where we don't need to ask any other process what the state is.
4545 : */
4546 : static bool
4547 0 : HotStandbyActiveInReplay(void)
4548 : {
4549 : Assert(AmStartupProcess() || !IsPostmasterEnvironment);
4550 0 : return LocalHotStandbyActive;
4551 : }
4552 :
4553 : /*
4554 : * Get latest redo apply position.
4555 : *
4556 : * Exported to allow WALReceiver to read the pointer directly.
4557 : */
4558 : XLogRecPtr
4559 70524 : GetXLogReplayRecPtr(TimeLineID *replayTLI)
4560 : {
4561 : XLogRecPtr recptr;
4562 : TimeLineID tli;
4563 :
4564 70524 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4565 70524 : recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
4566 70524 : tli = XLogRecoveryCtl->lastReplayedTLI;
4567 70524 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4568 :
4569 70524 : if (replayTLI)
4570 4296 : *replayTLI = tli;
4571 70524 : return recptr;
4572 : }
4573 :
4574 :
4575 : /*
4576 : * Get position of last applied, or the record being applied.
4577 : *
4578 : * This is different from GetXLogReplayRecPtr() in that if a WAL
4579 : * record is currently being applied, this includes that record.
4580 : */
4581 : XLogRecPtr
4582 11806 : GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
4583 : {
4584 : XLogRecPtr recptr;
4585 : TimeLineID tli;
4586 :
4587 11806 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4588 11806 : recptr = XLogRecoveryCtl->replayEndRecPtr;
4589 11806 : tli = XLogRecoveryCtl->replayEndTLI;
4590 11806 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4591 :
4592 11806 : if (replayEndTLI)
4593 11806 : *replayEndTLI = tli;
4594 11806 : return recptr;
4595 : }
4596 :
4597 : /*
4598 : * Save timestamp of latest processed commit/abort record.
4599 : *
4600 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4601 : * seen by processes other than the startup process. Note in particular
4602 : * that CreateRestartPoint is executed in the checkpointer.
4603 : */
4604 : static void
4605 42006 : SetLatestXTime(TimestampTz xtime)
4606 : {
4607 42006 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4608 42006 : XLogRecoveryCtl->recoveryLastXTime = xtime;
4609 42006 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4610 42006 : }
4611 :
4612 : /*
4613 : * Fetch timestamp of latest processed commit/abort record.
4614 : */
4615 : TimestampTz
4616 668 : GetLatestXTime(void)
4617 : {
4618 : TimestampTz xtime;
4619 :
4620 668 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4621 668 : xtime = XLogRecoveryCtl->recoveryLastXTime;
4622 668 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4623 :
4624 668 : return xtime;
4625 : }
4626 :
4627 : /*
4628 : * Save timestamp of the next chunk of WAL records to apply.
4629 : *
4630 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4631 : * seen by all backends.
4632 : */
4633 : static void
4634 9016 : SetCurrentChunkStartTime(TimestampTz xtime)
4635 : {
4636 9016 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4637 9016 : XLogRecoveryCtl->currentChunkStartTime = xtime;
4638 9016 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4639 9016 : }
4640 :
4641 : /*
4642 : * Fetch timestamp of latest processed commit/abort record.
4643 : * Startup process maintains an accurate local copy in XLogReceiptTime
4644 : */
4645 : TimestampTz
4646 270 : GetCurrentChunkReplayStartTime(void)
4647 : {
4648 : TimestampTz xtime;
4649 :
4650 270 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4651 270 : xtime = XLogRecoveryCtl->currentChunkStartTime;
4652 270 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4653 :
4654 270 : return xtime;
4655 : }
4656 :
4657 : /*
4658 : * Returns time of receipt of current chunk of XLOG data, as well as
4659 : * whether it was received from streaming replication or from archives.
4660 : */
4661 : void
4662 58 : GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4663 : {
4664 : /*
4665 : * This must be executed in the startup process, since we don't export the
4666 : * relevant state to shared memory.
4667 : */
4668 : Assert(InRecovery);
4669 :
4670 58 : *rtime = XLogReceiptTime;
4671 58 : *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4672 58 : }
4673 :
4674 : /*
4675 : * Note that text field supplied is a parameter name and does not require
4676 : * translation
4677 : */
4678 : void
4679 1210 : RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4680 : {
4681 1210 : if (currValue < minValue)
4682 : {
4683 0 : if (HotStandbyActiveInReplay())
4684 : {
4685 0 : bool warned_for_promote = false;
4686 :
4687 0 : ereport(WARNING,
4688 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4689 : errmsg("hot standby is not possible because of insufficient parameter settings"),
4690 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4691 : param_name,
4692 : currValue,
4693 : minValue)));
4694 :
4695 0 : SetRecoveryPause(true);
4696 :
4697 0 : ereport(LOG,
4698 : (errmsg("recovery has paused"),
4699 : errdetail("If recovery is unpaused, the server will shut down."),
4700 : errhint("You can then restart the server after making the necessary configuration changes.")));
4701 :
4702 0 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
4703 : {
4704 0 : ProcessStartupProcInterrupts();
4705 :
4706 0 : if (CheckForStandbyTrigger())
4707 : {
4708 0 : if (!warned_for_promote)
4709 0 : ereport(WARNING,
4710 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4711 : errmsg("promotion is not possible because of insufficient parameter settings"),
4712 :
4713 : /*
4714 : * Repeat the detail from above so it's easy to find
4715 : * in the log.
4716 : */
4717 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4718 : param_name,
4719 : currValue,
4720 : minValue),
4721 : errhint("Restart the server after making the necessary configuration changes.")));
4722 0 : warned_for_promote = true;
4723 : }
4724 :
4725 : /*
4726 : * If recovery pause is requested then set it paused. While
4727 : * we are in the loop, user might resume and pause again so
4728 : * set this every time.
4729 : */
4730 0 : ConfirmRecoveryPaused();
4731 :
4732 : /*
4733 : * We wait on a condition variable that will wake us as soon
4734 : * as the pause ends, but we use a timeout so we can check the
4735 : * above conditions periodically too.
4736 : */
4737 0 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
4738 : WAIT_EVENT_RECOVERY_PAUSE);
4739 : }
4740 0 : ConditionVariableCancelSleep();
4741 : }
4742 :
4743 0 : ereport(FATAL,
4744 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4745 : errmsg("recovery aborted because of insufficient parameter settings"),
4746 : /* Repeat the detail from above so it's easy to find in the log. */
4747 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4748 : param_name,
4749 : currValue,
4750 : minValue),
4751 : errhint("You can restart the server after making the necessary configuration changes.")));
4752 : }
4753 1210 : }
4754 :
4755 :
4756 : /*
4757 : * GUC check_hook for primary_slot_name
4758 : */
4759 : bool
4760 2460 : check_primary_slot_name(char **newval, void **extra, GucSource source)
4761 : {
4762 2460 : if (*newval && strcmp(*newval, "") != 0 &&
4763 290 : !ReplicationSlotValidateName(*newval, WARNING))
4764 0 : return false;
4765 :
4766 2460 : return true;
4767 : }
4768 :
4769 : /*
4770 : * Recovery target settings: Only one of the several recovery_target* settings
4771 : * may be set. Setting a second one results in an error. The global variable
4772 : * recoveryTarget tracks which kind of recovery target was chosen. Other
4773 : * variables store the actual target value (for example a string or a xid).
4774 : * The assign functions of the parameters check whether a competing parameter
4775 : * was already set. But we want to allow setting the same parameter multiple
4776 : * times. We also want to allow unsetting a parameter and setting a different
4777 : * one, so we unset recoveryTarget when the parameter is set to an empty
4778 : * string.
4779 : *
4780 : * XXX this code is broken by design. Throwing an error from a GUC assign
4781 : * hook breaks fundamental assumptions of guc.c. So long as all the variables
4782 : * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4783 : * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4784 : * that we have odd behaviors such as unexpected GUC ordering dependencies.
4785 : */
4786 :
4787 : pg_noreturn static void
4788 2 : error_multiple_recovery_targets(void)
4789 : {
4790 2 : ereport(ERROR,
4791 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4792 : errmsg("multiple recovery targets specified"),
4793 : errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4794 : }
4795 :
4796 : /*
4797 : * GUC check_hook for recovery_target
4798 : */
4799 : bool
4800 2172 : check_recovery_target(char **newval, void **extra, GucSource source)
4801 : {
4802 2172 : if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4803 : {
4804 0 : GUC_check_errdetail("The only allowed value is \"immediate\".");
4805 0 : return false;
4806 : }
4807 2172 : return true;
4808 : }
4809 :
4810 : /*
4811 : * GUC assign_hook for recovery_target
4812 : */
4813 : void
4814 2172 : assign_recovery_target(const char *newval, void *extra)
4815 : {
4816 2172 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4817 0 : recoveryTarget != RECOVERY_TARGET_IMMEDIATE)
4818 0 : error_multiple_recovery_targets();
4819 :
4820 2172 : if (newval && strcmp(newval, "") != 0)
4821 2 : recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
4822 : else
4823 2170 : recoveryTarget = RECOVERY_TARGET_UNSET;
4824 2172 : }
4825 :
4826 : /*
4827 : * GUC check_hook for recovery_target_lsn
4828 : */
4829 : bool
4830 2184 : check_recovery_target_lsn(char **newval, void **extra, GucSource source)
4831 : {
4832 2184 : if (strcmp(*newval, "") != 0)
4833 : {
4834 : XLogRecPtr lsn;
4835 : XLogRecPtr *myextra;
4836 18 : bool have_error = false;
4837 :
4838 18 : lsn = pg_lsn_in_internal(*newval, &have_error);
4839 18 : if (have_error)
4840 0 : return false;
4841 :
4842 18 : myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
4843 18 : if (!myextra)
4844 0 : return false;
4845 18 : *myextra = lsn;
4846 18 : *extra = myextra;
4847 : }
4848 2184 : return true;
4849 : }
4850 :
4851 : /*
4852 : * GUC assign_hook for recovery_target_lsn
4853 : */
4854 : void
4855 2184 : assign_recovery_target_lsn(const char *newval, void *extra)
4856 : {
4857 2184 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4858 0 : recoveryTarget != RECOVERY_TARGET_LSN)
4859 0 : error_multiple_recovery_targets();
4860 :
4861 2184 : if (newval && strcmp(newval, "") != 0)
4862 : {
4863 18 : recoveryTarget = RECOVERY_TARGET_LSN;
4864 18 : recoveryTargetLSN = *((XLogRecPtr *) extra);
4865 : }
4866 : else
4867 2166 : recoveryTarget = RECOVERY_TARGET_UNSET;
4868 2184 : }
4869 :
4870 : /*
4871 : * GUC check_hook for recovery_target_name
4872 : */
4873 : bool
4874 2184 : check_recovery_target_name(char **newval, void **extra, GucSource source)
4875 : {
4876 : /* Use the value of newval directly */
4877 2184 : if (strlen(*newval) >= MAXFNAMELEN)
4878 : {
4879 0 : GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4880 : "recovery_target_name", MAXFNAMELEN - 1);
4881 0 : return false;
4882 : }
4883 2184 : return true;
4884 : }
4885 :
4886 : /*
4887 : * GUC assign_hook for recovery_target_name
4888 : */
4889 : void
4890 2184 : assign_recovery_target_name(const char *newval, void *extra)
4891 : {
4892 2184 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4893 0 : recoveryTarget != RECOVERY_TARGET_NAME)
4894 0 : error_multiple_recovery_targets();
4895 :
4896 2184 : if (newval && strcmp(newval, "") != 0)
4897 : {
4898 12 : recoveryTarget = RECOVERY_TARGET_NAME;
4899 12 : recoveryTargetName = newval;
4900 : }
4901 : else
4902 2172 : recoveryTarget = RECOVERY_TARGET_UNSET;
4903 2184 : }
4904 :
4905 : /*
4906 : * GUC check_hook for recovery_target_time
4907 : *
4908 : * The interpretation of the recovery_target_time string can depend on the
4909 : * time zone setting, so we need to wait until after all GUC processing is
4910 : * done before we can do the final parsing of the string. This check function
4911 : * only does a parsing pass to catch syntax errors, but we store the string
4912 : * and parse it again when we need to use it.
4913 : */
4914 : bool
4915 2176 : check_recovery_target_time(char **newval, void **extra, GucSource source)
4916 : {
4917 2176 : if (strcmp(*newval, "") != 0)
4918 : {
4919 : /* reject some special values */
4920 6 : if (strcmp(*newval, "now") == 0 ||
4921 6 : strcmp(*newval, "today") == 0 ||
4922 6 : strcmp(*newval, "tomorrow") == 0 ||
4923 6 : strcmp(*newval, "yesterday") == 0)
4924 : {
4925 0 : return false;
4926 : }
4927 :
4928 : /*
4929 : * parse timestamp value (see also timestamptz_in())
4930 : */
4931 : {
4932 6 : char *str = *newval;
4933 : fsec_t fsec;
4934 : struct pg_tm tt,
4935 6 : *tm = &tt;
4936 : int tz;
4937 : int dtype;
4938 : int nf;
4939 : int dterr;
4940 : char *field[MAXDATEFIELDS];
4941 : int ftype[MAXDATEFIELDS];
4942 : char workbuf[MAXDATELEN + MAXDATEFIELDS];
4943 : DateTimeErrorExtra dtextra;
4944 : TimestampTz timestamp;
4945 :
4946 6 : dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4947 : field, ftype, MAXDATEFIELDS, &nf);
4948 6 : if (dterr == 0)
4949 6 : dterr = DecodeDateTime(field, ftype, nf,
4950 : &dtype, tm, &fsec, &tz, &dtextra);
4951 6 : if (dterr != 0)
4952 0 : return false;
4953 6 : if (dtype != DTK_DATE)
4954 0 : return false;
4955 :
4956 6 : if (tm2timestamp(tm, fsec, &tz, ×tamp) != 0)
4957 : {
4958 0 : GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
4959 0 : return false;
4960 : }
4961 : }
4962 : }
4963 2176 : return true;
4964 : }
4965 :
4966 : /*
4967 : * GUC assign_hook for recovery_target_time
4968 : */
4969 : void
4970 2176 : assign_recovery_target_time(const char *newval, void *extra)
4971 : {
4972 2176 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4973 2 : recoveryTarget != RECOVERY_TARGET_TIME)
4974 2 : error_multiple_recovery_targets();
4975 :
4976 2174 : if (newval && strcmp(newval, "") != 0)
4977 4 : recoveryTarget = RECOVERY_TARGET_TIME;
4978 : else
4979 2170 : recoveryTarget = RECOVERY_TARGET_UNSET;
4980 2174 : }
4981 :
4982 : /*
4983 : * GUC check_hook for recovery_target_timeline
4984 : */
4985 : bool
4986 2172 : check_recovery_target_timeline(char **newval, void **extra, GucSource source)
4987 : {
4988 : RecoveryTargetTimeLineGoal rttg;
4989 : RecoveryTargetTimeLineGoal *myextra;
4990 :
4991 2172 : if (strcmp(*newval, "current") == 0)
4992 0 : rttg = RECOVERY_TARGET_TIMELINE_CONTROLFILE;
4993 2172 : else if (strcmp(*newval, "latest") == 0)
4994 2172 : rttg = RECOVERY_TARGET_TIMELINE_LATEST;
4995 : else
4996 : {
4997 0 : rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
4998 :
4999 0 : errno = 0;
5000 0 : strtoul(*newval, NULL, 0);
5001 0 : if (errno == EINVAL || errno == ERANGE)
5002 : {
5003 0 : GUC_check_errdetail("\"recovery_target_timeline\" is not a valid number.");
5004 0 : return false;
5005 : }
5006 : }
5007 :
5008 2172 : myextra = (RecoveryTargetTimeLineGoal *) guc_malloc(LOG, sizeof(RecoveryTargetTimeLineGoal));
5009 2172 : if (!myextra)
5010 0 : return false;
5011 2172 : *myextra = rttg;
5012 2172 : *extra = myextra;
5013 :
5014 2172 : return true;
5015 : }
5016 :
5017 : /*
5018 : * GUC assign_hook for recovery_target_timeline
5019 : */
5020 : void
5021 2172 : assign_recovery_target_timeline(const char *newval, void *extra)
5022 : {
5023 2172 : recoveryTargetTimeLineGoal = *((RecoveryTargetTimeLineGoal *) extra);
5024 2172 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
5025 0 : recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5026 : else
5027 2172 : recoveryTargetTLIRequested = 0;
5028 2172 : }
5029 :
5030 : /*
5031 : * GUC check_hook for recovery_target_xid
5032 : */
5033 : bool
5034 2172 : check_recovery_target_xid(char **newval, void **extra, GucSource source)
5035 : {
5036 2172 : if (strcmp(*newval, "") != 0)
5037 : {
5038 : TransactionId xid;
5039 : TransactionId *myextra;
5040 :
5041 2 : errno = 0;
5042 2 : xid = (TransactionId) strtou64(*newval, NULL, 0);
5043 2 : if (errno == EINVAL || errno == ERANGE)
5044 0 : return false;
5045 :
5046 2 : myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId));
5047 2 : if (!myextra)
5048 0 : return false;
5049 2 : *myextra = xid;
5050 2 : *extra = myextra;
5051 : }
5052 2172 : return true;
5053 : }
5054 :
5055 : /*
5056 : * GUC assign_hook for recovery_target_xid
5057 : */
5058 : void
5059 2172 : assign_recovery_target_xid(const char *newval, void *extra)
5060 : {
5061 2172 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
5062 0 : recoveryTarget != RECOVERY_TARGET_XID)
5063 0 : error_multiple_recovery_targets();
5064 :
5065 2172 : if (newval && strcmp(newval, "") != 0)
5066 : {
5067 2 : recoveryTarget = RECOVERY_TARGET_XID;
5068 2 : recoveryTargetXid = *((TransactionId *) extra);
5069 : }
5070 : else
5071 2170 : recoveryTarget = RECOVERY_TARGET_UNSET;
5072 2172 : }
|