Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * xlogrecovery.c
4 : * Functions for WAL recovery, standby mode
5 : *
6 : * This source file contains functions controlling WAL recovery.
7 : * InitWalRecovery() initializes the system for crash or archive recovery,
8 : * or standby mode, depending on configuration options and the state of
9 : * the control file and possible backup label file. PerformWalRecovery()
10 : * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 : * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 : * and prepares information needed to initialize the WAL for writes. In
13 : * addition to these three main functions, there are a bunch of functions
14 : * for interrogating recovery state and controlling the recovery process.
15 : *
16 : *
17 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
18 : * Portions Copyright (c) 1994, Regents of the University of California
19 : *
20 : * src/backend/access/transam/xlogrecovery.c
21 : *
22 : *-------------------------------------------------------------------------
23 : */
24 :
25 : #include "postgres.h"
26 :
27 : #include <ctype.h>
28 : #include <time.h>
29 : #include <sys/stat.h>
30 : #include <sys/time.h>
31 : #include <unistd.h>
32 :
33 : #include "access/timeline.h"
34 : #include "access/transam.h"
35 : #include "access/xact.h"
36 : #include "access/xlog_internal.h"
37 : #include "access/xlogarchive.h"
38 : #include "access/xlogprefetcher.h"
39 : #include "access/xlogreader.h"
40 : #include "access/xlogrecovery.h"
41 : #include "access/xlogutils.h"
42 : #include "access/xlogwait.h"
43 : #include "backup/basebackup.h"
44 : #include "catalog/pg_control.h"
45 : #include "commands/tablespace.h"
46 : #include "common/file_utils.h"
47 : #include "miscadmin.h"
48 : #include "nodes/miscnodes.h"
49 : #include "pgstat.h"
50 : #include "postmaster/bgwriter.h"
51 : #include "postmaster/startup.h"
52 : #include "replication/slot.h"
53 : #include "replication/slotsync.h"
54 : #include "replication/walreceiver.h"
55 : #include "storage/fd.h"
56 : #include "storage/ipc.h"
57 : #include "storage/latch.h"
58 : #include "storage/pmsignal.h"
59 : #include "storage/procarray.h"
60 : #include "storage/spin.h"
61 : #include "utils/datetime.h"
62 : #include "utils/fmgrprotos.h"
63 : #include "utils/guc_hooks.h"
64 : #include "utils/pgstat_internal.h"
65 : #include "utils/pg_lsn.h"
66 : #include "utils/ps_status.h"
67 : #include "utils/pg_rusage.h"
68 :
69 : /* Unsupported old recovery command file names (relative to $PGDATA) */
70 : #define RECOVERY_COMMAND_FILE "recovery.conf"
71 : #define RECOVERY_COMMAND_DONE "recovery.done"
72 :
73 : /*
74 : * GUC support
75 : */
76 : const struct config_enum_entry recovery_target_action_options[] = {
77 : {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
78 : {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
79 : {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
80 : {NULL, 0, false}
81 : };
82 :
83 : /* options formerly taken from recovery.conf for archive recovery */
84 : char *recoveryRestoreCommand = NULL;
85 : char *recoveryEndCommand = NULL;
86 : char *archiveCleanupCommand = NULL;
87 : RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
88 : bool recoveryTargetInclusive = true;
89 : int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
90 : TransactionId recoveryTargetXid;
91 : char *recovery_target_time_string;
92 : TimestampTz recoveryTargetTime;
93 : const char *recoveryTargetName;
94 : XLogRecPtr recoveryTargetLSN;
95 : int recovery_min_apply_delay = 0;
96 :
97 : /* options formerly taken from recovery.conf for XLOG streaming */
98 : char *PrimaryConnInfo = NULL;
99 : char *PrimarySlotName = NULL;
100 : bool wal_receiver_create_temp_slot = false;
101 :
102 : /*
103 : * recoveryTargetTimeLineGoal: what the user requested, if any
104 : *
105 : * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
106 : *
107 : * recoveryTargetTLI: the currently understood target timeline; may change during recovery
108 : *
109 : * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
110 : * the timelines of its known parents, newest first (so recoveryTargetTLI is
111 : * always the first list member). Only these TLIs are expected to be seen in
112 : * the WAL segments we read, and indeed only these TLIs will be considered as
113 : * candidate WAL files to open at all.
114 : *
115 : * curFileTLI: the TLI appearing in the name of the current input WAL file.
116 : * (This is not necessarily the same as the timeline from which we are
117 : * replaying WAL, which StartupXLOG calls replayTLI, because we could be
118 : * scanning data that was copied from an ancestor timeline when the current
119 : * file was created.) During a sequential scan we do not allow this value
120 : * to decrease.
121 : */
122 : RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
123 : TimeLineID recoveryTargetTLIRequested = 0;
124 : TimeLineID recoveryTargetTLI = 0;
125 : static List *expectedTLEs;
126 : static TimeLineID curFileTLI;
127 :
128 : /*
129 : * When ArchiveRecoveryRequested is set, archive recovery was requested,
130 : * ie. signal files were present. When InArchiveRecovery is set, we are
131 : * currently recovering using offline XLOG archives. These variables are only
132 : * valid in the startup process.
133 : *
134 : * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
135 : * currently performing crash recovery using only XLOG files in pg_wal, but
136 : * will switch to using offline XLOG archives as soon as we reach the end of
137 : * WAL in pg_wal.
138 : */
139 : bool ArchiveRecoveryRequested = false;
140 : bool InArchiveRecovery = false;
141 :
142 : /*
143 : * When StandbyModeRequested is set, standby mode was requested, i.e.
144 : * standby.signal file was present. When StandbyMode is set, we are currently
145 : * in standby mode. These variables are only valid in the startup process.
146 : * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
147 : */
148 : static bool StandbyModeRequested = false;
149 : bool StandbyMode = false;
150 :
151 : /* was a signal file present at startup? */
152 : static bool standby_signal_file_found = false;
153 : static bool recovery_signal_file_found = false;
154 :
155 : /*
156 : * CheckPointLoc is the position of the checkpoint record that determines
157 : * where to start the replay. It comes from the backup label file or the
158 : * control file.
159 : *
160 : * RedoStartLSN is the checkpoint's REDO location, also from the backup label
161 : * file or the control file. In standby mode, XLOG streaming usually starts
162 : * from the position where an invalid record was found. But if we fail to
163 : * read even the initial checkpoint record, we use the REDO location instead
164 : * of the checkpoint location as the start position of XLOG streaming.
165 : * Otherwise we would have to jump backwards to the REDO location after
166 : * reading the checkpoint record, because the REDO record can precede the
167 : * checkpoint record.
168 : */
169 : static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
170 : static TimeLineID CheckPointTLI = 0;
171 : static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
172 : static TimeLineID RedoStartTLI = 0;
173 :
174 : /*
175 : * Local copy of SharedHotStandbyActive variable. False actually means "not
176 : * known, need to check the shared state".
177 : */
178 : static bool LocalHotStandbyActive = false;
179 :
180 : /*
181 : * Local copy of SharedPromoteIsTriggered variable. False actually means "not
182 : * known, need to check the shared state".
183 : */
184 : static bool LocalPromoteIsTriggered = false;
185 :
186 : /* Has the recovery code requested a walreceiver wakeup? */
187 : static bool doRequestWalReceiverReply;
188 :
189 : /* XLogReader object used to parse the WAL records */
190 : static XLogReaderState *xlogreader = NULL;
191 :
192 : /* XLogPrefetcher object used to consume WAL records with read-ahead */
193 : static XLogPrefetcher *xlogprefetcher = NULL;
194 :
195 : /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
196 : typedef struct XLogPageReadPrivate
197 : {
198 : int emode;
199 : bool fetching_ckpt; /* are we fetching a checkpoint record? */
200 : bool randAccess;
201 : TimeLineID replayTLI;
202 : } XLogPageReadPrivate;
203 :
204 : /* flag to tell XLogPageRead that we have started replaying */
205 : static bool InRedo = false;
206 :
/*
 * Identifies where a WAL file was obtained from during recovery, or where
 * we should try to obtain one from.
 *
 * xlogSourceNames[] below is keyed by these values; add a name there
 * whenever a new source is added here.
 */
typedef enum
{
	XLOG_FROM_ANY = 0,			/* request to read WAL from any source */
	XLOG_FROM_ARCHIVE,			/* restored using restore_command */
	XLOG_FROM_PG_WAL,			/* existing file in pg_wal */
	XLOG_FROM_STREAM,			/* streamed from primary */
} XLogSource;

/* human-readable names for XLogSources, for debugging output */
static const char *const xlogSourceNames[] = {
	[XLOG_FROM_ANY] = "any",
	[XLOG_FROM_ARCHIVE] = "archive",
	[XLOG_FROM_PG_WAL] = "pg_wal",
	[XLOG_FROM_STREAM] = "stream",
};
221 :
222 : /*
223 : * readFile is -1 or a kernel FD for the log file segment that's currently
224 : * open for reading. readSegNo identifies the segment. readOff is the offset
225 : * of the page just read, readLen indicates how much of it has been read into
226 : * readBuf, and readSource indicates where we got the currently open file from.
227 : *
228 : * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
229 : * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
230 : * worthwhile, since the XLOG is not read by general-purpose sessions.
231 : */
232 : static int readFile = -1;
233 : static XLogSegNo readSegNo = 0;
234 : static uint32 readOff = 0;
235 : static uint32 readLen = 0;
236 : static XLogSource readSource = XLOG_FROM_ANY;
237 :
238 : /*
239 : * Keeps track of which source we're currently reading from. This is
240 : * different from readSource in that this is always set, even when we don't
241 : * currently have a WAL file open. If lastSourceFailed is set, our last
242 : * attempt to read from currentSource failed, and we should try another source
243 : * next.
244 : *
245 : * pendingWalRcvRestart is set when a config change occurs that requires a
246 : * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
247 : */
248 : static XLogSource currentSource = XLOG_FROM_ANY;
249 : static bool lastSourceFailed = false;
250 : static bool pendingWalRcvRestart = false;
251 :
252 : /*
253 : * These variables track when we last obtained some WAL data to process,
254 : * and where we got it from. (XLogReceiptSource is initially the same as
255 : * readSource, but readSource gets reset to zero when we don't have data
256 : * to process right now. It is also different from currentSource, which
257 : * also changes when we try to read from a source and fail, while
258 : * XLogReceiptSource tracks where we last successfully read some WAL.)
259 : */
260 : static TimestampTz XLogReceiptTime = 0;
261 : static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
262 :
263 : /* Local copy of WalRcv->flushedUpto */
264 : static XLogRecPtr flushedUpto = InvalidXLogRecPtr;
265 : static TimeLineID receiveTLI = 0;
266 :
267 : /*
268 : * Copy of minRecoveryPoint and backupEndPoint from the control file.
269 : *
270 : * In order to reach consistency, we must replay the WAL up to
271 : * minRecoveryPoint. If backupEndRequired is true, we must also reach
272 : * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
273 : * to backupStartPoint.
274 : *
275 : * Note: In archive recovery, after consistency has been reached, the
276 : * functions in xlog.c will start updating minRecoveryPoint in the control
277 : * file. But this copy of minRecoveryPoint variable reflects the value at the
278 : * beginning of recovery, and is *not* updated after consistency is reached.
279 : */
280 : static XLogRecPtr minRecoveryPoint;
281 : static TimeLineID minRecoveryPointTLI;
282 :
283 : static XLogRecPtr backupStartPoint;
284 : static XLogRecPtr backupEndPoint;
285 : static bool backupEndRequired = false;
286 :
287 : /*
288 : * Have we reached a consistent database state? In crash recovery, we have
289 : * to replay all the WAL, so reachedConsistency is never set. During archive
290 : * recovery, the database is consistent once minRecoveryPoint is reached.
291 : *
292 : * Consistent state means that the system is internally consistent, all
293 : * the WAL has been replayed up to a certain point, and importantly, there
294 : * is no trace of later actions on disk.
295 : *
296 : * This flag is used only by the startup process and postmaster. When
297 : * minRecoveryPoint is reached, the startup process sets it to true and
298 : * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
299 : * which then sets it to true upon receiving the signal.
300 : */
301 : bool reachedConsistency = false;
302 :
303 : /* Buffers dedicated to consistency checks of size BLCKSZ */
304 : static char *replay_image_masked = NULL;
305 : static char *primary_image_masked = NULL;
306 :
307 :
308 : /*
309 : * Shared-memory state for WAL recovery.
310 : */
311 : typedef struct XLogRecoveryCtlData
312 : {
313 : /*
314 : * SharedHotStandbyActive indicates if we allow hot standby queries to be
315 : * run. Protected by info_lck.
316 : */
317 : bool SharedHotStandbyActive;
318 :
319 : /*
320 : * SharedPromoteIsTriggered indicates if a standby promotion has been
321 : * triggered. Protected by info_lck.
322 : */
323 : bool SharedPromoteIsTriggered;
324 :
325 : /*
326 : * recoveryWakeupLatch is used to wake up the startup process to continue
327 : * WAL replay, if it is waiting for WAL to arrive or promotion to be
328 : * requested.
329 : *
330 : * Note that the startup process also uses another latch, its procLatch,
331 : * to wait for recovery conflict. If we get rid of recoveryWakeupLatch for
332 : * signaling the startup process in favor of using its procLatch, which
333 : * comports better with possible generic signal handlers using that latch.
334 : * But we should not do that because the startup process doesn't assume
335 : * that it's waken up by walreceiver process or SIGHUP signal handler
336 : * while it's waiting for recovery conflict. The separate latches,
337 : * recoveryWakeupLatch and procLatch, should be used for inter-process
338 : * communication for WAL replay and recovery conflict, respectively.
339 : */
340 : Latch recoveryWakeupLatch;
341 :
342 : /*
343 : * Last record successfully replayed.
344 : */
345 : XLogRecPtr lastReplayedReadRecPtr; /* start position */
346 : XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
347 : TimeLineID lastReplayedTLI; /* timeline */
348 :
349 : /*
350 : * When we're currently replaying a record, ie. in a redo function,
351 : * replayEndRecPtr points to the end+1 of the record being replayed,
352 : * otherwise it's equal to lastReplayedEndRecPtr.
353 : */
354 : XLogRecPtr replayEndRecPtr;
355 : TimeLineID replayEndTLI;
356 : /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
357 : TimestampTz recoveryLastXTime;
358 :
359 : /*
360 : * timestamp of when we started replaying the current chunk of WAL data,
361 : * only relevant for replication or archive recovery
362 : */
363 : TimestampTz currentChunkStartTime;
364 : /* Recovery pause state */
365 : RecoveryPauseState recoveryPauseState;
366 : ConditionVariable recoveryNotPausedCV;
367 :
368 : slock_t info_lck; /* locks shared variables shown above */
369 : } XLogRecoveryCtlData;
370 :
371 : static XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
372 :
373 : /*
374 : * abortedRecPtr is the start pointer of a broken record at end of WAL when
375 : * recovery completes; missingContrecPtr is the location of the first
376 : * contrecord that went missing. See CreateOverwriteContrecordRecord for
377 : * details.
378 : */
379 : static XLogRecPtr abortedRecPtr;
380 : static XLogRecPtr missingContrecPtr;
381 :
382 : /*
383 : * if recoveryStopsBefore/After returns true, it saves information of the stop
384 : * point here
385 : */
386 : static TransactionId recoveryStopXid;
387 : static TimestampTz recoveryStopTime;
388 : static XLogRecPtr recoveryStopLSN;
389 : static char recoveryStopName[MAXFNAMELEN];
390 : static bool recoveryStopAfter;
391 :
392 : /* prototypes for local functions */
393 : static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
394 :
395 : static void EnableStandbyMode(void);
396 : static void readRecoverySignalFile(void);
397 : static void validateRecoveryParameters(void);
398 : static bool read_backup_label(XLogRecPtr *checkPointLoc,
399 : TimeLineID *backupLabelTLI,
400 : bool *backupEndRequired, bool *backupFromStandby);
401 : static bool read_tablespace_map(List **tablespaces);
402 :
403 : static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
404 : static void CheckRecoveryConsistency(void);
405 : static void rm_redo_error_callback(void *arg);
406 : #ifdef WAL_DEBUG
407 : static void xlog_outrec(StringInfo buf, XLogReaderState *record);
408 : #endif
409 : static void xlog_block_info(StringInfo buf, XLogReaderState *record);
410 : static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
411 : TimeLineID prevTLI, TimeLineID replayTLI);
412 : static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
413 : static void verifyBackupPageConsistency(XLogReaderState *record);
414 :
415 : static bool recoveryStopsBefore(XLogReaderState *record);
416 : static bool recoveryStopsAfter(XLogReaderState *record);
417 : static char *getRecoveryStopReason(void);
418 : static void recoveryPausesHere(bool endOfRecovery);
419 : static bool recoveryApplyDelay(XLogReaderState *record);
420 : static void ConfirmRecoveryPaused(void);
421 :
422 : static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher,
423 : int emode, bool fetching_ckpt,
424 : TimeLineID replayTLI);
425 :
426 : static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
427 : int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
428 : static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
429 : bool randAccess,
430 : bool fetching_ckpt,
431 : XLogRecPtr tliRecPtr,
432 : TimeLineID replayTLI,
433 : XLogRecPtr replayLSN,
434 : bool nonblocking);
435 : static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
436 : static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
437 : XLogRecPtr RecPtr, TimeLineID replayTLI);
438 : static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
439 : static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
440 : XLogSource source, bool notfoundOk);
441 : static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source);
442 :
443 : static bool CheckForStandbyTrigger(void);
444 : static void SetPromoteIsTriggered(void);
445 : static bool HotStandbyActiveInReplay(void);
446 :
447 : static void SetCurrentChunkStartTime(TimestampTz xtime);
448 : static void SetLatestXTime(TimestampTz xtime);
449 :
450 : /*
451 : * Initialization of shared memory for WAL recovery
452 : */
453 : Size
454 3297 : XLogRecoveryShmemSize(void)
455 : {
456 : Size size;
457 :
458 : /* XLogRecoveryCtl */
459 3297 : size = sizeof(XLogRecoveryCtlData);
460 :
461 3297 : return size;
462 : }
463 :
464 : void
465 1150 : XLogRecoveryShmemInit(void)
466 : {
467 : bool found;
468 :
469 1150 : XLogRecoveryCtl = (XLogRecoveryCtlData *)
470 1150 : ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
471 1150 : if (found)
472 0 : return;
473 1150 : memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
474 :
475 1150 : SpinLockInit(&XLogRecoveryCtl->info_lck);
476 1150 : InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
477 1150 : ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
478 : }
479 :
480 : /*
481 : * A thin wrapper to enable StandbyMode and do other preparatory work as
482 : * needed.
483 : */
484 : static void
485 113 : EnableStandbyMode(void)
486 : {
487 113 : StandbyMode = true;
488 :
489 : /*
490 : * To avoid server log bloat, we don't report recovery progress in a
491 : * standby as it will always be in recovery unless promoted. We disable
492 : * startup progress timeout in standby mode to avoid calling
493 : * startup_progress_timeout_handler() unnecessarily.
494 : */
495 113 : disable_startup_progress_timeout();
496 113 : }
497 :
498 : /*
499 : * Prepare the system for WAL recovery, if needed.
500 : *
501 : * This is called by StartupXLOG() which coordinates the server startup
502 : * sequence. This function analyzes the control file and the backup label
503 : * file, if any, and figures out whether we need to perform crash recovery or
504 : * archive recovery, and how far we need to replay the WAL to reach a
505 : * consistent state.
506 : *
507 : * This doesn't yet change the on-disk state, except for creating the symlinks
508 : * from table space map file if any, and for fetching WAL files needed to find
509 : * the checkpoint record. On entry, the caller has already read the control
510 : * file into memory, and passes it as argument. This function updates it to
511 : * reflect the recovery state, and the caller is expected to write it back to
512 : * disk after initializing other subsystems, but before calling
513 : * PerformWalRecovery().
514 : *
515 : * This initializes some global variables, such as ArchiveRecoveryRequested,
516 : * StandbyModeRequested and InRecovery.
517 : */
518 : void
519 1006 : InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
520 : bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
521 : {
522 : XLogPageReadPrivate *private;
523 : struct stat st;
524 : bool wasShutdown;
525 : XLogRecord *record;
526 : DBState dbstate_at_startup;
527 1006 : bool haveTblspcMap = false;
528 1006 : bool haveBackupLabel = false;
529 : CheckPoint checkPoint;
530 1006 : bool backupFromStandby = false;
531 :
532 1006 : dbstate_at_startup = ControlFile->state;
533 :
534 : /*
535 : * Initialize on the assumption we want to recover to the latest timeline
536 : * that's active according to pg_control.
537 : */
538 1006 : if (ControlFile->minRecoveryPointTLI >
539 1006 : ControlFile->checkPointCopy.ThisTimeLineID)
540 2 : recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
541 : else
542 1004 : recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
543 :
544 : /*
545 : * Check for signal files, and if so set up state for offline recovery
546 : */
547 1006 : readRecoverySignalFile();
548 1006 : validateRecoveryParameters();
549 :
550 : /*
551 : * Take ownership of the wakeup latch if archive recovery was requested,
552 : * since we may then have to sleep during recovery.
553 : */
554 1006 : if (ArchiveRecoveryRequested)
555 118 : OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
556 :
557 : /*
558 : * Set the WAL reading processor now, as it will be needed when reading
559 : * the checkpoint record required (backup_label or not).
560 : */
561 1006 : private = palloc0_object(XLogPageReadPrivate);
562 1006 : xlogreader =
563 1006 : XLogReaderAllocate(wal_segment_size, NULL,
564 1006 : XL_ROUTINE(.page_read = &XLogPageRead,
565 : .segment_open = NULL,
566 : .segment_close = wal_segment_close),
567 : private);
568 1006 : if (!xlogreader)
569 0 : ereport(ERROR,
570 : (errcode(ERRCODE_OUT_OF_MEMORY),
571 : errmsg("out of memory"),
572 : errdetail("Failed while allocating a WAL reading processor.")));
573 1006 : xlogreader->system_identifier = ControlFile->system_identifier;
574 :
575 : /*
576 : * Set the WAL decode buffer size. This limits how far ahead we can read
577 : * in the WAL.
578 : */
579 1006 : XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size);
580 :
581 : /* Create a WAL prefetcher. */
582 1006 : xlogprefetcher = XLogPrefetcherAllocate(xlogreader);
583 :
584 : /*
585 : * Allocate two page buffers dedicated to WAL consistency checks. We do
586 : * it this way, rather than just making static arrays, for two reasons:
587 : * (1) no need to waste the storage in most instantiations of the backend;
588 : * (2) a static char array isn't guaranteed to have any particular
589 : * alignment, whereas palloc() will provide MAXALIGN'd storage.
590 : */
591 1006 : replay_image_masked = (char *) palloc(BLCKSZ);
592 1006 : primary_image_masked = (char *) palloc(BLCKSZ);
593 :
594 : /*
595 : * Read the backup_label file. We want to run this part of the recovery
596 : * process after checking for signal files and after performing validation
597 : * of the recovery parameters.
598 : */
599 1006 : if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
600 : &backupFromStandby))
601 : {
602 81 : List *tablespaces = NIL;
603 :
604 : /*
605 : * Archive recovery was requested, and thanks to the backup label
606 : * file, we know how far we need to replay to reach consistency. Enter
607 : * archive recovery directly.
608 : */
609 81 : InArchiveRecovery = true;
610 81 : if (StandbyModeRequested)
611 69 : EnableStandbyMode();
612 :
613 : /*
614 : * Omitting backup_label when creating a new replica, PITR node etc.
615 : * unfortunately is a common cause of corruption. Logging that
616 : * backup_label was used makes it a bit easier to exclude that as the
617 : * cause of observed corruption.
618 : *
619 : * Do so before we try to read the checkpoint record (which can fail),
620 : * as otherwise it can be hard to understand why a checkpoint other
621 : * than ControlFile->checkPoint is used.
622 : */
623 81 : ereport(LOG,
624 : errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
625 : LSN_FORMAT_ARGS(RedoStartLSN),
626 : LSN_FORMAT_ARGS(CheckPointLoc),
627 : CheckPointTLI));
628 :
629 : /*
630 : * When a backup_label file is present, we want to roll forward from
631 : * the checkpoint it identifies, rather than using pg_control.
632 : */
633 81 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
634 : CheckPointTLI);
635 81 : if (record != NULL)
636 : {
637 81 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
638 81 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
639 81 : ereport(DEBUG1,
640 : errmsg_internal("checkpoint record is at %X/%08X",
641 : LSN_FORMAT_ARGS(CheckPointLoc)));
642 81 : InRecovery = true; /* force recovery even if SHUTDOWNED */
643 :
644 : /*
645 : * Make sure that REDO location exists. This may not be the case
646 : * if there was a crash during an online backup, which left a
647 : * backup_label around that references a WAL segment that's
648 : * already been archived.
649 : */
650 81 : if (checkPoint.redo < CheckPointLoc)
651 : {
652 81 : XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
653 81 : if (!ReadRecord(xlogprefetcher, LOG, false,
654 : checkPoint.ThisTimeLineID))
655 0 : ereport(FATAL,
656 : errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
657 : LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
658 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
659 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
660 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
661 : DataDir, DataDir, DataDir, DataDir));
662 : }
663 : }
664 : else
665 : {
666 0 : ereport(FATAL,
667 : errmsg("could not locate required checkpoint record at %X/%08X",
668 : LSN_FORMAT_ARGS(CheckPointLoc)),
669 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
670 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
671 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
672 : DataDir, DataDir, DataDir, DataDir));
673 : wasShutdown = false; /* keep compiler quiet */
674 : }
675 :
676 : /* Read the tablespace_map file if present and create symlinks. */
677 81 : if (read_tablespace_map(&tablespaces))
678 : {
679 : ListCell *lc;
680 :
681 4 : foreach(lc, tablespaces)
682 : {
683 2 : tablespaceinfo *ti = lfirst(lc);
684 : char *linkloc;
685 :
686 2 : linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
687 :
688 : /*
689 : * Remove the existing symlink, if any, and create the symlink
690 : * under PGDATA.
691 : */
692 2 : remove_tablespace_symlink(linkloc);
693 :
694 2 : if (symlink(ti->path, linkloc) < 0)
695 0 : ereport(ERROR,
696 : (errcode_for_file_access(),
697 : errmsg("could not create symbolic link \"%s\": %m",
698 : linkloc)));
699 :
700 2 : pfree(ti->path);
701 2 : pfree(ti);
702 : }
703 :
704 : /* tell the caller to delete it later */
705 2 : haveTblspcMap = true;
706 : }
707 :
708 : /* tell the caller to delete it later */
709 81 : haveBackupLabel = true;
710 : }
711 : else
712 : {
713 : /* No backup_label file has been found if we are here. */
714 :
715 : /*
716 : * If tablespace_map file is present without backup_label file, there
717 : * is no use of such file. There is no harm in retaining it, but it
718 : * is better to get rid of the map file so that we don't have any
719 : * redundant file in data directory and it will avoid any sort of
720 : * confusion. It seems prudent though to just rename the file out of
721 : * the way rather than delete it completely, also we ignore any error
722 : * that occurs in rename operation as even if map file is present
723 : * without backup_label file, it is harmless.
724 : */
725 925 : if (stat(TABLESPACE_MAP, &st) == 0)
726 : {
727 1 : unlink(TABLESPACE_MAP_OLD);
728 1 : if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
729 1 : ereport(LOG,
730 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
731 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
732 : errdetail("File \"%s\" was renamed to \"%s\".",
733 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
734 : else
735 0 : ereport(LOG,
736 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
737 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
738 : errdetail("Could not rename file \"%s\" to \"%s\": %m.",
739 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
740 : }
741 :
742 : /*
743 : * It's possible that archive recovery was requested, but we don't
744 : * know how far we need to replay the WAL before we reach consistency.
745 : * This can happen for example if a base backup is taken from a
746 : * running server using an atomic filesystem snapshot, without calling
747 : * pg_backup_start/stop. Or if you just kill a running primary server
748 : * and put it into archive recovery by creating a recovery signal
749 : * file.
750 : *
751 : * Our strategy in that case is to perform crash recovery first,
752 : * replaying all the WAL present in pg_wal, and only enter archive
753 : * recovery after that.
754 : *
755 : * But usually we already know how far we need to replay the WAL (up
756 : * to minRecoveryPoint, up to backupEndPoint, or until we see an
757 : * end-of-backup record), and we can enter archive recovery directly.
758 : */
759 925 : if (ArchiveRecoveryRequested &&
760 44 : (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) ||
761 9 : ControlFile->backupEndRequired ||
762 9 : XLogRecPtrIsValid(ControlFile->backupEndPoint) ||
763 9 : ControlFile->state == DB_SHUTDOWNED))
764 : {
765 42 : InArchiveRecovery = true;
766 42 : if (StandbyModeRequested)
767 42 : EnableStandbyMode();
768 : }
769 :
770 : /*
771 : * For the same reason as when starting up with backup_label present,
772 : * emit a log message when we continue initializing from a base
773 : * backup.
774 : */
775 925 : if (XLogRecPtrIsValid(ControlFile->backupStartPoint))
776 0 : ereport(LOG,
777 : errmsg("restarting backup recovery with redo LSN %X/%08X",
778 : LSN_FORMAT_ARGS(ControlFile->backupStartPoint)));
779 :
780 : /* Get the last valid checkpoint record. */
781 925 : CheckPointLoc = ControlFile->checkPoint;
782 925 : CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
783 925 : RedoStartLSN = ControlFile->checkPointCopy.redo;
784 925 : RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
785 925 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
786 : CheckPointTLI);
787 925 : if (record != NULL)
788 : {
789 925 : ereport(DEBUG1,
790 : errmsg_internal("checkpoint record is at %X/%08X",
791 : LSN_FORMAT_ARGS(CheckPointLoc)));
792 : }
793 : else
794 : {
795 : /*
796 : * We used to attempt to go back to a secondary checkpoint record
797 : * here, but only when not in standby mode. We now just fail if we
798 : * can't read the last checkpoint because this allows us to
799 : * simplify processing around checkpoints.
800 : */
801 0 : ereport(PANIC,
802 : errmsg("could not locate a valid checkpoint record at %X/%08X",
803 : LSN_FORMAT_ARGS(CheckPointLoc)));
804 : }
805 925 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
806 925 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
807 :
808 : /* Make sure that REDO location exists. */
809 925 : if (checkPoint.redo < CheckPointLoc)
810 : {
811 44 : XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
812 44 : if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID))
813 1 : ereport(FATAL,
814 : errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
815 : LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)));
816 : }
817 : }
818 :
819 1005 : if (ArchiveRecoveryRequested)
820 : {
821 118 : if (StandbyModeRequested)
822 113 : ereport(LOG,
823 : (errmsg("entering standby mode")));
824 5 : else if (recoveryTarget == RECOVERY_TARGET_XID)
825 0 : ereport(LOG,
826 : (errmsg("starting point-in-time recovery to XID %u",
827 : recoveryTargetXid)));
828 5 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
829 0 : ereport(LOG,
830 : (errmsg("starting point-in-time recovery to %s",
831 : timestamptz_to_str(recoveryTargetTime))));
832 5 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
833 3 : ereport(LOG,
834 : (errmsg("starting point-in-time recovery to \"%s\"",
835 : recoveryTargetName)));
836 2 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
837 0 : ereport(LOG,
838 : errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
839 : LSN_FORMAT_ARGS(recoveryTargetLSN)));
840 2 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
841 0 : ereport(LOG,
842 : (errmsg("starting point-in-time recovery to earliest consistent point")));
843 : else
844 2 : ereport(LOG,
845 : (errmsg("starting archive recovery")));
846 : }
847 :
848 : /*
849 : * If the location of the checkpoint record is not on the expected
850 : * timeline in the history of the requested timeline, we cannot proceed:
851 : * the backup is not part of the history of the requested timeline.
852 : */
853 : Assert(expectedTLEs); /* was initialized by reading checkpoint
854 : * record */
855 1005 : if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
856 : CheckPointTLI)
857 : {
858 : XLogRecPtr switchpoint;
859 :
860 : /*
861 : * tliSwitchPoint will throw an error if the checkpoint's timeline is
862 : * not in expectedTLEs at all.
863 : */
864 0 : switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
865 0 : ereport(FATAL,
866 : (errmsg("requested timeline %u is not a child of this server's history",
867 : recoveryTargetTLI),
868 : /* translator: %s is a backup_label file or a pg_control file */
869 : errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
870 : haveBackupLabel ? "backup_label" : "pg_control",
871 : LSN_FORMAT_ARGS(CheckPointLoc),
872 : CheckPointTLI,
873 : LSN_FORMAT_ARGS(switchpoint))));
874 : }
875 :
876 : /*
877 : * The min recovery point should be part of the requested timeline's
878 : * history, too.
879 : */
880 1005 : if (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) &&
881 42 : tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
882 42 : ControlFile->minRecoveryPointTLI)
883 0 : ereport(FATAL,
884 : errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
885 : recoveryTargetTLI,
886 : LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
887 : ControlFile->minRecoveryPointTLI));
888 :
889 1005 : ereport(DEBUG1,
890 : errmsg_internal("redo record is at %X/%08X; shutdown %s",
891 : LSN_FORMAT_ARGS(checkPoint.redo),
892 : wasShutdown ? "true" : "false"));
893 1005 : ereport(DEBUG1,
894 : (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
895 : U64FromFullTransactionId(checkPoint.nextXid),
896 : checkPoint.nextOid)));
897 1005 : ereport(DEBUG1,
898 : (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %" PRIu64,
899 : checkPoint.nextMulti, checkPoint.nextMultiOffset)));
900 1005 : ereport(DEBUG1,
901 : (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
902 : checkPoint.oldestXid, checkPoint.oldestXidDB)));
903 1005 : ereport(DEBUG1,
904 : (errmsg_internal("oldest MultiXactId: %u, in database %u",
905 : checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
906 1005 : ereport(DEBUG1,
907 : (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
908 : checkPoint.oldestCommitTsXid,
909 : checkPoint.newestCommitTsXid)));
910 1005 : if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
911 0 : ereport(PANIC,
912 : (errmsg("invalid next transaction ID")));
913 :
914 : /* sanity check */
915 1005 : if (checkPoint.redo > CheckPointLoc)
916 0 : ereport(PANIC,
917 : (errmsg("invalid redo in checkpoint record")));
918 :
919 : /*
920 : * Check whether we need to force recovery from WAL. If it appears to
921 : * have been a clean shutdown and we did not have a recovery signal file,
922 : * then assume no recovery needed.
923 : */
924 1005 : if (checkPoint.redo < CheckPointLoc)
925 : {
926 124 : if (wasShutdown)
927 0 : ereport(PANIC,
928 : (errmsg("invalid redo record in shutdown checkpoint")));
929 124 : InRecovery = true;
930 : }
931 881 : else if (ControlFile->state != DB_SHUTDOWNED)
932 95 : InRecovery = true;
933 786 : else if (ArchiveRecoveryRequested)
934 : {
935 : /* force recovery due to presence of recovery signal file */
936 7 : InRecovery = true;
937 : }
938 :
939 : /*
940 : * If recovery is needed, update our in-memory copy of pg_control to show
941 : * that we are recovering and to show the selected checkpoint as the place
942 : * we are starting from. We also mark pg_control with any minimum recovery
943 : * stop point obtained from a backup history file.
944 : *
945 : * We don't write the changes to disk yet, though. Only do that after
946 : * initializing various subsystems.
947 : */
948 1005 : if (InRecovery)
949 : {
950 226 : if (InArchiveRecovery)
951 : {
952 123 : ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
953 : }
954 : else
955 : {
956 103 : ereport(LOG,
957 : (errmsg("database system was not properly shut down; "
958 : "automatic recovery in progress")));
959 103 : if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
960 2 : ereport(LOG,
961 : (errmsg("crash recovery starts in timeline %u "
962 : "and has target timeline %u",
963 : ControlFile->checkPointCopy.ThisTimeLineID,
964 : recoveryTargetTLI)));
965 103 : ControlFile->state = DB_IN_CRASH_RECOVERY;
966 : }
967 226 : ControlFile->checkPoint = CheckPointLoc;
968 226 : ControlFile->checkPointCopy = checkPoint;
969 226 : if (InArchiveRecovery)
970 : {
971 : /* initialize minRecoveryPoint if not set yet */
972 123 : if (ControlFile->minRecoveryPoint < checkPoint.redo)
973 : {
974 83 : ControlFile->minRecoveryPoint = checkPoint.redo;
975 83 : ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
976 : }
977 : }
978 :
979 : /*
980 : * Set backupStartPoint if we're starting recovery from a base backup.
981 : *
982 : * Also set backupEndPoint and use minRecoveryPoint as the backup end
983 : * location if we're starting recovery from a base backup which was
984 : * taken from a standby. In this case, the database system status in
985 : * pg_control must indicate that the database was already in recovery.
986 : * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
987 : * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
988 : * before reaching this point; e.g. because restore_command or
989 : * primary_conninfo were faulty.
990 : *
991 : * Any other state indicates that the backup somehow became corrupted
992 : * and we can't sensibly continue with recovery.
993 : */
994 226 : if (haveBackupLabel)
995 : {
996 81 : ControlFile->backupStartPoint = checkPoint.redo;
997 81 : ControlFile->backupEndRequired = backupEndRequired;
998 :
999 81 : if (backupFromStandby)
1000 : {
1001 5 : if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
1002 : dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
1003 0 : ereport(FATAL,
1004 : (errmsg("backup_label contains data inconsistent with control file"),
1005 : errhint("This means that the backup is corrupted and you will "
1006 : "have to use another backup for recovery.")));
1007 5 : ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
1008 : }
1009 : }
1010 : }
1011 :
1012 : /* remember these, so that we know when we have reached consistency */
1013 1005 : backupStartPoint = ControlFile->backupStartPoint;
1014 1005 : backupEndRequired = ControlFile->backupEndRequired;
1015 1005 : backupEndPoint = ControlFile->backupEndPoint;
1016 1005 : if (InArchiveRecovery)
1017 : {
1018 123 : minRecoveryPoint = ControlFile->minRecoveryPoint;
1019 123 : minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
1020 : }
1021 : else
1022 : {
1023 882 : minRecoveryPoint = InvalidXLogRecPtr;
1024 882 : minRecoveryPointTLI = 0;
1025 : }
1026 :
1027 : /*
1028 : * Start recovery assuming that the final record isn't lost.
1029 : */
1030 1005 : abortedRecPtr = InvalidXLogRecPtr;
1031 1005 : missingContrecPtr = InvalidXLogRecPtr;
1032 :
1033 1005 : *wasShutdown_ptr = wasShutdown;
1034 1005 : *haveBackupLabel_ptr = haveBackupLabel;
1035 1005 : *haveTblspcMap_ptr = haveTblspcMap;
1036 1005 : }
1037 :
1038 : /*
1039 : * See if there are any recovery signal files and if so, set state for
1040 : * recovery.
1041 : *
1042 : * See if there is a recovery command file (recovery.conf), and if so
1043 : * throw an ERROR since as of PG12 we no longer recognize that.
1044 : */
1045 : static void
1046 1006 : readRecoverySignalFile(void)
1047 : {
1048 : struct stat stat_buf;
1049 :
1050 1006 : if (IsBootstrapProcessingMode())
1051 888 : return;
1052 :
1053 : /*
1054 : * Check for old recovery API file: recovery.conf
1055 : */
1056 955 : if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1057 0 : ereport(FATAL,
1058 : (errcode_for_file_access(),
1059 : errmsg("using recovery command file \"%s\" is not supported",
1060 : RECOVERY_COMMAND_FILE)));
1061 :
1062 : /*
1063 : * Remove unused .done file, if present. Ignore if absent.
1064 : */
1065 955 : unlink(RECOVERY_COMMAND_DONE);
1066 :
1067 : /*
1068 : * Check for recovery signal files and if found, fsync them since they
1069 : * represent server state information. We don't sweat too much about the
1070 : * possibility of fsync failure, however.
1071 : */
1072 955 : if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1073 : {
1074 : int fd;
1075 :
1076 113 : fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1077 : S_IRUSR | S_IWUSR);
1078 113 : if (fd >= 0)
1079 : {
1080 113 : (void) pg_fsync(fd);
1081 113 : close(fd);
1082 : }
1083 113 : standby_signal_file_found = true;
1084 : }
1085 :
1086 955 : if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1087 : {
1088 : int fd;
1089 :
1090 6 : fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1091 : S_IRUSR | S_IWUSR);
1092 6 : if (fd >= 0)
1093 : {
1094 6 : (void) pg_fsync(fd);
1095 6 : close(fd);
1096 : }
1097 6 : recovery_signal_file_found = true;
1098 : }
1099 :
1100 : /*
1101 : * If both signal files are present, standby signal file takes precedence.
1102 : * If neither is present then we won't enter archive recovery.
1103 : */
1104 955 : StandbyModeRequested = false;
1105 955 : ArchiveRecoveryRequested = false;
1106 955 : if (standby_signal_file_found)
1107 : {
1108 113 : StandbyModeRequested = true;
1109 113 : ArchiveRecoveryRequested = true;
1110 : }
1111 842 : else if (recovery_signal_file_found)
1112 : {
1113 5 : StandbyModeRequested = false;
1114 5 : ArchiveRecoveryRequested = true;
1115 : }
1116 : else
1117 837 : return;
1118 :
1119 : /*
1120 : * We don't support standby mode in standalone backends; that requires
1121 : * other processes such as the WAL receiver to be alive.
1122 : */
1123 118 : if (StandbyModeRequested && !IsUnderPostmaster)
1124 0 : ereport(FATAL,
1125 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1126 : errmsg("standby mode is not supported by single-user servers")));
1127 : }
1128 :
1129 : static void
1130 1006 : validateRecoveryParameters(void)
1131 : {
1132 1006 : if (!ArchiveRecoveryRequested)
1133 888 : return;
1134 :
1135 : /*
1136 : * Check for compulsory parameters
1137 : */
1138 118 : if (StandbyModeRequested)
1139 : {
1140 113 : if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1141 12 : (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1142 2 : ereport(WARNING,
1143 : (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1144 : errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1145 : }
1146 : else
1147 : {
1148 5 : if (recoveryRestoreCommand == NULL ||
1149 5 : strcmp(recoveryRestoreCommand, "") == 0)
1150 0 : ereport(FATAL,
1151 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1152 : errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1153 : }
1154 :
1155 : /*
1156 : * Override any inconsistent requests. Note that this is a change of
1157 : * behaviour in 9.5; prior to this we simply ignored a request to pause if
1158 : * hot_standby = off, which was surprising behaviour.
1159 : */
1160 118 : if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
1161 111 : !EnableHotStandby)
1162 3 : recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
1163 :
1164 : /*
1165 : * Final parsing of recovery_target_time string; see also
1166 : * check_recovery_target_time().
1167 : */
1168 118 : if (recoveryTarget == RECOVERY_TARGET_TIME)
1169 : {
1170 0 : recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
1171 : CStringGetDatum(recovery_target_time_string),
1172 : ObjectIdGetDatum(InvalidOid),
1173 : Int32GetDatum(-1)));
1174 : }
1175 :
1176 : /*
1177 : * If user specified recovery_target_timeline, validate it or compute the
1178 : * "latest" value. We can't do this until after we've gotten the restore
1179 : * command and set InArchiveRecovery, because we need to fetch timeline
1180 : * history files from the archive.
1181 : */
1182 118 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
1183 : {
1184 0 : TimeLineID rtli = recoveryTargetTLIRequested;
1185 :
1186 : /* Timeline 1 does not have a history file, all else should */
1187 0 : if (rtli != 1 && !existsTimeLineHistory(rtli))
1188 0 : ereport(FATAL,
1189 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1190 : errmsg("recovery target timeline %u does not exist",
1191 : rtli)));
1192 0 : recoveryTargetTLI = rtli;
1193 : }
1194 118 : else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
1195 : {
1196 : /* We start the "latest" search from pg_control's timeline */
1197 118 : recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
1198 : }
1199 : else
1200 : {
1201 : /*
1202 : * else we just use the recoveryTargetTLI as already read from
1203 : * ControlFile
1204 : */
1205 : Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
1206 : }
1207 : }
1208 :
1209 : /*
1210 : * read_backup_label: check to see if a backup_label file is present
1211 : *
1212 : * If we see a backup_label during recovery, we assume that we are recovering
1213 : * from a backup dump file, and we therefore roll forward from the checkpoint
1214 : * identified by the label file, NOT what pg_control says. This avoids the
1215 : * problem that pg_control might have been archived one or more checkpoints
1216 : * later than the start of the dump, and so if we rely on it as the start
1217 : * point, we will fail to restore a consistent database state.
1218 : *
1219 : * Returns true if a backup_label was found (and fills the checkpoint
1220 : * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1221 : * returns false if not. If this backup_label came from a streamed backup,
1222 : * *backupEndRequired is set to true. If this backup_label was created during
1223 : * recovery, *backupFromStandby is set to true.
1224 : *
1225 : * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1226 : * and TLI read from the backup file.
1227 : */
1228 : static bool
1229 1006 : read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1230 : bool *backupEndRequired, bool *backupFromStandby)
1231 : {
1232 : char startxlogfilename[MAXFNAMELEN];
1233 : TimeLineID tli_from_walseg,
1234 : tli_from_file;
1235 : FILE *lfp;
1236 : char ch;
1237 : char backuptype[20];
1238 : char backupfrom[20];
1239 : char backuplabel[MAXPGPATH];
1240 : char backuptime[128];
1241 : uint32 hi,
1242 : lo;
1243 :
1244 : /* suppress possible uninitialized-variable warnings */
1245 1006 : *checkPointLoc = InvalidXLogRecPtr;
1246 1006 : *backupLabelTLI = 0;
1247 1006 : *backupEndRequired = false;
1248 1006 : *backupFromStandby = false;
1249 :
1250 : /*
1251 : * See if label file is present
1252 : */
1253 1006 : lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1254 1006 : if (!lfp)
1255 : {
1256 925 : if (errno != ENOENT)
1257 0 : ereport(FATAL,
1258 : (errcode_for_file_access(),
1259 : errmsg("could not read file \"%s\": %m",
1260 : BACKUP_LABEL_FILE)));
1261 925 : return false; /* it's not there, all is fine */
1262 : }
1263 :
1264 : /*
1265 : * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1266 : * is pretty crude, but we are not expecting any variability in the file
1267 : * format).
1268 : */
1269 81 : if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
1270 81 : &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1271 0 : ereport(FATAL,
1272 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1273 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1274 81 : RedoStartLSN = ((uint64) hi) << 32 | lo;
1275 81 : RedoStartTLI = tli_from_walseg;
1276 81 : if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
1277 81 : &hi, &lo, &ch) != 3 || ch != '\n')
1278 0 : ereport(FATAL,
1279 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1280 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1281 81 : *checkPointLoc = ((uint64) hi) << 32 | lo;
1282 81 : *backupLabelTLI = tli_from_walseg;
1283 :
1284 : /*
1285 : * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1286 : * which could mean either pg_basebackup or the pg_backup_start/stop
1287 : * method was used) or if this label came from somewhere else (the only
1288 : * other option today being from pg_rewind). If this was a streamed
1289 : * backup then we know that we need to play through until we get to the
1290 : * end of the WAL which was generated during the backup (at which point we
1291 : * will have reached consistency and backupEndRequired will be reset to be
1292 : * false).
1293 : */
1294 81 : if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1295 : {
1296 81 : if (strcmp(backuptype, "streamed") == 0)
1297 80 : *backupEndRequired = true;
1298 : }
1299 :
1300 : /*
1301 : * BACKUP FROM lets us know if this was from a primary or a standby. If
1302 : * it was from a standby, we'll double-check that the control file state
1303 : * matches that of a standby.
1304 : */
1305 81 : if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1306 : {
1307 81 : if (strcmp(backupfrom, "standby") == 0)
1308 5 : *backupFromStandby = true;
1309 : }
1310 :
1311 : /*
1312 : * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1313 : * but checking for their presence is useful for debugging and the next
1314 : * sanity checks. Cope also with the fact that the result buffers have a
1315 : * pre-allocated size, hence if the backup_label file has been generated
1316 : * with strings longer than the maximum assumed here an incorrect parsing
1317 : * happens. That's fine as only minor consistency checks are done
1318 : * afterwards.
1319 : */
1320 81 : if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1321 81 : ereport(DEBUG1,
1322 : (errmsg_internal("backup time %s in file \"%s\"",
1323 : backuptime, BACKUP_LABEL_FILE)));
1324 :
1325 81 : if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1326 80 : ereport(DEBUG1,
1327 : (errmsg_internal("backup label %s in file \"%s\"",
1328 : backuplabel, BACKUP_LABEL_FILE)));
1329 :
1330 : /*
1331 : * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1332 : * it as a sanity check if present.
1333 : */
1334 81 : if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1335 : {
1336 80 : if (tli_from_walseg != tli_from_file)
1337 0 : ereport(FATAL,
1338 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1339 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1340 : errdetail("Timeline ID parsed is %u, but expected %u.",
1341 : tli_from_file, tli_from_walseg)));
1342 :
1343 80 : ereport(DEBUG1,
1344 : (errmsg_internal("backup timeline %u in file \"%s\"",
1345 : tli_from_file, BACKUP_LABEL_FILE)));
1346 : }
1347 :
1348 81 : if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
1349 0 : ereport(FATAL,
1350 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1351 : errmsg("this is an incremental backup, not a data directory"),
1352 : errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1353 :
1354 81 : if (ferror(lfp) || FreeFile(lfp))
1355 0 : ereport(FATAL,
1356 : (errcode_for_file_access(),
1357 : errmsg("could not read file \"%s\": %m",
1358 : BACKUP_LABEL_FILE)));
1359 :
1360 81 : return true;
1361 : }
1362 :
1363 : /*
1364 : * read_tablespace_map: check to see if a tablespace_map file is present
1365 : *
1366 : * If we see a tablespace_map file during recovery, we assume that we are
1367 : * recovering from a backup dump file, and we therefore need to create symlinks
1368 : * as per the information present in tablespace_map file.
1369 : *
1370 : * Returns true if a tablespace_map file was found (and fills *tablespaces
1371 : * with a tablespaceinfo struct for each tablespace listed in the file);
1372 : * returns false if not.
1373 : */
1374 : static bool
1375 81 : read_tablespace_map(List **tablespaces)
1376 : {
1377 : tablespaceinfo *ti;
1378 : FILE *lfp;
1379 : char str[MAXPGPATH];
1380 : int ch,
1381 : i,
1382 : n;
1383 : bool was_backslash;
1384 :
1385 : /*
1386 : * See if tablespace_map file is present
1387 : */
1388 81 : lfp = AllocateFile(TABLESPACE_MAP, "r");
1389 81 : if (!lfp)
1390 : {
1391 79 : if (errno != ENOENT)
1392 0 : ereport(FATAL,
1393 : (errcode_for_file_access(),
1394 : errmsg("could not read file \"%s\": %m",
1395 : TABLESPACE_MAP)));
1396 79 : return false; /* it's not there, all is fine */
1397 : }
1398 :
1399 : /*
1400 : * Read and parse the link name and path lines from tablespace_map file
1401 : * (this code is pretty crude, but we are not expecting any variability in
1402 : * the file format). De-escape any backslashes that were inserted.
1403 : */
1404 2 : i = 0;
1405 2 : was_backslash = false;
1406 77 : while ((ch = fgetc(lfp)) != EOF)
1407 : {
1408 75 : if (!was_backslash && (ch == '\n' || ch == '\r'))
1409 2 : {
1410 : char *endp;
1411 :
1412 2 : if (i == 0)
1413 0 : continue; /* \r immediately followed by \n */
1414 :
1415 : /*
1416 : * The de-escaped line should contain an OID followed by exactly
1417 : * one space followed by a path. The path might start with
1418 : * spaces, so don't be too liberal about parsing.
1419 : */
1420 2 : str[i] = '\0';
1421 2 : n = 0;
1422 12 : while (str[n] && str[n] != ' ')
1423 10 : n++;
1424 2 : if (n < 1 || n >= i - 1)
1425 0 : ereport(FATAL,
1426 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1427 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1428 2 : str[n++] = '\0';
1429 :
1430 2 : ti = palloc0_object(tablespaceinfo);
1431 2 : errno = 0;
1432 2 : ti->oid = strtoul(str, &endp, 10);
1433 2 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1434 0 : ereport(FATAL,
1435 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1436 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1437 2 : ti->path = pstrdup(str + n);
1438 2 : *tablespaces = lappend(*tablespaces, ti);
1439 :
1440 2 : i = 0;
1441 2 : continue;
1442 : }
1443 73 : else if (!was_backslash && ch == '\\')
1444 0 : was_backslash = true;
1445 : else
1446 : {
1447 73 : if (i < sizeof(str) - 1)
1448 73 : str[i++] = ch;
1449 73 : was_backslash = false;
1450 : }
1451 : }
1452 :
1453 2 : if (i != 0 || was_backslash) /* last line not terminated? */
1454 0 : ereport(FATAL,
1455 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1456 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1457 :
1458 2 : if (ferror(lfp) || FreeFile(lfp))
1459 0 : ereport(FATAL,
1460 : (errcode_for_file_access(),
1461 : errmsg("could not read file \"%s\": %m",
1462 : TABLESPACE_MAP)));
1463 :
1464 2 : return true;
1465 : }
1466 :
1467 : /*
1468 : * Finish WAL recovery.
1469 : *
1470 : * This does not close the 'xlogreader' yet, because in some cases the caller
1471 : * still wants to re-read the last checkpoint record by calling
1472 : * ReadCheckpointRecord().
1473 : *
1474 : * Returns the position of the last valid or applied record, after which new
1475 : * WAL should be appended, information about why recovery was ended, and some
1476 : * other things. See the EndOfWalRecoveryInfo struct for details.
1477 : */
1478 : EndOfWalRecoveryInfo *
1479 941 : FinishWalRecovery(void)
1480 : {
1481 941 : EndOfWalRecoveryInfo *result = palloc_object(EndOfWalRecoveryInfo);
1482 : XLogRecPtr lastRec;
1483 : TimeLineID lastRecTLI;
1484 : XLogRecPtr endOfLog;
1485 :
1486 : /*
1487 : * Kill WAL receiver, if it's still running, before we continue to write
1488 : * the startup checkpoint and aborted-contrecord records. It will trump
1489 : * over these records and subsequent ones if it's still alive when we
1490 : * start writing WAL.
1491 : */
1492 941 : XLogShutdownWalRcv();
1493 :
1494 : /*
1495 : * Shutdown the slot sync worker to drop any temporary slots acquired by
1496 : * it and to prevent it from keep trying to fetch the failover slots.
1497 : *
1498 : * We do not update the 'synced' column in 'pg_replication_slots' system
1499 : * view from true to false here, as any failed update could leave 'synced'
1500 : * column false for some slots. This could cause issues during slot sync
1501 : * after restarting the server as a standby. While updating the 'synced'
1502 : * column after switching to the new timeline is an option, it does not
1503 : * simplify the handling for the 'synced' column. Therefore, we retain the
1504 : * 'synced' column as true after promotion as it may provide useful
1505 : * information about the slot origin.
1506 : */
1507 941 : ShutDownSlotSync();
1508 :
1509 : /*
1510 : * We are now done reading the xlog from stream. Turn off streaming
1511 : * recovery to force fetching the files (which would be required at end of
1512 : * recovery, e.g., timeline history file) from archive or pg_wal.
1513 : *
1514 : * Note that standby mode must be turned off after killing WAL receiver,
1515 : * i.e., calling XLogShutdownWalRcv().
1516 : */
1517 : Assert(!WalRcvStreaming());
1518 941 : StandbyMode = false;
1519 :
1520 : /*
1521 : * Determine where to start writing WAL next.
1522 : *
1523 : * Re-fetch the last valid or last applied record, so we can identify the
1524 : * exact endpoint of what we consider the valid portion of WAL. There may
1525 : * be an incomplete continuation record after that, in which case
1526 : * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1527 : * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1528 : * it is intentionally missing. See CreateOverwriteContrecordRecord().
1529 : *
1530 : * An important side-effect of this is to load the last page into
1531 : * xlogreader. The caller uses it to initialize the WAL for writing.
1532 : */
1533 941 : if (!InRecovery)
1534 : {
1535 778 : lastRec = CheckPointLoc;
1536 778 : lastRecTLI = CheckPointTLI;
1537 : }
1538 : else
1539 : {
1540 163 : lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
1541 163 : lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1542 : }
1543 941 : XLogPrefetcherBeginRead(xlogprefetcher, lastRec);
1544 941 : (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1545 941 : endOfLog = xlogreader->EndRecPtr;
1546 :
1547 : /*
1548 : * Remember the TLI in the filename of the XLOG segment containing the
1549 : * end-of-log. It could be different from the timeline that endOfLog
1550 : * nominally belongs to, if there was a timeline switch in that segment,
1551 : * and we were reading the old WAL from a segment belonging to a higher
1552 : * timeline.
1553 : */
1554 941 : result->endOfLogTLI = xlogreader->seg.ws_tli;
1555 :
1556 941 : if (ArchiveRecoveryRequested)
1557 : {
1558 : /*
1559 : * We are no longer in archive recovery state.
1560 : *
1561 : * We are now done reading the old WAL. Turn off archive fetching if
1562 : * it was active.
1563 : */
1564 : Assert(InArchiveRecovery);
1565 55 : InArchiveRecovery = false;
1566 :
1567 : /*
1568 : * If the ending log segment is still open, close it (to avoid
1569 : * problems on Windows with trying to rename or delete an open file).
1570 : */
1571 55 : if (readFile >= 0)
1572 : {
1573 55 : close(readFile);
1574 55 : readFile = -1;
1575 : }
1576 : }
1577 :
1578 : /*
1579 : * Copy the last partial block to the caller, for initializing the WAL
1580 : * buffer for appending new WAL.
1581 : */
1582 941 : if (endOfLog % XLOG_BLCKSZ != 0)
1583 : {
1584 : char *page;
1585 : int len;
1586 : XLogRecPtr pageBeginPtr;
1587 :
1588 920 : pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1589 : Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1590 :
1591 : /* Copy the valid part of the last block */
1592 920 : len = endOfLog % XLOG_BLCKSZ;
1593 920 : page = palloc(len);
1594 920 : memcpy(page, xlogreader->readBuf, len);
1595 :
1596 920 : result->lastPageBeginPtr = pageBeginPtr;
1597 920 : result->lastPage = page;
1598 : }
1599 : else
1600 : {
1601 : /* There is no partial block to copy. */
1602 21 : result->lastPageBeginPtr = endOfLog;
1603 21 : result->lastPage = NULL;
1604 : }
1605 :
1606 : /*
1607 : * Create a comment for the history file to explain why and where timeline
1608 : * changed.
1609 : */
1610 941 : result->recoveryStopReason = getRecoveryStopReason();
1611 :
1612 941 : result->lastRec = lastRec;
1613 941 : result->lastRecTLI = lastRecTLI;
1614 941 : result->endOfLog = endOfLog;
1615 :
1616 941 : result->abortedRecPtr = abortedRecPtr;
1617 941 : result->missingContrecPtr = missingContrecPtr;
1618 :
1619 941 : result->standby_signal_file_found = standby_signal_file_found;
1620 941 : result->recovery_signal_file_found = recovery_signal_file_found;
1621 :
1622 941 : return result;
1623 : }
1624 :
1625 : /*
1626 : * Clean up the WAL reader and leftovers from restoring WAL from archive
1627 : */
1628 : void
1629 941 : ShutdownWalRecovery(void)
1630 : {
1631 : char recoveryPath[MAXPGPATH];
1632 :
1633 : /* Final update of pg_stat_recovery_prefetch. */
1634 941 : XLogPrefetcherComputeStats(xlogprefetcher);
1635 :
1636 : /* Shut down xlogreader */
1637 941 : if (readFile >= 0)
1638 : {
1639 886 : close(readFile);
1640 886 : readFile = -1;
1641 : }
1642 941 : pfree(xlogreader->private_data);
1643 941 : XLogReaderFree(xlogreader);
1644 941 : XLogPrefetcherFree(xlogprefetcher);
1645 :
1646 941 : if (ArchiveRecoveryRequested)
1647 : {
1648 : /*
1649 : * Since there might be a partial WAL segment named RECOVERYXLOG, get
1650 : * rid of it.
1651 : */
1652 55 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1653 55 : unlink(recoveryPath); /* ignore any error */
1654 :
1655 : /* Get rid of any remaining recovered timeline-history file, too */
1656 55 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1657 55 : unlink(recoveryPath); /* ignore any error */
1658 : }
1659 :
1660 : /*
1661 : * We don't need the latch anymore. It's not strictly necessary to disown
1662 : * it, but let's do it for the sake of tidiness.
1663 : */
1664 941 : if (ArchiveRecoveryRequested)
1665 55 : DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
1666 941 : }
1667 :
1668 : /*
1669 : * Perform WAL recovery.
1670 : *
1671 : * If the system was shut down cleanly, this is never called.
 *
 * Outline of what the code below does, in order:
 *  1. Seed the shared replay-progress fields of XLogRecoveryCtl (under
 *     info_lck) and give XLogReceiptTime a sane value.
 *  2. Send PMSIGNAL_RECOVERY_STARTED when running under the postmaster,
 *     and run an initial CheckRecoveryConsistency().
 *  3. Read the first record to replay: either the record at RedoStartLSN
 *     (which must be an XLOG_CHECKPOINT_REDO record, else FATAL), or the
 *     record following the checkpoint record.
 *  4. If a record was found, replay records one at a time through
 *     ApplyWalRecord() until ReadRecord() returns NULL or
 *     recoveryStopsBefore()/recoveryStopsAfter() reports that the recovery
 *     target was reached.
 *  5. If the target was reached, act on recoveryTargetAction.
 *
 * Reports FATAL at the end if archive recovery was requested with a
 * recovery target set, but replay finished without reaching that target.
1672 : */
1673 : void
1674 225 : PerformWalRecovery(void)
1675 : {
1676 : XLogRecord *record;
1677 225 : bool reachedRecoveryTarget = false;
1678 : TimeLineID replayTLI;
1679 :
1680 : /*
1681 : * Initialize shared variables for tracking progress of WAL replay, as if
1682 : * we had just replayed the record before the REDO location (or the
1683 : * checkpoint record itself, if it's a shutdown checkpoint).
1684 : */
1685 225 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1686 225 : if (RedoStartLSN < CheckPointLoc)
1687 : {
1688 123 : XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
1689 123 : XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
1690 123 : XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
1691 : }
1692 : else
1693 : {
1694 102 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
1695 102 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
1696 102 : XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
1697 : }
1698 225 : XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
1699 225 : XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
1700 225 : XLogRecoveryCtl->recoveryLastXTime = 0;
1701 225 : XLogRecoveryCtl->currentChunkStartTime = 0;
1702 225 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
1703 225 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1704 :
1705 : /* Also ensure XLogReceiptTime has a sane value */
1706 225 : XLogReceiptTime = GetCurrentTimestamp();
1707 :
1708 : /*
1709 : * Let postmaster know we've started redo now, so that it can launch the
1710 : * archiver if necessary.
1711 : */
1712 225 : if (IsUnderPostmaster)
1713 216 : SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
1714 :
1715 : /*
1716 : * Allow read-only connections immediately if we're consistent already.
1717 : */
1718 225 : CheckRecoveryConsistency();
1719 :
1720 : /*
1721 : * Find the first record that logically follows the checkpoint --- it
1722 : * might physically precede it, though.
1723 : */
1724 225 : if (RedoStartLSN < CheckPointLoc)
1725 : {
1726 : /* back up to find the record */
1727 123 : replayTLI = RedoStartTLI;
1728 123 : XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
1729 123 : record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1730 :
1731 : /*
1732 : * If a checkpoint record's redo pointer points back to an earlier
1733 : * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1734 : * record.
1735 : */
1736 123 : if (record->xl_rmid != RM_XLOG_ID ||
1737 123 : (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1738 0 : ereport(FATAL,
1739 : errmsg("unexpected record type found at redo point %X/%08X",
1740 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
1741 : }
1742 : else
1743 : {
1744 : /* just have to read next record after CheckPoint */
1745 : Assert(xlogreader->ReadRecPtr == CheckPointLoc);
1746 102 : replayTLI = CheckPointTLI;
1747 102 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1748 : }
1749 :
/* A NULL record here means there is nothing to replay after the checkpoint. */
1750 225 : if (record != NULL)
1751 : {
1752 : TimestampTz xtime;
1753 : PGRUsage ru0;
1754 :
1755 216 : pg_rusage_init(&ru0);
1756 :
1757 216 : InRedo = true;
1758 :
1759 216 : RmgrStartup();
1760 :
1761 216 : ereport(LOG,
1762 : errmsg("redo starts at %X/%08X",
1763 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
1764 :
1765 : /* Prepare to report progress of the redo phase. */
1766 216 : if (!StandbyMode)
1767 109 : begin_startup_progress_phase();
1768 :
1769 : /*
1770 : * main redo apply loop
1771 : */
1772 : do
1773 : {
1774 2797193 : if (!StandbyMode)
1775 263715 : ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
1776 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
1777 :
1778 : #ifdef WAL_DEBUG
1779 : if (XLOG_DEBUG)
1780 : {
1781 : StringInfoData buf;
1782 :
1783 : initStringInfo(&buf);
1784 : appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
1785 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1786 : LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
1787 : xlog_outrec(&buf, xlogreader);
1788 : appendStringInfoString(&buf, " - ");
1789 : xlog_outdesc(&buf, xlogreader);
1790 : elog(LOG, "%s", buf.data);
1791 : pfree(buf.data);
1792 : }
1793 : #endif
1794 :
1795 : /* Handle interrupt signals of startup process */
1796 2797193 : ProcessStartupProcInterrupts();
1797 :
1798 : /*
1799 : * Pause WAL replay, if requested by a hot-standby session via
1800 : * SetRecoveryPause().
1801 : *
1802 : * Note that we intentionally don't take the info_lck spinlock
1803 : * here. We might therefore read a slightly stale value of the
1804 : * recoveryPause flag, but it can't be very stale (no worse than
1805 : * the last spinlock we did acquire). Since a pause request is a
1806 : * pretty asynchronous thing anyway, possibly responding to it one
1807 : * WAL record later than we otherwise would is a minor issue, so
1808 : * it doesn't seem worth adding another spinlock cycle to prevent
1809 : * that.
1810 : */
1811 2797193 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1812 : RECOVERY_NOT_PAUSED)
1813 0 : recoveryPausesHere(false);
1814 :
1815 : /*
1816 : * Have we reached our recovery target?
 * (Checked before applying, so this record is not replayed.)
1817 : */
1818 2797193 : if (recoveryStopsBefore(xlogreader))
1819 : {
1820 2 : reachedRecoveryTarget = true;
1821 2 : break;
1822 : }
1823 :
1824 : /*
1825 : * If we've been asked to lag the primary, wait on latch until
1826 : * enough time has passed.
1827 : */
1828 2797191 : if (recoveryApplyDelay(xlogreader))
1829 : {
1830 : /*
1831 : * We test for paused recovery again here. If user sets
1832 : * delayed apply, it may be because they expect to pause
1833 : * recovery in case of problems, so we must test again here
1834 : * otherwise pausing during the delay-wait wouldn't work.
1835 : */
1836 28 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1837 : RECOVERY_NOT_PAUSED)
1838 0 : recoveryPausesHere(false);
1839 : }
1840 :
1841 : /*
1842 : * Apply the record
 * (replayTLI is passed by address; ApplyWalRecord may advance it
 * when the record switches timelines.)
1843 : */
1844 2797191 : ApplyWalRecord(xlogreader, record, &replayTLI);
1845 :
1846 : /*
1847 : * If we replayed an LSN that someone was waiting for then walk
1848 : * over the shared memory array and set latches to notify the
1849 : * waiters.
1850 : */
1851 5594378 : if (waitLSNState &&
1852 2797189 : (XLogRecoveryCtl->lastReplayedEndRecPtr >=
1853 2797189 : pg_atomic_read_u64(&waitLSNState->minWaitedLSN[WAIT_LSN_TYPE_STANDBY_REPLAY])))
1854 8 : WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_REPLAY, XLogRecoveryCtl->lastReplayedEndRecPtr);
1855 :
1856 : /* Exit loop if we reached inclusive recovery target */
1857 2797189 : if (recoveryStopsAfter(xlogreader))
1858 : {
1859 5 : reachedRecoveryTarget = true;
1860 5 : break;
1861 : }
1862 :
1863 : /* Else, try to fetch the next WAL record */
1864 2797184 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1865 2797125 : } while (record != NULL);
1866 :
1867 : /*
1868 : * end of main redo apply loop
1869 : */
1870 :
1871 155 : if (reachedRecoveryTarget)
1872 : {
1873 7 : if (!reachedConsistency)
1874 0 : ereport(FATAL,
1875 : (errmsg("requested recovery stop point is before consistent recovery point")));
1876 :
1877 : /*
1878 : * This is the last point where we can restart recovery with a new
1879 : * recovery target, if we shutdown and begin again. After this,
1880 : * Resource Managers may choose to do permanent corrective actions
1881 : * at end of recovery.
1882 : */
1883 7 : switch (recoveryTargetAction)
1884 : {
1885 0 : case RECOVERY_TARGET_ACTION_SHUTDOWN:
1886 :
1887 : /*
1888 : * exit with special return code to request shutdown of
1889 : * postmaster. Log messages issued from postmaster.
1890 : */
1891 0 : proc_exit(3);
1892 :
1893 1 : case RECOVERY_TARGET_ACTION_PAUSE:
1894 1 : SetRecoveryPause(true);
1895 1 : recoveryPausesHere(true);
1896 :
1897 : /* drop into promote */
1898 : pg_fallthrough;
1899 :
1900 7 : case RECOVERY_TARGET_ACTION_PROMOTE:
1901 7 : break;
1902 : }
1903 : }
1904 :
1905 155 : RmgrCleanup();
1906 :
1907 155 : ereport(LOG,
1908 : errmsg("redo done at %X/%08X system usage: %s",
1909 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1910 : pg_rusage_show(&ru0)));
1911 155 : xtime = GetLatestXTime();
1912 155 : if (xtime)
1913 38 : ereport(LOG,
1914 : (errmsg("last completed transaction was at log time %s",
1915 : timestamptz_to_str(xtime))));
1916 :
1917 155 : InRedo = false;
1918 : }
1919 : else
1920 : {
1921 : /* there are no WAL records following the checkpoint */
1922 9 : ereport(LOG,
1923 : (errmsg("redo is not required")));
1924 : }
1925 :
1926 : /*
1927 : * This check is intentionally after the above log messages that indicate
1928 : * how far recovery went.
1929 : */
1930 164 : if (ArchiveRecoveryRequested &&
1931 56 : recoveryTarget != RECOVERY_TARGET_UNSET &&
1932 8 : !reachedRecoveryTarget)
1933 1 : ereport(FATAL,
1934 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
1935 : errmsg("recovery ended before configured recovery target was reached")));
1936 163 : }
1937 :
1938 : /*
1939 : * Subroutine of PerformWalRecovery, to apply one WAL record.
 *
 * xlogreader: reader holding the decoded record; its ReadRecPtr and
 *		EndRecPtr are what get published to XLogRecoveryCtl.
 * record: header of the record to replay.
 * replayTLI: in/out.  Timeline currently being replayed.  It is updated
 *		here, before the record is replayed, when the record is an
 *		XLOG_CHECKPOINT_SHUTDOWN or XLOG_END_OF_RECOVERY record whose
 *		ThisTimeLineID differs from *replayTLI.
 *
 * The shared replayEndRecPtr/replayEndTLI are advanced before the redo
 * routine runs, and lastReplayed* only after it has returned.  An error
 * context callback (rm_redo_error_callback) is installed for the duration
 * of the redo and consistency-check calls.
1940 : */
1941 : static void
1942 2797191 : ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
1943 : {
1944 : ErrorContextCallback errcallback;
1945 2797191 : bool switchedTLI = false;
1946 :
1947 : /* Setup error traceback support for ereport() */
1948 2797191 : errcallback.callback = rm_redo_error_callback;
1949 2797191 : errcallback.arg = xlogreader;
1950 2797191 : errcallback.previous = error_context_stack;
1951 2797191 : error_context_stack = &errcallback;
1952 :
1953 : /*
1954 : * TransamVariables->nextXid must be beyond record's xid.
1955 : */
1956 2797191 : AdvanceNextFullTransactionIdPastXid(record->xl_xid);
1957 :
1958 : /*
1959 : * Before replaying this record, check if this record causes the current
1960 : * timeline to change. The record is already considered to be part of the
1961 : * new timeline, so we update replayTLI before replaying it. That's
1962 : * important so that replayEndTLI, which is recorded as the minimum
1963 : * recovery point's TLI if recovery stops after this record, is set
1964 : * correctly.
1965 : */
1966 2797191 : if (record->xl_rmid == RM_XLOG_ID)
1967 : {
/* Default both to the current TLI, i.e. "no switch" for other record types. */
1968 44606 : TimeLineID newReplayTLI = *replayTLI;
1969 44606 : TimeLineID prevReplayTLI = *replayTLI;
1970 44606 : uint8 info = record->xl_info & ~XLR_INFO_MASK;
1971 :
1972 44606 : if (info == XLOG_CHECKPOINT_SHUTDOWN)
1973 : {
1974 : CheckPoint checkPoint;
1975 :
1976 39 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1977 39 : newReplayTLI = checkPoint.ThisTimeLineID;
1978 39 : prevReplayTLI = checkPoint.PrevTimeLineID;
1979 : }
1980 44567 : else if (info == XLOG_END_OF_RECOVERY)
1981 : {
1982 : xl_end_of_recovery xlrec;
1983 :
1984 11 : memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1985 11 : newReplayTLI = xlrec.ThisTimeLineID;
1986 11 : prevReplayTLI = xlrec.PrevTimeLineID;
1987 : }
1988 :
1989 44606 : if (newReplayTLI != *replayTLI)
1990 : {
1991 : /* Check that it's OK to switch to this TLI */
1992 12 : checkTimeLineSwitch(xlogreader->EndRecPtr,
1993 : newReplayTLI, prevReplayTLI, *replayTLI);
1994 :
1995 : /* Following WAL records should be run with new TLI */
1996 12 : *replayTLI = newReplayTLI;
1997 12 : switchedTLI = true;
1998 : }
1999 : }
2000 :
2001 : /*
2002 : * Update shared replayEndRecPtr before replaying this record, so that
2003 : * XLogFlush will update minRecoveryPoint correctly.
2004 : */
2005 2797191 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2006 2797191 : XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
2007 2797191 : XLogRecoveryCtl->replayEndTLI = *replayTLI;
2008 2797191 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2009 :
2010 : /*
2011 : * If we are attempting to enter Hot Standby mode, process XIDs we see
2012 : */
2013 2797191 : if (standbyState >= STANDBY_INITIALIZED &&
2014 2553283 : TransactionIdIsValid(record->xl_xid))
2015 2498833 : RecordKnownAssignedTransactionIds(record->xl_xid);
2016 :
2017 : /*
2018 : * Some XLOG record types that are related to recovery are processed
2019 : * directly here, rather than in xlog_redo()
 * (this is in addition to, not instead of, the rm_redo call below).
2020 : */
2021 2797191 : if (record->xl_rmid == RM_XLOG_ID)
2022 44606 : xlogrecovery_redo(xlogreader, *replayTLI);
2023 :
2024 : /* Now apply the WAL record itself */
2025 2797191 : GetRmgr(record->xl_rmid).rm_redo(xlogreader);
2026 :
2027 : /*
2028 : * After redo, check whether the backup pages associated with the WAL
2029 : * record are consistent with the existing pages. This check is done only
2030 : * if consistency check is enabled for this record.
2031 : */
2032 2797189 : if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2033 2211560 : verifyBackupPageConsistency(xlogreader);
2034 :
2035 : /* Pop the error context stack */
2036 2797189 : error_context_stack = errcallback.previous;
2037 :
2038 : /*
2039 : * Update lastReplayedEndRecPtr after this record has been successfully
2040 : * replayed.
2041 : */
2042 2797189 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2043 2797189 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
2044 2797189 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
2045 2797189 : XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2046 2797189 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2047 :
2048 : /* ------
2049 : * Wakeup walsenders:
2050 : *
2051 : * On the standby, the WAL is flushed first (which will only wake up
2052 : * physical walsenders) and then applied, which will only wake up logical
2053 : * walsenders.
2054 : *
2055 : * Indeed, logical walsenders on standby can't decode and send data until
2056 : * it's been applied.
2057 : *
2058 : * Physical walsenders don't need to be woken up during replay unless
2059 : * cascading replication is allowed and time line change occurred (so that
2060 : * they can notice that they are on a new time line).
2061 : *
2062 : * That's why the wake up conditions are for:
2063 : *
2064 : * - physical walsenders in case of new time line and cascade
2065 : * replication is allowed
2066 : * - logical walsenders in case cascade replication is allowed (could not
2067 : * be created otherwise)
2068 : * ------
2069 : */
2070 2797189 : if (AllowCascadeReplication())
2071 2607858 : WalSndWakeup(switchedTLI, true);
2072 :
2073 : /*
2074 : * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2075 : * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2076 : * a reply to the primary.
2077 : */
2078 2797189 : if (doRequestWalReceiverReply)
2079 : {
2080 2 : doRequestWalReceiverReply = false;
2081 2 : WalRcvForceReply();
2082 : }
2083 :
2084 : /* Allow read-only connections if we're consistent now */
2085 2797189 : CheckRecoveryConsistency();
2086 :
2087 : /* Is this a timeline switch? */
2088 2797189 : if (switchedTLI)
2089 : {
2090 : /*
2091 : * Before we continue on the new timeline, clean up any (possibly
2092 : * bogus) future WAL segments on the old timeline.
2093 : */
2094 12 : RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
2095 :
2096 : /* Reset the prefetcher. */
2097 12 : XLogPrefetchReconfigure();
2098 : }
2099 2797189 : }
2100 :
2101 : /*
2102 : * Some XLOG RM record types that are directly related to WAL recovery are
2103 : * handled here rather than in the xlog_redo()
2104 : */
2105 : static void
2106 44606 : xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
2107 : {
2108 44606 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2109 44606 : XLogRecPtr lsn = record->EndRecPtr;
2110 :
2111 : Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2112 :
2113 44606 : if (info == XLOG_OVERWRITE_CONTRECORD)
2114 : {
2115 : /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2116 : xl_overwrite_contrecord xlrec;
2117 :
2118 1 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2119 1 : if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2120 0 : elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
2121 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2122 : LSN_FORMAT_ARGS(record->overwrittenRecPtr));
2123 :
2124 : /* We have safely skipped the aborted record */
2125 1 : abortedRecPtr = InvalidXLogRecPtr;
2126 1 : missingContrecPtr = InvalidXLogRecPtr;
2127 :
2128 1 : ereport(LOG,
2129 : errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
2130 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2131 : timestamptz_to_str(xlrec.overwrite_time)));
2132 :
2133 : /* Verifying the record should only happen once */
2134 1 : record->overwrittenRecPtr = InvalidXLogRecPtr;
2135 : }
2136 44605 : else if (info == XLOG_BACKUP_END)
2137 : {
2138 : XLogRecPtr startpoint;
2139 :
2140 96 : memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2141 :
2142 96 : if (backupStartPoint == startpoint)
2143 : {
2144 : /*
2145 : * We have reached the end of base backup, the point where
2146 : * pg_backup_stop() was done. The data on disk is now consistent
2147 : * (assuming we have also reached minRecoveryPoint). Set
2148 : * backupEndPoint to the current LSN, so that the next call to
2149 : * CheckRecoveryConsistency() will notice it and do the
2150 : * end-of-backup processing.
2151 : */
2152 79 : elog(DEBUG1, "end of backup record reached");
2153 :
2154 79 : backupEndPoint = lsn;
2155 : }
2156 : else
2157 17 : elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
2158 : LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
2159 : }
2160 44606 : }
2161 :
2162 : /*
2163 : * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2164 : * directories.
2165 : *
2166 : * Replay of database creation XLOG records for databases that were later
2167 : * dropped can create fake directories in pg_tblspc. By the time consistency
2168 : * is reached these directories should have been removed; here we verify
2169 : * that this did indeed happen. This is to be called at the point where
2170 : * consistent state is reached.
2171 : *
2172 : * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2173 : * useful for testing purposes, and also allows for an escape hatch in case
2174 : * things go south.
2175 : */
2176 : static void
2177 124 : CheckTablespaceDirectory(void)
2178 : {
2179 : DIR *dir;
2180 : struct dirent *de;
2181 :
2182 124 : dir = AllocateDir(PG_TBLSPC_DIR);
2183 379 : while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2184 : {
2185 : char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2186 :
2187 : /* Skip entries of non-oid names */
2188 255 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2189 248 : continue;
2190 :
2191 7 : snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2192 :
2193 7 : if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2194 4 : ereport(allow_in_place_tablespaces ? WARNING : PANIC,
2195 : (errcode(ERRCODE_DATA_CORRUPTED),
2196 : errmsg("unexpected directory entry \"%s\" found in %s",
2197 : de->d_name, PG_TBLSPC_DIR),
2198 : errdetail("All directory entries in %s/ should be symbolic links.",
2199 : PG_TBLSPC_DIR),
2200 : errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2201 : }
2202 124 : }
2203 :
2204 : /*
2205 : * Checks if recovery has reached a consistent state. When consistency is
2206 : * reached and we have a valid starting standby snapshot, tell postmaster
2207 : * that it can start accepting read-only connections.
2208 : */
2209 : static void
2210 2797416 : CheckRecoveryConsistency(void)
2211 : {
2212 : XLogRecPtr lastReplayedEndRecPtr;
2213 : TimeLineID lastReplayedTLI;
2214 :
2215 : /*
2216 : * During crash recovery, we don't reach a consistent state until we've
2217 : * replayed all the WAL.
2218 : */
2219 2797416 : if (!XLogRecPtrIsValid(minRecoveryPoint))
2220 258601 : return;
2221 :
2222 : Assert(InArchiveRecovery);
2223 :
2224 : /*
2225 : * assume that we are called in the startup process, and hence don't need
2226 : * a lock to read lastReplayedEndRecPtr
2227 : */
2228 2538815 : lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2229 2538815 : lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2230 :
2231 : /*
2232 : * Have we reached the point where our base backup was completed?
2233 : */
2234 2538815 : if (XLogRecPtrIsValid(backupEndPoint) &&
2235 114 : backupEndPoint <= lastReplayedEndRecPtr)
2236 : {
2237 81 : XLogRecPtr saveBackupStartPoint = backupStartPoint;
2238 81 : XLogRecPtr saveBackupEndPoint = backupEndPoint;
2239 :
2240 81 : elog(DEBUG1, "end of backup reached");
2241 :
2242 : /*
2243 : * We have reached the end of base backup, as indicated by pg_control.
2244 : * Update the control file accordingly.
2245 : */
2246 81 : ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2247 81 : backupStartPoint = InvalidXLogRecPtr;
2248 81 : backupEndPoint = InvalidXLogRecPtr;
2249 81 : backupEndRequired = false;
2250 :
2251 81 : ereport(LOG,
2252 : errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
2253 : LSN_FORMAT_ARGS(saveBackupStartPoint),
2254 : LSN_FORMAT_ARGS(saveBackupEndPoint)));
2255 : }
2256 :
2257 : /*
2258 : * Have we passed our safe starting point? Note that minRecoveryPoint is
2259 : * known to be incorrectly set if recovering from a backup, until the
2260 : * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2261 : * All we know prior to that is that we're not consistent yet.
2262 : */
2263 2538815 : if (!reachedConsistency && !backupEndRequired &&
2264 7692 : minRecoveryPoint <= lastReplayedEndRecPtr)
2265 : {
2266 : /*
2267 : * Check to see if the XLOG sequence contained any unresolved
2268 : * references to uninitialized pages.
2269 : */
2270 124 : XLogCheckInvalidPages();
2271 :
2272 : /*
2273 : * Check that pg_tblspc doesn't contain any real directories. Replay
2274 : * of Database/CREATE_* records may have created fictitious tablespace
2275 : * directories that should have been removed by the time consistency
2276 : * was reached.
2277 : */
2278 124 : CheckTablespaceDirectory();
2279 :
2280 124 : reachedConsistency = true;
2281 124 : SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
2282 124 : ereport(LOG,
2283 : errmsg("consistent recovery state reached at %X/%08X",
2284 : LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
2285 : }
2286 :
2287 : /*
2288 : * Have we got a valid starting snapshot that will allow queries to be
2289 : * run? If so, we can tell postmaster that the database is consistent now,
2290 : * enabling connections.
2291 : */
2292 2538815 : if (standbyState == STANDBY_SNAPSHOT_READY &&
2293 2538569 : !LocalHotStandbyActive &&
2294 115 : reachedConsistency &&
2295 : IsUnderPostmaster)
2296 : {
2297 115 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2298 115 : XLogRecoveryCtl->SharedHotStandbyActive = true;
2299 115 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2300 :
2301 115 : LocalHotStandbyActive = true;
2302 :
2303 115 : SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
2304 : }
2305 : }
2306 :
2307 : /*
2308 : * Error context callback for errors occurring during rm_redo().
2309 : */
2310 : static void
2311 156 : rm_redo_error_callback(void *arg)
2312 : {
2313 156 : XLogReaderState *record = (XLogReaderState *) arg;
2314 : StringInfoData buf;
2315 :
2316 156 : initStringInfo(&buf);
2317 156 : xlog_outdesc(&buf, record);
2318 156 : xlog_block_info(&buf, record);
2319 :
2320 : /* translator: %s is a WAL record description */
2321 156 : errcontext("WAL redo at %X/%08X for %s",
2322 156 : LSN_FORMAT_ARGS(record->ReadRecPtr),
2323 : buf.data);
2324 :
2325 156 : pfree(buf.data);
2326 156 : }
2327 :
2328 : /*
2329 : * Returns a string describing an XLogRecord, consisting of its identity
2330 : * optionally followed by a colon, a space, and a further description.
2331 : */
2332 : void
2333 156 : xlog_outdesc(StringInfo buf, XLogReaderState *record)
2334 : {
2335 156 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2336 156 : uint8 info = XLogRecGetInfo(record);
2337 : const char *id;
2338 :
2339 156 : appendStringInfoString(buf, rmgr.rm_name);
2340 156 : appendStringInfoChar(buf, '/');
2341 :
2342 156 : id = rmgr.rm_identify(info);
2343 156 : if (id == NULL)
2344 0 : appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2345 : else
2346 156 : appendStringInfo(buf, "%s: ", id);
2347 :
2348 156 : rmgr.rm_desc(buf, record);
2349 156 : }
2350 :
2351 : #ifdef WAL_DEBUG
2352 :
2353 : static void
2354 : xlog_outrec(StringInfo buf, XLogReaderState *record)
2355 : {
2356 : appendStringInfo(buf, "prev %X/%08X; xid %u",
2357 : LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
2358 : XLogRecGetXid(record));
2359 :
2360 : appendStringInfo(buf, "; len %u",
2361 : XLogRecGetDataLen(record));
2362 :
2363 : xlog_block_info(buf, record);
2364 : }
2365 : #endif /* WAL_DEBUG */
2366 :
2367 : /*
2368 : * Returns a string giving information about all the blocks in an
2369 : * XLogRecord.
2370 : */
2371 : static void
2372 156 : xlog_block_info(StringInfo buf, XLogReaderState *record)
2373 : {
2374 : int block_id;
2375 :
2376 : /* decode block references */
2377 209 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2378 : {
2379 : RelFileLocator rlocator;
2380 : ForkNumber forknum;
2381 : BlockNumber blk;
2382 :
2383 53 : if (!XLogRecGetBlockTagExtended(record, block_id,
2384 : &rlocator, &forknum, &blk, NULL))
2385 0 : continue;
2386 :
2387 53 : if (forknum != MAIN_FORKNUM)
2388 5 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2389 : block_id,
2390 : rlocator.spcOid, rlocator.dbOid,
2391 : rlocator.relNumber,
2392 : forknum,
2393 : blk);
2394 : else
2395 48 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2396 : block_id,
2397 : rlocator.spcOid, rlocator.dbOid,
2398 : rlocator.relNumber,
2399 : blk);
2400 53 : if (XLogRecHasBlockImage(record, block_id))
2401 34 : appendStringInfoString(buf, " FPW");
2402 : }
2403 156 : }
2404 :
2405 :
2406 : /*
2407 : * Check that it's OK to switch to new timeline during recovery.
2408 : *
2409 : * 'lsn' is the address of the shutdown checkpoint record we're about to
2410 : * replay. (Currently, timeline can only change at a shutdown checkpoint).
2411 : */
2412 : static void
2413 12 : checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
2414 : TimeLineID replayTLI)
2415 : {
2416 : /* Check that the record agrees on what the current (old) timeline is */
2417 12 : if (prevTLI != replayTLI)
2418 0 : ereport(PANIC,
2419 : (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2420 : prevTLI, replayTLI)));
2421 :
2422 : /*
2423 : * The new timeline better be in the list of timelines we expect to see,
2424 : * according to the timeline history. It should also not decrease.
2425 : */
2426 12 : if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2427 0 : ereport(PANIC,
2428 : (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2429 : newTLI, replayTLI)));
2430 :
2431 : /*
2432 : * If we have not yet reached min recovery point, and we're about to
2433 : * switch to a timeline greater than the timeline of the min recovery
2434 : * point: trouble. After switching to the new timeline, we could not
2435 : * possibly visit the min recovery point on the correct timeline anymore.
2436 : * This can happen if there is a newer timeline in the archive that
2437 : * branched before the timeline the min recovery point is on, and you
2438 : * attempt to do PITR to the new timeline.
2439 : */
2440 12 : if (XLogRecPtrIsValid(minRecoveryPoint) &&
2441 10 : lsn < minRecoveryPoint &&
2442 1 : newTLI > minRecoveryPointTLI)
2443 0 : ereport(PANIC,
2444 : errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
2445 : newTLI,
2446 : LSN_FORMAT_ARGS(minRecoveryPoint),
2447 : minRecoveryPointTLI));
2448 :
2449 : /* Looks good */
2450 12 : }
2451 :
2452 :
2453 : /*
2454 : * Extract timestamp from WAL record.
2455 : *
2456 : * If the record contains a timestamp, returns true, and saves the timestamp
2457 : * in *recordXtime. If the record type has no timestamp, returns false.
2458 : * Currently, only transaction commit/abort records and restore points contain
2459 : * timestamps.
2460 : */
2461 : static bool
2462 44829 : getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
2463 : {
2464 44829 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2465 44829 : uint8 xact_info = info & XLOG_XACT_OPMASK;
2466 44829 : uint8 rmid = XLogRecGetRmid(record);
2467 :
2468 44829 : if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2469 : {
2470 2 : *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2471 2 : return true;
2472 : }
2473 44827 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2474 : xact_info == XLOG_XACT_COMMIT_PREPARED))
2475 : {
2476 41085 : *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2477 41085 : return true;
2478 : }
2479 3742 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2480 : xact_info == XLOG_XACT_ABORT_PREPARED))
2481 : {
2482 3742 : *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2483 3742 : return true;
2484 : }
2485 0 : return false;
2486 : }
2487 :
2488 : /*
2489 : * Checks whether the current buffer page and the backup page stored in the
2490 : * WAL record are consistent. Before comparing the two pages, a mask
2491 : * can be applied to both to ignore certain areas like hint bits and the
2492 : * unused space between pd_lower and pd_upper, among other things. This
2493 : * function should be called once WAL replay has been completed for a
2494 : * given record. Raises FATAL if any compared block's images differ.
2495 : */
2496 : static void
2497 2211560 : verifyBackupPageConsistency(XLogReaderState *record)
2498 : {
2499 2211560 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2500 : RelFileLocator rlocator;
2501 : ForkNumber forknum;
2502 : BlockNumber blkno;
2503 : int block_id;
2504 :
2505 : /* Records with no block references have no need for consistency checks. */
2506 2211560 : if (!XLogRecHasAnyBlockRefs(record))
2507 79 : return;
2508 :
2509 : Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2510 :
2511 4593138 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2512 : {
2513 : Buffer buf;
2514 : Page page;
2515 :
2516 2381657 : if (!XLogRecGetBlockTagExtended(record, block_id,
2517 : &rlocator, &forknum, &blkno, NULL))
2518 : {
2519 : /*
2520 : * WAL record doesn't contain a block reference with the given id.
2521 : * Do nothing.
2522 : */
2523 2102 : continue;
2524 : }
2525 :
2526 : Assert(XLogRecHasBlockImage(record, block_id));
2527 :
2528 2379555 : if (XLogRecBlockImageApply(record, block_id))
2529 : {
2530 : /*
2531 : * The full-page image was already applied to the page during
2532 : * replay, so bypass the consistency check as that would result in
2533 : * comparing the full page stored in the record with itself.
2534 : */
2535 27512 : continue;
2536 : }
2537 :
2538 : /*
2539 : * Read the contents from the current buffer and store it in a
2540 : * temporary page.
2541 : */
2542 2352043 : buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2543 : RBM_NORMAL_NO_LOG,
2544 : InvalidBuffer);
2545 2352043 : if (!BufferIsValid(buf))
2546 0 : continue; /* no valid buffer to compare against */
2547 :
2548 2352043 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2549 2352043 : page = BufferGetPage(buf);
2550 :
2551 : /*
2552 : * Take a copy of the local page where WAL has been applied to have a
2553 : * comparison base before masking it...
2554 : */
2555 2352043 : memcpy(replay_image_masked, page, BLCKSZ);
2556 :
2557 : /* The buffer is no longer needed now that a copy has been taken. */
2558 2352043 : UnlockReleaseBuffer(buf);
2559 :
2560 : /*
2561 : * If the block LSN is already ahead of this WAL record, we can't
2562 : * expect contents to match. This can happen if recovery is
2563 : * restarted.
2564 : */
2565 2352043 : if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2566 0 : continue;
2567 :
2568 : /*
2569 : * Read the contents from the backup copy, stored in WAL record and
2570 : * store it in a temporary page. There is no need to allocate a new
2571 : * page here, a local buffer is fine to hold its contents and a mask
2572 : * can be directly applied on it.
2573 : */
2574 2352043 : if (!RestoreBlockImage(record, block_id, primary_image_masked))
2575 0 : ereport(ERROR,
2576 : (errcode(ERRCODE_INTERNAL_ERROR),
2577 : errmsg_internal("%s", record->errormsg_buf)));
2578 :
2579 : /*
2580 : * If the resource manager defines a masking function, apply it to
2581 : * both the replay and the primary images.
2582 : */
2583 2352043 : if (rmgr.rm_mask != NULL)
2584 : {
2585 2352043 : rmgr.rm_mask(replay_image_masked, blkno);
2586 2352043 : rmgr.rm_mask(primary_image_masked, blkno);
2587 : }
2588 :
2589 : /* Time to compare the primary and replay images. */
2590 2352043 : if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2591 : {
2592 0 : elog(FATAL,
2593 : "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2594 : rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2595 : forknum, blkno);
2596 : }
2597 : }
2598 : }
2599 :
2600 : /*
2601 : * For point-in-time recovery, this function decides whether we want to
2602 : * stop applying the XLOG before the current record.
2603 : *
2604 : * Returns true if we are stopping, false otherwise. If stopping, some
2605 : * information is saved in recoveryStopXid et al for use in annotating the
2606 : * new timeline's history file.
2607 : */
2608 : static bool
2609 2797193 : recoveryStopsBefore(XLogReaderState *record)
2610 : {
2611 2797193 : bool stopsHere = false;
2612 : uint8 xact_info;
2613 : bool isCommit;
2614 2797193 : TimestampTz recordXtime = 0;
2615 : TransactionId recordXid;
2616 :
2617 : /*
2618 : * Ignore recovery target settings when not in archive recovery (meaning
2619 : * we are in crash recovery).
2620 : */
2621 2797193 : if (!ArchiveRecoveryRequested)
2622 243894 : return false;
2623 :
2624 : /* Check if we should stop as soon as reaching consistency */
2625 2553299 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2626 : {
2627 0 : ereport(LOG,
2628 : (errmsg("recovery stopping after reaching consistency")));
2629 :
2630 0 : recoveryStopAfter = false;
2631 0 : recoveryStopXid = InvalidTransactionId;
2632 0 : recoveryStopLSN = InvalidXLogRecPtr;
2633 0 : recoveryStopTime = 0;
2634 0 : recoveryStopName[0] = '\0';
2635 0 : return true;
2636 : }
2637 :
2638 : /* Check if an exclusive target LSN has been reached */
2639 2553299 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2640 8512 : !recoveryTargetInclusive &&
2641 482 : record->ReadRecPtr >= recoveryTargetLSN)
2642 : {
2643 2 : recoveryStopAfter = false;
2644 2 : recoveryStopXid = InvalidTransactionId;
2645 2 : recoveryStopLSN = record->ReadRecPtr;
2646 2 : recoveryStopTime = 0;
2647 2 : recoveryStopName[0] = '\0';
2648 2 : ereport(LOG,
2649 : errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
2650 : LSN_FORMAT_ARGS(recoveryStopLSN)));
2651 2 : return true;
2652 : }
2653 :
2654 : /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2655 2553297 : if (XLogRecGetRmid(record) != RM_XACT_ID)
2656 2530594 : return false;
2657 :
2658 22703 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2659 :
2660 22703 : if (xact_info == XLOG_XACT_COMMIT)
2661 : {
2662 20500 : isCommit = true;
2663 20500 : recordXid = XLogRecGetXid(record);
2664 : }
2665 2203 : else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2666 : {
2667 29 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2668 : xl_xact_parsed_commit parsed;
2669 :
2670 29 : isCommit = true;
2671 29 : ParseCommitRecord(XLogRecGetInfo(record),
2672 : xlrec,
2673 : &parsed);
2674 29 : recordXid = parsed.twophase_xid;
2675 : }
2676 2174 : else if (xact_info == XLOG_XACT_ABORT)
2677 : {
2678 1858 : isCommit = false;
2679 1858 : recordXid = XLogRecGetXid(record);
2680 : }
2681 316 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2682 : {
2683 13 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2684 : xl_xact_parsed_abort parsed;
2685 :
2686 13 : isCommit = false;
2687 13 : ParseAbortRecord(XLogRecGetInfo(record),
2688 : xlrec,
2689 : &parsed);
2690 13 : recordXid = parsed.twophase_xid;
2691 : }
2692 : else
2693 303 : return false; /* not a commit or abort record */
2694 :
2695 22400 : if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
2696 : {
2697 : /*
2698 : * There can be only one transaction end record with this exact
2699 : * transaction ID.
2700 : *
2701 : * When testing for an XID, we MUST test for equality only, since
2702 : * transactions are numbered in the order they start, not the order
2703 : * they complete. A higher numbered XID will complete before a lower
2704 : * numbered one about 50% of the time...
2705 : */
2706 0 : stopsHere = (recordXid == recoveryTargetXid);
2707 : }
2708 :
2709 : /*
2710 : * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2711 : * We don't expect getRecordTimestamp ever to fail, since we already know
2712 : * this is a commit or abort record; but test its result anyway.
2713 : */
2714 22400 : if (getRecordTimestamp(record, &recordXtime) &&
2715 22400 : recoveryTarget == RECOVERY_TARGET_TIME)
2716 : {
2717 : /*
2718 : * There can be many transactions that share the same commit time, so
2719 : * we stop after the last one, if we are inclusive, or stop at the
2720 : * first one if we are exclusive.
2721 : */
2722 0 : if (recoveryTargetInclusive)
2723 0 : stopsHere = (recordXtime > recoveryTargetTime);
2724 : else
2725 0 : stopsHere = (recordXtime >= recoveryTargetTime);
2726 : }
2727 :
2728 22400 : if (stopsHere)
2729 : {
2730 0 : recoveryStopAfter = false;
2731 0 : recoveryStopXid = recordXid;
2732 0 : recoveryStopTime = recordXtime;
2733 0 : recoveryStopLSN = InvalidXLogRecPtr;
2734 0 : recoveryStopName[0] = '\0';
2735 :
2736 0 : if (isCommit)
2737 : {
2738 0 : ereport(LOG,
2739 : (errmsg("recovery stopping before commit of transaction %u, time %s",
2740 : recoveryStopXid,
2741 : timestamptz_to_str(recoveryStopTime))));
2742 : }
2743 : else
2744 : {
2745 0 : ereport(LOG,
2746 : (errmsg("recovery stopping before abort of transaction %u, time %s",
2747 : recoveryStopXid,
2748 : timestamptz_to_str(recoveryStopTime))));
2749 : }
2750 : }
2751 :
2752 22400 : return stopsHere;
2753 : }
2754 :
2755 : /*
2756 : * Same as recoveryStopsBefore, but called after applying the record:
2757 : * returns true if recovery should stop after it, false otherwise.
2758 : * We also track the timestamp of the latest applied COMMIT/ABORT
2759 : * record in XLogRecoveryCtl->recoveryLastXTime.
2760 : */
2761 : static bool
2762 2797189 : recoveryStopsAfter(XLogReaderState *record)
2763 : {
2764 : uint8 info;
2765 : uint8 xact_info;
2766 : uint8 rmid;
2767 2797189 : TimestampTz recordXtime = 0;
2768 :
2769 : /*
2770 : * Ignore recovery target settings when not in archive recovery (meaning
2771 : * we are in crash recovery).
2772 : */
2773 2797189 : if (!ArchiveRecoveryRequested)
2774 243894 : return false;
2775 :
2776 2553295 : info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2777 2553295 : rmid = XLogRecGetRmid(record);
2778 :
2779 : /*
2780 : * There can be many restore points that share the same name; we stop at
2781 : * the first one.
2782 : */
2783 2553295 : if (recoveryTarget == RECOVERY_TARGET_NAME &&
2784 20 : rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2785 : {
2786 : xl_restore_point *recordRestorePointData;
2787 :
2788 3 : recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2789 :
2790 3 : if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2791 : {
2792 2 : recoveryStopAfter = true;
2793 2 : recoveryStopXid = InvalidTransactionId;
2794 2 : recoveryStopLSN = InvalidXLogRecPtr;
2795 2 : (void) getRecordTimestamp(record, &recoveryStopTime);
2796 2 : strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2797 :
2798 2 : ereport(LOG,
2799 : (errmsg("recovery stopping at restore point \"%s\", time %s",
2800 : recoveryStopName,
2801 : timestamptz_to_str(recoveryStopTime))));
2802 2 : return true;
2803 : }
2804 : }
2805 :
2806 : /* Check if an inclusive target LSN has been reached */
2807 2553293 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2808 8030 : recoveryTargetInclusive &&
2809 8030 : record->ReadRecPtr >= recoveryTargetLSN)
2810 : {
2811 3 : recoveryStopAfter = true;
2812 3 : recoveryStopXid = InvalidTransactionId;
2813 3 : recoveryStopLSN = record->ReadRecPtr;
2814 3 : recoveryStopTime = 0;
2815 3 : recoveryStopName[0] = '\0';
2816 3 : ereport(LOG,
2817 : errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
2818 : LSN_FORMAT_ARGS(recoveryStopLSN)));
2819 3 : return true;
2820 : }
2821 :
2822 2553290 : if (rmid != RM_XACT_ID)
2823 2530589 : return false;
2824 :
2825 22701 : xact_info = info & XLOG_XACT_OPMASK;
2826 :
2827 22701 : if (xact_info == XLOG_XACT_COMMIT ||
2828 2174 : xact_info == XLOG_XACT_COMMIT_PREPARED ||
2829 316 : xact_info == XLOG_XACT_ABORT ||
2830 : xact_info == XLOG_XACT_ABORT_PREPARED)
2831 : {
2832 : TransactionId recordXid;
2833 :
2834 : /* Update the last applied transaction timestamp */
2835 22398 : if (getRecordTimestamp(record, &recordXtime))
2836 22398 : SetLatestXTime(recordXtime);
2837 :
2838 : /* Extract the XID of the committed/aborted transaction */
2839 22398 : if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2840 : {
2841 29 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2842 : xl_xact_parsed_commit parsed;
2843 :
2844 29 : ParseCommitRecord(XLogRecGetInfo(record),
2845 : xlrec,
2846 : &parsed);
2847 29 : recordXid = parsed.twophase_xid;
2848 : }
2849 22369 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2850 : {
2851 13 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2852 : xl_xact_parsed_abort parsed;
2853 :
2854 13 : ParseAbortRecord(XLogRecGetInfo(record),
2855 : xlrec,
2856 : &parsed);
2857 13 : recordXid = parsed.twophase_xid;
2858 : }
2859 : else
2860 22356 : recordXid = XLogRecGetXid(record);
2861 :
2862 : /*
2863 : * There can be only one transaction end record with this exact
2864 : * transaction ID.
2865 : *
2866 : * When testing for an XID, we MUST test for equality only, since
2867 : * transactions are numbered in the order they start, not the order
2868 : * they complete. A higher numbered XID will complete before a lower
2869 : * numbered one about 50% of the time...
2870 : */
2871 22398 : if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
2872 0 : recordXid == recoveryTargetXid)
2873 : {
2874 0 : recoveryStopAfter = true;
2875 0 : recoveryStopXid = recordXid;
2876 0 : recoveryStopTime = recordXtime;
2877 0 : recoveryStopLSN = InvalidXLogRecPtr;
2878 0 : recoveryStopName[0] = '\0';
2879 :
2880 0 : if (xact_info == XLOG_XACT_COMMIT ||
2881 : xact_info == XLOG_XACT_COMMIT_PREPARED)
2882 : {
2883 0 : ereport(LOG,
2884 : (errmsg("recovery stopping after commit of transaction %u, time %s",
2885 : recoveryStopXid,
2886 : timestamptz_to_str(recoveryStopTime))));
2887 : }
2888 0 : else if (xact_info == XLOG_XACT_ABORT ||
2889 : xact_info == XLOG_XACT_ABORT_PREPARED)
2890 : {
2891 0 : ereport(LOG,
2892 : (errmsg("recovery stopping after abort of transaction %u, time %s",
2893 : recoveryStopXid,
2894 : timestamptz_to_str(recoveryStopTime))));
2895 : }
2896 0 : return true;
2897 : }
2898 : }
2899 :
2900 : /* Check if we should stop as soon as reaching consistency */
2901 22701 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2902 : {
2903 0 : ereport(LOG,
2904 : (errmsg("recovery stopping after reaching consistency")));
2905 :
2906 0 : recoveryStopAfter = true;
2907 0 : recoveryStopXid = InvalidTransactionId;
2908 0 : recoveryStopTime = 0;
2909 0 : recoveryStopLSN = InvalidXLogRecPtr;
2910 0 : recoveryStopName[0] = '\0';
2911 0 : return true;
2912 : }
2913 :
2914 22701 : return false;
2915 : }
2916 :
2917 : /*
2918 : * Create a comment for the history file to explain why and where the
2919 : * timeline changed. Returns a pstrdup'd string without a trailing newline.
2920 : */
2921 : static char *
2922 941 : getRecoveryStopReason(void)
2923 : {
2924 : char reason[200];
2925 :
2926 941 : if (recoveryTarget == RECOVERY_TARGET_XID)
2927 0 : snprintf(reason, sizeof(reason),
2928 : "%s transaction %u",
2929 0 : recoveryStopAfter ? "after" : "before",
2930 : recoveryStopXid);
2931 941 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
2932 0 : snprintf(reason, sizeof(reason),
2933 : "%s %s",
2934 0 : recoveryStopAfter ? "after" : "before",
2935 : timestamptz_to_str(recoveryStopTime));
2936 941 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
2937 6 : snprintf(reason, sizeof(reason),
2938 : "%s LSN %X/%08X",
2939 6 : recoveryStopAfter ? "after" : "before",
2940 6 : LSN_FORMAT_ARGS(recoveryStopLSN));
2941 935 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
2942 3 : snprintf(reason, sizeof(reason),
2943 : "at restore point \"%s\"",
2944 : recoveryStopName);
2945 932 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
2946 0 : snprintf(reason, sizeof(reason), "reached consistency");
2947 : else
2948 932 : snprintf(reason, sizeof(reason), "no recovery target specified");
2949 :
2950 941 : return pstrdup(reason);
2951 : }
2952 :
2953 : /*
2954 : * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2955 : *
2956 : * endOfRecovery is true if the recovery target is reached and
2957 : * the paused state starts at the end of recovery because of
2958 : * recovery_target_action=pause, and false otherwise.
2959 : */
2960 : static void
2961 4 : recoveryPausesHere(bool endOfRecovery)
2962 : {
2963 : /* Don't pause unless users can connect! */
2964 4 : if (!LocalHotStandbyActive)
2965 0 : return;
2966 :
2967 : /* Don't pause after standby promotion has been triggered */
2968 4 : if (LocalPromoteIsTriggered)
2969 0 : return;
2970 :
2971 4 : if (endOfRecovery)
2972 1 : ereport(LOG,
2973 : (errmsg("pausing at the end of recovery"),
2974 : errhint("Execute pg_wal_replay_resume() to promote.")));
2975 : else
2976 3 : ereport(LOG,
2977 : (errmsg("recovery has paused"),
2978 : errhint("Execute pg_wal_replay_resume() to continue.")));
2979 :
2980 : /* loop until the pause ends or standby promotion is triggered */
2981 12 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
2982 : {
2983 10 : ProcessStartupProcInterrupts();
2984 10 : if (CheckForStandbyTrigger())
2985 2 : break; /* must still cancel any pending CV sleep below */
2986 :
2987 : /*
2988 : * If recovery pause is requested then set it paused. While we are in
2989 : * the loop, user might resume and pause again so set this every time.
2990 : */
2991 8 : ConfirmRecoveryPaused();
2992 :
2993 : /*
2994 : * We wait on a condition variable that will wake us as soon as the
2995 : * pause ends, but we use a timeout so we can check the above exit
2996 : * condition periodically too.
2997 : */
2998 8 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
2999 : WAIT_EVENT_RECOVERY_PAUSE);
3000 : }
3001 2 : ConditionVariableCancelSleep();
3002 : }
3003 :
3004 : /*
3005 : * When recovery_min_apply_delay is set, we wait long enough to make sure
3006 : * certain record types are applied at least that interval behind the primary.
3007 : *
3008 : * Returns true if we waited.
3009 : *
3010 : * Note that the delay is calculated between the WAL record log time and
3011 : * the current time on standby. We would prefer to keep track of when this
3012 : * standby received each WAL record, which would allow a more consistent
3013 : * approach and one not affected by time synchronisation issues, but that
3014 : * is significantly more effort and complexity for little actual gain in
3015 : * usability.
3016 : */
3017 : static bool
3018 2797191 : recoveryApplyDelay(XLogReaderState *record)
3019 : {
3020 : uint8 xact_info;
3021 : TimestampTz xtime;
3022 : TimestampTz delayUntil;
3023 : long msecs;
3024 :
3025 : /* nothing to do if no delay configured */
3026 2797191 : if (recovery_min_apply_delay <= 0)
3027 2797049 : return false;
3028 :
3029 : /* no delay is applied on a database not yet consistent */
3030 142 : if (!reachedConsistency)
3031 4 : return false;
3032 :
3033 : /* nothing to do unless archive recovery was requested */
3034 138 : if (!ArchiveRecoveryRequested)
3035 0 : return false;
3036 :
3037 : /*
3038 : * Is it a COMMIT record?
3039 : *
3040 : * We deliberately choose not to delay aborts since they have no effect on
3041 : * MVCC. We already allow replay of records that don't have a timestamp,
3042 : * so there is already opportunity for issues caused by early conflicts on
3043 : * standbys.
3044 : */
3045 138 : if (XLogRecGetRmid(record) != RM_XACT_ID)
3046 109 : return false;
3047 :
3048 29 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3049 :
3050 29 : if (xact_info != XLOG_XACT_COMMIT &&
3051 : xact_info != XLOG_XACT_COMMIT_PREPARED)
3052 0 : return false;
3053 :
3054 29 : if (!getRecordTimestamp(record, &xtime))
3055 0 : return false;
3056 :
3057 29 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3058 :
3059 : /*
3060 : * Exit without arming the latch if it's already past time to apply this
3061 : * record.
3062 : */
3063 29 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
3064 29 : if (msecs <= 0)
3065 1 : return false;
3066 :
3067 : while (true)
3068 : {
3069 75 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3070 :
3071 : /* This might change recovery_min_apply_delay. */
3072 75 : ProcessStartupProcInterrupts();
3073 :
3074 75 : if (CheckForStandbyTrigger())
3075 0 : break; /* promotion triggered: stop delaying */
3076 :
3077 : /*
3078 : * Recalculate delayUntil as recovery_min_apply_delay could have
3079 : * changed while waiting in this loop.
3080 : */
3081 75 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3082 :
3083 : /*
3084 : * Wait for difference between GetCurrentTimestamp() and delayUntil.
3085 : */
3086 75 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
3087 : delayUntil);
3088 :
3089 75 : if (msecs <= 0)
3090 28 : break;
3091 :
3092 47 : elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3093 :
3094 47 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3095 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
3096 : msecs,
3097 : WAIT_EVENT_RECOVERY_APPLY_DELAY);
3098 : }
3099 28 : return true;
3100 : }
3101 :
3102 : /*
3103 : * Get the current recovery pause state, read from shared memory under info_lck.
3104 : */
3105 : RecoveryPauseState
3106 18 : GetRecoveryPauseState(void)
3107 : {
3108 : RecoveryPauseState state;
3109 :
3110 18 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3111 18 : state = XLogRecoveryCtl->recoveryPauseState;
3112 18 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3113 :
3114 18 : return state;
3115 : }
3116 :
3117 : /*
3118 : * Set the recovery pause state.
3119 : *
3120 : * If a recovery pause is requested, set the recovery pause state to
3121 : * 'pause requested' unless a pause is already requested or in effect.
3122 : * Otherwise, set it to 'not paused' to resume recovery and wake up any
3123 : * waiters. A requested pause is confirmed by ConfirmRecoveryPaused().
3124 : */
3125 : void
3126 56 : SetRecoveryPause(bool recoveryPause)
3127 : {
3128 56 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3129 :
3130 56 : if (!recoveryPause)
3131 51 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
3132 5 : else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
3133 5 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
3134 :
3135 56 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3136 :
3137 56 : if (!recoveryPause)
3138 51 : ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV); /* wake waiters */
3139 56 : }
3140 :
3141 : /*
3142 : * Confirm the recovery pause by setting the recovery pause state to
3143 : * RECOVERY_PAUSED. Does nothing unless the state is RECOVERY_PAUSE_REQUESTED.
3144 : */
3145 : static void
3146 8 : ConfirmRecoveryPaused(void)
3147 : {
3148 : /* If recovery pause is requested then set it paused */
3149 8 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3150 8 : if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
3151 4 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
3152 8 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3153 8 : }
3154 :
3155 :
3156 : /*
3157 : * Attempt to read the next XLOG record.
3158 : *
3159 : * Before first call, the reader needs to be positioned to the first record
3160 : * by calling XLogPrefetcherBeginRead().
3161 : *
3162 : * If no valid record is available, returns NULL, or fails if emode is PANIC.
3163 : * (emode must be either PANIC or LOG). In standby mode, retries until a valid
3164 : * record is available.
3165 : */
3166 : static XLogRecord *
3167 2799481 : ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
3168 : bool fetching_ckpt, TimeLineID replayTLI)
3169 : {
3170 : XLogRecord *record;
3171 2799481 : XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher);
3172 2799481 : XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3173 :
3174 : Assert(AmStartupProcess() || !IsUnderPostmaster);
3175 :
3176 : /* Pass through parameters to XLogPageRead */
3177 2799481 : private->fetching_ckpt = fetching_ckpt;
3178 2799481 : private->emode = emode;
3179 2799481 : private->randAccess = !XLogRecPtrIsValid(xlogreader->ReadRecPtr);
3180 2799481 : private->replayTLI = replayTLI;
3181 :
3182 : /* This is the first attempt to read this page. */
3183 2799481 : lastSourceFailed = false;
3184 :
3185 : for (;;)
3186 138 : {
3187 : char *errormsg;
3188 :
3189 2799619 : record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3190 2799560 : if (record == NULL)
3191 : {
3192 : /*
3193 : * When we find that WAL ends in an incomplete record, keep track
3194 : * of that record. After recovery is done, we'll write a record
3195 : * to indicate to downstream WAL readers that that portion is to
3196 : * be ignored.
3197 : *
3198 : * However, when ArchiveRecoveryRequested = true, we're going to
3199 : * switch to a new timeline at the end of recovery. We will only
3200 : * copy WAL over to the new timeline up to the end of the last
3201 : * complete record, so if we did this, we would later create an
3202 : * overwrite contrecord in the wrong place, breaking everything.
3203 : */
3204 296 : if (!ArchiveRecoveryRequested &&
3205 109 : XLogRecPtrIsValid(xlogreader->abortedRecPtr))
3206 : {
3207 11 : abortedRecPtr = xlogreader->abortedRecPtr;
3208 11 : missingContrecPtr = xlogreader->missingContrecPtr;
3209 : }
3210 :
3211 296 : if (readFile >= 0)
3212 : {
3213 271 : close(readFile);
3214 271 : readFile = -1;
3215 : }
3216 :
3217 : /*
3218 : * We only end up here without a message when XLogPageRead()
3219 : * failed - in that case we already logged something. In
3220 : * StandbyMode that only happens if we have been triggered, so we
3221 : * shouldn't loop anymore in that case.
3222 : */
3223 296 : if (errormsg)
3224 271 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3225 : (errmsg_internal("%s", errormsg) /* already translated */ ));
3226 : }
3227 :
3228 : /*
3229 : * Check page TLI is one of the expected values.
3230 : */
3231 2799264 : else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3232 : {
3233 : char fname[MAXFNAMELEN];
3234 : XLogSegNo segno;
3235 : uint32 offset; /* unsigned, to match the %u it is printed with */
3236 :
3237 0 : XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
3238 0 : offset = XLogSegmentOffset(xlogreader->latestPagePtr,
3239 : wal_segment_size);
3240 0 : XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3241 : wal_segment_size);
3242 0 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3243 : errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
3244 : xlogreader->latestPageTLI,
3245 : fname,
3246 : LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
3247 : offset));
3248 0 : record = NULL;
3249 : }
3250 :
3251 2799560 : if (record)
3252 : {
3253 : /* Great, got a record */
3254 2799422 : return record;
3255 : }
3256 : else
3257 : {
3258 : /* No valid record available from this source */
3259 296 : lastSourceFailed = true;
3260 :
3261 : /*
3262 : * If archive recovery was requested, but we were still doing
3263 : * crash recovery, switch to archive recovery and retry using the
3264 : * offline archive. We have now replayed all the valid WAL in
3265 : * pg_wal, so we are presumably now consistent.
3266 : *
3267 : * We require that there's at least some valid WAL present in
3268 : * pg_wal, however (!fetching_ckpt). We could recover using the
3269 : * WAL from the archive, even if pg_wal is completely empty, but
3270 : * we'd have no idea how far we'd have to replay to reach
3271 : * consistency. So err on the safe side and give up.
3272 : */
3273 296 : if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3274 2 : !fetching_ckpt)
3275 : {
3276 2 : ereport(DEBUG1,
3277 : (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3278 2 : InArchiveRecovery = true;
3279 2 : if (StandbyModeRequested)
3280 2 : EnableStandbyMode();
3281 :
3282 2 : SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
3283 2 : minRecoveryPoint = xlogreader->EndRecPtr;
3284 2 : minRecoveryPointTLI = replayTLI;
3285 :
3286 2 : CheckRecoveryConsistency();
3287 :
3288 : /*
3289 : * Before we retry, reset lastSourceFailed and currentSource
3290 : * so that we will check the archive next.
3291 : */
3292 2 : lastSourceFailed = false;
3293 2 : currentSource = XLOG_FROM_ANY;
3294 :
3295 138 : continue;
3296 : }
3297 :
3298 : /* In standby mode, loop back to retry. Otherwise, give up. */
3299 294 : if (StandbyMode && !CheckForStandbyTrigger())
3300 136 : continue;
3301 : else
3302 158 : return NULL;
3303 : }
3304 : }
3305 : }
3306 :
3307 : /*
3308 : * Read the XLOG page containing targetPagePtr into readBuf (if not read
3309 : * already). Returns number of bytes read, if the page is read successfully,
3310 : * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3311 : * but only if they have not been previously reported.
3312 : *
3313 : * See XLogReaderRoutine.page_read for more details.
3314 : *
3315 : * While prefetching, xlogreader->nonblocking may be set. In that case,
3316 : * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3317 : *
3318 : * This is responsible for restoring files from archive as needed, as well
3319 : * as for waiting for the requested WAL record to arrive in standby mode.
3320 : *
3321 : * xlogreader->private_data->emode specifies the log level used for reporting
3322 : * "file not found" or "end of WAL" situations in archive recovery, or in
3323 : * standby mode when promotion is triggered. If set to WARNING or below,
3324 : * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3325 : * levels the ereport() won't return.
3326 : *
3327 : * In standby mode, if after a successful return of XLogPageRead() the
3328 : * caller finds the record it's interested in to be broken, it should
3329 : * ereport the error with the level determined by
3330 : * emode_for_corrupt_record(), and then set lastSourceFailed
3331 : * and call XLogPageRead() again with the same arguments. This lets
3332 : * XLogPageRead() try fetching the record from another source, or
3333 : * sleep and retry.
3334 : */
3335 : static int
3336 1454158 : XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
3337 : XLogRecPtr targetRecPtr, char *readBuf)
3338 : {
3339 1454158 : XLogPageReadPrivate *private =
3340 : (XLogPageReadPrivate *) xlogreader->private_data;
3341 1454158 : int emode = private->emode;
3342 : uint32 targetPageOff;
3343 : XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3344 : int r;
3345 : instr_time io_start;
3346 :
3347 : Assert(AmStartupProcess() || !IsUnderPostmaster);
3348 :
3349 1454158 : XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3350 1454158 : targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3351 :
3352 : /*
3353 : * See if we need to switch to a new segment because the requested record
3354 : * is not in the currently open one.
3355 : */
3356 1454158 : if (readFile >= 0 &&
3357 1452254 : !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3358 : {
3359 : /*
3360 : * Request a restartpoint if we've replayed too much xlog since the
3361 : * last one.
3362 : */
3363 1634 : if (ArchiveRecoveryRequested && IsUnderPostmaster)
3364 : {
3365 1617 : if (XLogCheckpointNeeded(readSegNo))
3366 : {
3367 1495 : (void) GetRedoRecPtr();
3368 1495 : if (XLogCheckpointNeeded(readSegNo))
3369 1488 : RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
3370 : }
3371 : }
3372 :
3373 1634 : close(readFile);
3374 1634 : readFile = -1;
3375 1634 : readSource = XLOG_FROM_ANY;
3376 : }
3377 :
3378 1454158 : XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3379 :
3380 1454163 : retry:
3381 : /* See if we need to retrieve more data */
3382 1454163 : if (readFile < 0 ||
3383 1450620 : (readSource == XLOG_FROM_STREAM &&
3384 1438532 : flushedUpto < targetPagePtr + reqLen))
3385 : {
3386 13935 : if (readFile >= 0 &&
3387 10392 : xlogreader->nonblocking &&
3388 5104 : readSource == XLOG_FROM_STREAM &&
3389 5104 : flushedUpto < targetPagePtr + reqLen)
3390 5104 : return XLREAD_WOULDBLOCK;
3391 :
3392 8772 : switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3393 8831 : private->randAccess,
3394 8831 : private->fetching_ckpt,
3395 : targetRecPtr,
3396 : private->replayTLI,
3397 : xlogreader->EndRecPtr,
3398 8831 : xlogreader->nonblocking))
3399 : {
3400 581 : case XLREAD_WOULDBLOCK:
3401 581 : return XLREAD_WOULDBLOCK;
3402 48 : case XLREAD_FAIL:
3403 48 : if (readFile >= 0)
3404 0 : close(readFile);
3405 48 : readFile = -1;
3406 48 : readLen = 0;
3407 48 : readSource = XLOG_FROM_ANY;
3408 48 : return XLREAD_FAIL;
3409 8143 : case XLREAD_SUCCESS:
3410 8143 : break;
3411 : }
3412 : }
3413 :
3414 : /*
3415 : * At this point, we have the right segment open and if we're streaming we
3416 : * know the requested record is in it.
3417 : */
3418 : Assert(readFile != -1);
3419 :
3420 : /*
3421 : * If the current segment is being streamed from the primary, calculate
3422 : * how much of the current page we have received already. We know the
3423 : * requested record has been received, but this is for the benefit of
3424 : * future calls, to allow quick exit at the top of this function.
3425 : */
3426 1448371 : if (readSource == XLOG_FROM_STREAM)
3427 : {
3428 1434630 : if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3429 1431695 : readLen = XLOG_BLCKSZ;
3430 : else
3431 2935 : readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
3432 : targetPageOff;
3433 : }
3434 : else
3435 13741 : readLen = XLOG_BLCKSZ;
3436 :
3437 : /* Read the requested page */
3438 1448371 : readOff = targetPageOff;
3439 :
3440 : /* Measure I/O timing when reading segment */
3441 1448371 : io_start = pgstat_prepare_io_time(track_wal_io_timing);
3442 :
3443 1448371 : pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3444 1448371 : r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (pgoff_t) readOff);
3445 1448371 : if (r != XLOG_BLCKSZ)
3446 : {
3447 : char fname[MAXFNAMELEN];
3448 0 : int save_errno = errno;
3449 :
3450 0 : pgstat_report_wait_end();
3451 :
3452 0 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3453 : io_start, 1, r);
3454 :
3455 0 : XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
3456 0 : if (r < 0)
3457 : {
3458 0 : errno = save_errno;
3459 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3460 : (errcode_for_file_access(),
3461 : errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
3462 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3463 : readOff)));
3464 : }
3465 : else
3466 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3467 : (errcode(ERRCODE_DATA_CORRUPTED),
3468 : errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
3469 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3470 : readOff, r, (Size) XLOG_BLCKSZ)));
3471 0 : goto next_record_is_invalid;
3472 : }
3473 1448371 : pgstat_report_wait_end();
3474 :
3475 1448371 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3476 : io_start, 1, r);
3477 :
3478 : Assert(targetSegNo == readSegNo);
3479 : Assert(targetPageOff == readOff);
3480 : Assert(reqLen <= readLen);
3481 :
3482 1448371 : xlogreader->seg.ws_tli = curFileTLI;
3483 :
3484 : /*
3485 : * Check the page header immediately, so that we can retry immediately if
3486 : * it's not valid. This may seem unnecessary, because ReadPageInternal()
3487 : * validates the page header anyway, and would propagate the failure up to
3488 : * ReadRecord(), which would retry. However, there's a corner case with
3489 : * continuation records, if a record is split across two pages such that
3490 : * we would need to read the two pages from different sources across two
3491 : * WAL segments.
3492 : *
3493 : * The first page is only available locally, in pg_wal, because it's
3494 : * already been recycled on the primary. The second page, however, is not
3495 : * present in pg_wal, and we should stream it from the primary. There is a
3496 : * recycled WAL segment present in pg_wal, with garbage contents, however.
3497 : * We would read the first page from the local WAL segment, but when
3498 : * reading the second page, we would read the bogus, recycled, WAL
3499 : * segment. If we didn't catch that case here, we would never recover,
3500 : * because ReadRecord() would retry reading the whole record from the
3501 : * beginning.
3502 : *
3503 : * Of course, this only catches errors in the page header, which is what
3504 : * happens in the case of a recycled WAL segment. Other kinds of errors or
3505 : * corruption still has the same problem. But this at least fixes the
3506 : * common case, which can happen as part of normal operation.
3507 : *
3508 : * Validating the page header is cheap enough that doing it twice
3509 : * shouldn't be a big deal from a performance point of view.
3510 : *
3511 : * When not in standby mode, an invalid page header should cause recovery
3512 : * to end, not retry reading the page, so we don't need to validate the
3513 : * page header here for the retry. Instead, ReadPageInternal() is
3514 : * responsible for the validation.
3515 : */
3516 1448371 : if (StandbyMode &&
3517 1438382 : (targetPagePtr % wal_segment_size) == 0 &&
3518 1414 : !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3519 : {
3520 : /*
3521 : * Emit this error right now then retry this page immediately. Use
3522 : * errmsg_internal() because the message was already translated.
3523 : */
3524 6 : if (xlogreader->errormsg_buf[0])
3525 6 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3526 : (errmsg_internal("%s", xlogreader->errormsg_buf)));
3527 :
3528 : /* reset any error XLogReaderValidatePageHeader() might have set */
3529 6 : XLogReaderResetError(xlogreader);
3530 6 : goto next_record_is_invalid;
3531 : }
3532 :
3533 1448365 : return readLen;
3534 :
3535 6 : next_record_is_invalid:
3536 :
3537 : /*
3538 : * If we're reading ahead, give up fast. Retries and error reporting will
3539 : * be handled by a later read when recovery catches up to this point.
3540 : */
3541 6 : if (xlogreader->nonblocking)
3542 1 : return XLREAD_WOULDBLOCK;
3543 :
3544 5 : lastSourceFailed = true;
3545 :
3546 5 : if (readFile >= 0)
3547 5 : close(readFile);
3548 5 : readFile = -1;
3549 5 : readLen = 0;
3550 5 : readSource = XLOG_FROM_ANY;
3551 :
3552 : /* In standby-mode, keep trying */
3553 5 : if (StandbyMode)
3554 5 : goto retry;
3555 : else
3556 0 : return XLREAD_FAIL;
3557 : }
3558 :
3559 : /*
3560 : * Open the WAL segment containing WAL location 'RecPtr'.
3561 : *
3562 : * The segment can be fetched via restore_command, or via walreceiver having
3563 : * streamed the record, or it can already be present in pg_wal. Checking
3564 : * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3565 : * too, in case someone copies a new segment directly to pg_wal. That is not
3566 : * documented or recommended, though.
3567 : *
3568 : * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3569 : * prepare to read WAL starting from RedoStartLSN after this.
3570 : *
3571 : * 'RecPtr' might not point to the beginning of the record we're interested
3572 : * in, it might also point to the page or segment header. In that case,
3573 : * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3574 : * used to decide which timeline to stream the requested WAL from.
3575 : *
3576 : * 'replayLSN' is the current replay LSN, so that if we scan for new
3577 : * timelines, we can reject a switch to a timeline that branched off before
3578 : * this point.
3579 : *
3580 : * If the record is not immediately available, the function returns XLREAD_FAIL
3581 : * if we're not in standby mode. In standby mode, the function waits for it to
3582 : * become available.
3583 : *
3584 : * When the requested record becomes available, the function opens the file
3585 : * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3586 : * of standby mode is triggered by the user, and there is no more WAL
3587 : * available, returns XLREAD_FAIL.
3588 : *
3589 : * If nonblocking is true, then give up immediately if we can't satisfy the
3590 : * request, returning XLREAD_WOULDBLOCK instead of waiting.
3591 : */
3592 : static XLogPageReadResult
3593 8831 : WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3594 : bool fetching_ckpt, XLogRecPtr tliRecPtr,
3595 : TimeLineID replayTLI, XLogRecPtr replayLSN,
3596 : bool nonblocking)
3597 : {
3598 : static TimestampTz last_fail_time = 0;
3599 : TimestampTz now;
3600 8831 : bool streaming_reply_sent = false;
3601 :
3602 : /*-------
3603 : * Standby mode is implemented by a state machine:
3604 : *
3605 : * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3606 : * pg_wal (XLOG_FROM_PG_WAL)
3607 : * 2. Check for promotion trigger request
3608 : * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3609 : * 4. Rescan timelines
3610 : * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3611 : *
3612 : * Failure to read from the current source advances the state machine to
3613 : * the next state.
3614 : *
3615 : * 'currentSource' indicates the current state. There are no currentSource
3616 : * values for "check trigger", "rescan timelines", and "sleep" states,
3617 : * those actions are taken when reading from the previous source fails, as
3618 : * part of advancing to the next state.
3619 : *
3620 : * If standby mode is turned off while reading WAL from stream, we move
3621 : * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3622 : * the files (which would be required at end of recovery, e.g., timeline
3623 : * history file) from archive or pg_wal. We don't need to kill WAL receiver
3624 : * here because it's already stopped when standby mode is turned off at
3625 : * the end of recovery.
3626 : *-------
3627 : */
3628 8831 : if (!InArchiveRecovery)
3629 1003 : currentSource = XLOG_FROM_PG_WAL;
3630 7828 : else if (currentSource == XLOG_FROM_ANY ||
3631 7703 : (!StandbyMode && currentSource == XLOG_FROM_STREAM))
3632 : {
3633 125 : lastSourceFailed = false;
3634 125 : currentSource = XLOG_FROM_ARCHIVE;
3635 : }
3636 :
3637 : for (;;)
3638 7263 : {
3639 16094 : XLogSource oldSource = currentSource;
3640 16094 : bool startWalReceiver = false;
3641 :
3642 : /*
3643 : * First check if we failed to read from the current source, and
3644 : * advance the state machine if so. The failure to read might've
3645 : * happened outside this function, e.g when a CRC check fails on a
3646 : * record, or within this loop.
3647 : */
3648 16094 : if (lastSourceFailed)
3649 : {
3650 : /*
3651 : * Don't allow any retry loops to occur during nonblocking
3652 : * readahead. Let the caller process everything that has been
3653 : * decoded already first.
3654 : */
3655 532 : if (nonblocking)
3656 80 : return XLREAD_WOULDBLOCK;
3657 :
3658 452 : switch (currentSource)
3659 : {
3660 273 : case XLOG_FROM_ARCHIVE:
3661 : case XLOG_FROM_PG_WAL:
3662 :
3663 : /*
3664 : * Check to see if promotion is requested. Note that we do
3665 : * this only after failure, so when you promote, we still
3666 : * finish replaying as much as we can from archive and
3667 : * pg_wal before failover.
3668 : */
3669 273 : if (StandbyMode && CheckForStandbyTrigger())
3670 : {
3671 22 : XLogShutdownWalRcv();
3672 22 : return XLREAD_FAIL;
3673 : }
3674 :
3675 : /*
3676 : * Not in standby mode, and we've now tried the archive
3677 : * and pg_wal.
3678 : */
3679 251 : if (!StandbyMode)
3680 26 : return XLREAD_FAIL;
3681 :
3682 : /*
3683 : * Move to XLOG_FROM_STREAM state, and set to start a
3684 : * walreceiver if necessary.
3685 : */
3686 225 : currentSource = XLOG_FROM_STREAM;
3687 225 : startWalReceiver = true;
3688 225 : break;
3689 :
3690 179 : case XLOG_FROM_STREAM:
3691 :
3692 : /*
3693 : * Failure while streaming. Most likely, we got here
3694 : * because streaming replication was terminated, or
3695 : * promotion was triggered. But we also get here if we
3696 : * find an invalid record in the WAL streamed from the
3697 : * primary, in which case something is seriously wrong.
3698 : * There's little chance that the problem will just go
3699 : * away, but PANIC is not good for availability either,
3700 : * especially in hot standby mode. So, we treat that the
3701 : * same as disconnection, and retry from archive/pg_wal
3702 : * again. The WAL in the archive should be identical to
3703 : * what was streamed, so it's unlikely that it helps, but
3704 : * one can hope...
3705 : */
3706 :
3707 : /*
3708 : * We should be able to move to XLOG_FROM_STREAM only in
3709 : * standby mode.
3710 : */
3711 : Assert(StandbyMode);
3712 :
3713 : /*
3714 : * Before we leave XLOG_FROM_STREAM state, make sure that
3715 : * walreceiver is not active, so that it won't overwrite
3716 : * WAL that we restore from archive.
3717 : *
3718 : * If walreceiver is actively streaming (or attempting to
3719 : * connect), we must shut it down. However, if it's
3720 : * already in WAITING state (e.g., due to timeline
3721 : * divergence), we only need to reset the install flag to
3722 : * allow archive restoration.
3723 : */
3724 179 : if (WalRcvStreaming())
3725 34 : XLogShutdownWalRcv();
3726 : else
3727 : {
3728 : /*
3729 : * WALRCV_STOPPING state is a transient state while
3730 : * the startup process is in ShutdownWalRcv(). It
3731 : * should never appear here since we would be waiting
3732 : * for the walreceiver to reach WALRCV_STOPPED in that
3733 : * case.
3734 : */
3735 : Assert(WalRcvGetState() != WALRCV_STOPPING);
3736 145 : ResetInstallXLogFileSegmentActive();
3737 : }
3738 :
3739 : /*
3740 : * Before we sleep, re-scan for possible new timelines if
3741 : * we were requested to recover to the latest timeline.
3742 : */
3743 179 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
3744 : {
3745 179 : if (rescanLatestTimeLine(replayTLI, replayLSN))
3746 : {
3747 7 : currentSource = XLOG_FROM_ARCHIVE;
3748 7 : break;
3749 : }
3750 : }
3751 :
3752 : /*
3753 : * XLOG_FROM_STREAM is the last state in our state
3754 : * machine, so we've exhausted all the options for
3755 : * obtaining the requested WAL. We're going to loop back
3756 : * and retry from the archive, but if it hasn't been long
3757 : * since last attempt, sleep wal_retrieve_retry_interval
3758 : * milliseconds to avoid busy-waiting.
3759 : */
3760 172 : now = GetCurrentTimestamp();
3761 172 : if (!TimestampDifferenceExceeds(last_fail_time, now,
3762 : wal_retrieve_retry_interval))
3763 : {
3764 : long wait_time;
3765 :
3766 184 : wait_time = wal_retrieve_retry_interval -
3767 92 : TimestampDifferenceMilliseconds(last_fail_time, now);
3768 :
3769 92 : elog(LOG, "waiting for WAL to become available at %X/%08X",
3770 : LSN_FORMAT_ARGS(RecPtr));
3771 :
3772 : /* Do background tasks that might benefit us later. */
3773 92 : KnownAssignedTransactionIdsIdleMaintenance();
3774 :
3775 92 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3776 : WL_LATCH_SET | WL_TIMEOUT |
3777 : WL_EXIT_ON_PM_DEATH,
3778 : wait_time,
3779 : WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3780 92 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3781 92 : now = GetCurrentTimestamp();
3782 :
3783 : /* Handle interrupt signals of startup process */
3784 92 : ProcessStartupProcInterrupts();
3785 : }
3786 158 : last_fail_time = now;
3787 158 : currentSource = XLOG_FROM_ARCHIVE;
3788 158 : break;
3789 :
3790 0 : default:
3791 0 : elog(ERROR, "unexpected WAL source %d", currentSource);
3792 : }
3793 : }
3794 15562 : else if (currentSource == XLOG_FROM_PG_WAL)
3795 : {
3796 : /*
3797 : * We just successfully read a file in pg_wal. We prefer files in
3798 : * the archive over ones in pg_wal, so try the next file again
3799 : * from the archive first.
3800 : */
3801 999 : if (InArchiveRecovery)
3802 0 : currentSource = XLOG_FROM_ARCHIVE;
3803 : }
3804 :
3805 15952 : if (currentSource != oldSource)
3806 390 : elog(DEBUG2, "switched WAL source from %s to %s after %s",
3807 : xlogSourceNames[oldSource], xlogSourceNames[currentSource],
3808 : lastSourceFailed ? "failure" : "success");
3809 :
3810 : /*
3811 : * We've now handled possible failure. Try to read from the chosen
3812 : * source.
3813 : */
3814 15952 : lastSourceFailed = false;
3815 :
3816 15952 : switch (currentSource)
3817 : {
3818 1843 : case XLOG_FROM_ARCHIVE:
3819 : case XLOG_FROM_PG_WAL:
3820 :
3821 : /*
3822 : * WAL receiver must not be running when reading WAL from
3823 : * archive or pg_wal.
3824 : */
3825 : Assert(!WalRcvStreaming());
3826 :
3827 : /* Close any old file we might have open. */
3828 1843 : if (readFile >= 0)
3829 : {
3830 87 : close(readFile);
3831 87 : readFile = -1;
3832 : }
3833 : /* Reset curFileTLI if random fetch. */
3834 1843 : if (randAccess)
3835 1166 : curFileTLI = 0;
3836 :
3837 : /*
3838 : * Try to restore the file from archive, or read an existing
3839 : * file from pg_wal.
3840 : */
3841 1843 : readFile = XLogFileReadAnyTLI(readSegNo,
3842 1843 : currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
3843 : currentSource);
3844 1842 : if (readFile >= 0)
3845 1653 : return XLREAD_SUCCESS; /* success! */
3846 :
3847 : /*
3848 : * Nope, not found in archive or pg_wal.
3849 : */
3850 189 : lastSourceFailed = true;
3851 189 : break;
3852 :
3853 14109 : case XLOG_FROM_STREAM:
3854 : {
3855 : bool havedata;
3856 :
3857 : /*
3858 : * We should be able to move to XLOG_FROM_STREAM only in
3859 : * standby mode.
3860 : */
3861 : Assert(StandbyMode);
3862 :
3863 : /*
3864 : * First, shutdown walreceiver if its restart has been
3865 : * requested -- but no point if we're already slated for
3866 : * starting it.
3867 : */
3868 14109 : if (pendingWalRcvRestart && !startWalReceiver)
3869 : {
3870 7 : XLogShutdownWalRcv();
3871 :
3872 : /*
3873 : * Re-scan for possible new timelines if we were
3874 : * requested to recover to the latest timeline.
3875 : */
3876 7 : if (recoveryTargetTimeLineGoal ==
3877 : RECOVERY_TARGET_TIMELINE_LATEST)
3878 7 : rescanLatestTimeLine(replayTLI, replayLSN);
3879 :
3880 7 : startWalReceiver = true;
3881 : }
3882 14109 : pendingWalRcvRestart = false;
3883 :
3884 : /*
3885 : * Launch walreceiver if needed.
3886 : *
3887 : * If fetching_ckpt is true, RecPtr points to the initial
3888 : * checkpoint location. In that case, we use RedoStartLSN
3889 : * as the streaming start position instead of RecPtr, so
3890 : * that when we later jump backwards to start redo at
3891 : * RedoStartLSN, we will have the logs streamed already.
3892 : */
3893 14109 : if (startWalReceiver &&
3894 232 : PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3895 : {
3896 : XLogRecPtr ptr;
3897 : TimeLineID tli;
3898 :
3899 194 : if (fetching_ckpt)
3900 : {
3901 0 : ptr = RedoStartLSN;
3902 0 : tli = RedoStartTLI;
3903 : }
3904 : else
3905 : {
3906 194 : ptr = RecPtr;
3907 :
3908 : /*
3909 : * Use the record begin position to determine the
3910 : * TLI, rather than the position we're reading.
3911 : */
3912 194 : tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3913 :
3914 194 : if (curFileTLI > 0 && tli < curFileTLI)
3915 0 : elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3916 : LSN_FORMAT_ARGS(tliRecPtr),
3917 : tli, curFileTLI);
3918 : }
3919 194 : curFileTLI = tli;
3920 194 : SetInstallXLogFileSegmentActive();
3921 194 : RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
3922 : PrimarySlotName,
3923 : wal_receiver_create_temp_slot);
3924 194 : flushedUpto = InvalidXLogRecPtr;
3925 : }
3926 :
3927 : /*
3928 : * Check if WAL receiver is active or wait to start up.
3929 : */
3930 14109 : if (!WalRcvStreaming())
3931 : {
3932 145 : lastSourceFailed = true;
3933 145 : break;
3934 : }
3935 :
3936 : /*
3937 : * Walreceiver is active, so see if new data has arrived.
3938 : *
3939 : * We only advance XLogReceiptTime when we obtain fresh
3940 : * WAL from walreceiver and observe that we had already
3941 : * processed everything before the most recent "chunk"
3942 : * that it flushed to disk. In steady state where we are
3943 : * keeping up with the incoming data, XLogReceiptTime will
3944 : * be updated on each cycle. When we are behind,
3945 : * XLogReceiptTime will not advance, so the grace time
3946 : * allotted to conflicting queries will decrease.
3947 : */
3948 13964 : if (RecPtr < flushedUpto)
3949 1922 : havedata = true;
3950 : else
3951 : {
3952 : XLogRecPtr latestChunkStart;
3953 :
3954 12042 : flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3955 12042 : if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3956 : {
3957 5890 : havedata = true;
3958 5890 : if (latestChunkStart <= RecPtr)
3959 : {
3960 3632 : XLogReceiptTime = GetCurrentTimestamp();
3961 3632 : SetCurrentChunkStartTime(XLogReceiptTime);
3962 : }
3963 : }
3964 : else
3965 6152 : havedata = false;
3966 : }
3967 13964 : if (havedata)
3968 : {
3969 : /*
3970 : * Great, streamed far enough. Open the file if it's
3971 : * not open already. Also read the timeline history
3972 : * file if we haven't initialized timeline history
3973 : * yet; it should be streamed over and present in
3974 : * pg_wal by now. Use XLOG_FROM_STREAM so that source
3975 : * info is set correctly and XLogReceiptTime isn't
3976 : * changed.
3977 : *
3978 : * NB: We must set readTimeLineHistory based on
3979 : * recoveryTargetTLI, not receiveTLI. Normally they'll
3980 : * be the same, but if recovery_target_timeline is
3981 : * 'latest' and archiving is configured, then it's
3982 : * possible that we managed to retrieve one or more
3983 : * new timeline history files from the archive,
3984 : * updating recoveryTargetTLI.
3985 : */
3986 7812 : if (readFile < 0)
3987 : {
3988 1322 : if (!expectedTLEs)
3989 0 : expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
3990 1322 : readFile = XLogFileRead(readSegNo, receiveTLI,
3991 : XLOG_FROM_STREAM, false);
3992 : Assert(readFile >= 0);
3993 : }
3994 : else
3995 : {
3996 : /* just make sure source info is correct... */
3997 6490 : readSource = XLOG_FROM_STREAM;
3998 6490 : XLogReceiptSource = XLOG_FROM_STREAM;
3999 6490 : return XLREAD_SUCCESS;
4000 : }
4001 1322 : break;
4002 : }
4003 :
4004 : /* In nonblocking mode, return rather than sleeping. */
4005 6152 : if (nonblocking)
4006 501 : return XLREAD_WOULDBLOCK;
4007 :
4008 : /*
4009 : * Data not here yet. Check for trigger, then wait for
4010 : * walreceiver to wake us up when new WAL arrives.
4011 : */
4012 5651 : if (CheckForStandbyTrigger())
4013 : {
4014 : /*
4015 : * Note that we don't return XLREAD_FAIL immediately
4016 : * here. After being triggered, we still want to
4017 : * replay all the WAL that was already streamed. It's
4018 : * in pg_wal now, so we just treat this as a failure,
4019 : * and the state machine will move on to replay the
4020 : * streamed WAL from pg_wal, and then recheck the
4021 : * trigger and exit replay.
4022 : */
4023 34 : lastSourceFailed = true;
4024 34 : break;
4025 : }
4026 :
4027 : /*
4028 : * Since we have replayed everything we have received so
4029 : * far and are about to start waiting for more WAL, let's
4030 : * tell the upstream server our replay location now so
4031 : * that pg_stat_replication doesn't show stale
4032 : * information.
4033 : */
4034 5617 : if (!streaming_reply_sent)
4035 : {
4036 4336 : WalRcvForceReply();
4037 4336 : streaming_reply_sent = true;
4038 : }
4039 :
4040 : /* Do any background tasks that might benefit us later. */
4041 5617 : KnownAssignedTransactionIdsIdleMaintenance();
4042 :
4043 : /* Update pg_stat_recovery_prefetch before sleeping. */
4044 5617 : XLogPrefetcherComputeStats(xlogprefetcher);
4045 :
4046 : /*
4047 : * Wait for more WAL to arrive, when we will be woken
4048 : * immediately by the WAL receiver.
4049 : */
4050 5617 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
4051 : WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
4052 : -1L,
4053 : WAIT_EVENT_RECOVERY_WAL_STREAM);
4054 5617 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4055 5617 : break;
4056 : }
4057 :
4058 0 : default:
4059 0 : elog(ERROR, "unexpected WAL source %d", currentSource);
4060 : }
4061 :
4062 : /*
4063 : * Check for recovery pause here so that we can confirm more quickly
4064 : * that a requested pause has actually taken effect.
4065 : */
4066 7307 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
4067 : RECOVERY_NOT_PAUSED)
4068 3 : recoveryPausesHere(false);
4069 :
4070 : /*
4071 : * This possibly-long loop needs to handle interrupts of startup
4072 : * process.
4073 : */
4074 7307 : ProcessStartupProcInterrupts();
4075 : }
4076 :
4077 : return XLREAD_FAIL; /* not reached */
4078 : }
4079 :
4080 :
4081 : /*
4082 : * Determine what log level should be used to report a corrupt WAL record
4083 : * in the current WAL page, previously read by XLogPageRead().
4084 : *
4085 : * 'emode' is the error mode that would be used to report a file-not-found
4086 : * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4087 : * we're retrying the exact same record that we've tried previously, only
4088 : * complain the first time to keep the noise down. However, we only do when
4089 : * reading from pg_wal, because we don't expect any invalid records in archive
4090 : * or in records streamed from the primary. Files in the archive should be complete,
4091 : * and we should never hit the end of WAL because we stop and wait for more WAL
4092 : * to arrive before replaying it.
4093 : *
4094 : * NOTE: This function remembers the RecPtr value it was last called with,
4095 : * to suppress repeated messages about the same record. Only call this when
4096 : * you are about to ereport(), or you might cause a later message to be
4097 : * erroneously suppressed.
4098 : */
4099 : static int
4100 277 : emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
4101 : {
4102 : static XLogRecPtr lastComplaint = InvalidXLogRecPtr;
4103 :
4104 277 : if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4105 : {
4106 273 : if (RecPtr == lastComplaint)
4107 68 : emode = DEBUG1;
4108 : else
4109 205 : lastComplaint = RecPtr;
4110 : }
4111 277 : return emode;
4112 : }
4113 :
4114 :
4115 : /*
4116 : * Subroutine to try to fetch and validate a prior checkpoint record.
4117 : */
4118 : static XLogRecord *
4119 1006 : ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
4120 : TimeLineID replayTLI)
4121 : {
4122 : XLogRecord *record;
4123 : uint8 info;
4124 :
4125 : Assert(xlogreader != NULL);
4126 :
4127 1006 : if (!XRecOffIsValid(RecPtr))
4128 : {
4129 0 : ereport(LOG,
4130 : (errmsg("invalid checkpoint location")));
4131 0 : return NULL;
4132 : }
4133 :
4134 1006 : XLogPrefetcherBeginRead(xlogprefetcher, RecPtr);
4135 1006 : record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4136 :
4137 1006 : if (record == NULL)
4138 : {
4139 0 : ereport(LOG,
4140 : (errmsg("invalid checkpoint record")));
4141 0 : return NULL;
4142 : }
4143 1006 : if (record->xl_rmid != RM_XLOG_ID)
4144 : {
4145 0 : ereport(LOG,
4146 : (errmsg("invalid resource manager ID in checkpoint record")));
4147 0 : return NULL;
4148 : }
4149 1006 : info = record->xl_info & ~XLR_INFO_MASK;
4150 1006 : if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4151 : info != XLOG_CHECKPOINT_ONLINE)
4152 : {
4153 0 : ereport(LOG,
4154 : (errmsg("invalid xl_info in checkpoint record")));
4155 0 : return NULL;
4156 : }
4157 1006 : if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
4158 : {
4159 0 : ereport(LOG,
4160 : (errmsg("invalid length of checkpoint record")));
4161 0 : return NULL;
4162 : }
4163 1006 : return record;
4164 : }
4165 :
4166 : /*
4167 : * Scan for new timelines that might have appeared in the archive since we
4168 : * started recovery.
4169 : *
4170 : * If there are any, the function changes recovery target TLI to the latest
4171 : * one and returns 'true'.
4172 : */
4173 : static bool
4174 186 : rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
4175 : {
4176 : List *newExpectedTLEs;
4177 : bool found;
4178 : ListCell *cell;
4179 : TimeLineID newtarget;
4180 186 : TimeLineID oldtarget = recoveryTargetTLI;
4181 186 : TimeLineHistoryEntry *currentTle = NULL;
4182 :
4183 186 : newtarget = findNewestTimeLine(recoveryTargetTLI);
4184 186 : if (newtarget == recoveryTargetTLI)
4185 : {
4186 : /* No new timelines found */
4187 179 : return false;
4188 : }
4189 :
4190 : /*
4191 : * Determine the list of expected TLIs for the new TLI
4192 : */
4193 :
4194 7 : newExpectedTLEs = readTimeLineHistory(newtarget);
4195 :
4196 : /*
4197 : * If the current timeline is not part of the history of the new timeline,
4198 : * we cannot proceed to it.
4199 : */
4200 7 : found = false;
4201 14 : foreach(cell, newExpectedTLEs)
4202 : {
4203 14 : currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4204 :
4205 14 : if (currentTle->tli == recoveryTargetTLI)
4206 : {
4207 7 : found = true;
4208 7 : break;
4209 : }
4210 : }
4211 7 : if (!found)
4212 : {
4213 0 : ereport(LOG,
4214 : (errmsg("new timeline %u is not a child of database system timeline %u",
4215 : newtarget,
4216 : replayTLI)));
4217 0 : return false;
4218 : }
4219 :
4220 : /*
4221 : * The current timeline was found in the history file, but check that the
4222 : * next timeline was forked off from it *after* the current recovery
4223 : * location.
4224 : */
4225 7 : if (currentTle->end < replayLSN)
4226 : {
4227 0 : ereport(LOG,
4228 : errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
4229 : newtarget,
4230 : replayTLI,
4231 : LSN_FORMAT_ARGS(replayLSN)));
4232 0 : return false;
4233 : }
4234 :
4235 : /* The new timeline history seems valid. Switch target */
4236 7 : recoveryTargetTLI = newtarget;
4237 7 : list_free_deep(expectedTLEs);
4238 7 : expectedTLEs = newExpectedTLEs;
4239 :
4240 : /*
4241 : * As in StartupXLOG(), try to ensure we have all the history files
4242 : * between the old target and new target in pg_wal.
4243 : */
4244 7 : restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4245 :
4246 7 : ereport(LOG,
4247 : (errmsg("new target timeline is %u",
4248 : recoveryTargetTLI)));
4249 :
4250 7 : return true;
4251 : }
4252 :
4253 :
4254 : /*
4255 : * Open a logfile segment for reading (during recovery).
4256 : *
4257 : * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4258 : * Otherwise, it's assumed to be already available in pg_wal.
4259 : */
4260 : static int
4261 3666 : XLogFileRead(XLogSegNo segno, TimeLineID tli,
4262 : XLogSource source, bool notfoundOk)
4263 : {
4264 : char xlogfname[MAXFNAMELEN];
4265 : char activitymsg[MAXFNAMELEN + 16];
4266 : char path[MAXPGPATH];
4267 : int fd;
4268 :
4269 3666 : XLogFileName(xlogfname, tli, segno, wal_segment_size);
4270 :
4271 3666 : switch (source)
4272 : {
4273 855 : case XLOG_FROM_ARCHIVE:
4274 : /* Report recovery progress in PS display */
4275 855 : snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4276 : xlogfname);
4277 855 : set_ps_display(activitymsg);
4278 :
4279 855 : if (!RestoreArchivedFile(path, xlogfname,
4280 : "RECOVERYXLOG",
4281 : wal_segment_size,
4282 : InRedo))
4283 490 : return -1;
4284 364 : break;
4285 :
4286 2811 : case XLOG_FROM_PG_WAL:
4287 : case XLOG_FROM_STREAM:
4288 2811 : XLogFilePath(path, tli, segno, wal_segment_size);
4289 2811 : break;
4290 :
4291 0 : default:
4292 0 : elog(ERROR, "invalid XLogFileRead source %d", source);
4293 : }
4294 :
4295 : /*
4296 : * If the segment was fetched from archival storage, replace the existing
4297 : * xlog segment (if any) with the archival version.
4298 : */
4299 3175 : if (source == XLOG_FROM_ARCHIVE)
4300 : {
4301 : Assert(!IsInstallXLogFileSegmentActive());
4302 364 : KeepFileRestoredFromArchive(path, xlogfname);
4303 :
4304 : /*
4305 : * Set path to point at the new file in pg_wal.
4306 : */
4307 364 : snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4308 : }
4309 :
4310 3175 : fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4311 3175 : if (fd >= 0)
4312 : {
4313 : /* Success! */
4314 2975 : curFileTLI = tli;
4315 :
4316 : /* Report recovery progress in PS display */
4317 2975 : snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4318 : xlogfname);
4319 2975 : set_ps_display(activitymsg);
4320 :
4321 : /* Track source of data in assorted state variables */
4322 2975 : readSource = source;
4323 2975 : XLogReceiptSource = source;
4324 : /* In FROM_STREAM case, caller tracks receipt time, not me */
4325 2975 : if (source != XLOG_FROM_STREAM)
4326 1653 : XLogReceiptTime = GetCurrentTimestamp();
4327 :
4328 2975 : return fd;
4329 : }
4330 200 : if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4331 0 : ereport(PANIC,
4332 : (errcode_for_file_access(),
4333 : errmsg("could not open file \"%s\": %m", path)));
4334 200 : return -1;
4335 : }
4336 :
4337 : /*
4338 : * Open a logfile segment for reading (during recovery).
4339 : *
4340 : * This version searches for the segment with any TLI listed in expectedTLEs.
4341 : */
4342 : static int
4343 1843 : XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
4344 : {
4345 : char path[MAXPGPATH];
4346 : ListCell *cell;
4347 : int fd;
4348 : List *tles;
4349 :
4350 : /*
4351 : * Loop looking for a suitable timeline ID: we might need to read any of
4352 : * the timelines listed in expectedTLEs.
4353 : *
4354 : * We expect curFileTLI on entry to be the TLI of the preceding file in
4355 : * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4356 : * to go backwards; this prevents us from picking up the wrong file when a
4357 : * parent timeline extends to higher segment numbers than the child we
4358 : * want to read.
4359 : *
4360 : * If we haven't read the timeline history file yet, read it now, so that
4361 : * we know which TLIs to scan. We don't save the list in expectedTLEs,
4362 : * however, unless we actually find a valid segment. That way if there is
4363 : * neither a timeline history file nor a WAL segment in the archive, and
4364 : * streaming replication is set up, we'll read the timeline history file
4365 : * streamed from the primary when we start streaming, instead of
4366 : * recovering with a dummy history generated here.
4367 : */
4368 1843 : if (expectedTLEs)
4369 837 : tles = expectedTLEs;
4370 : else
4371 1006 : tles = readTimeLineHistory(recoveryTargetTLI);
4372 :
4373 2050 : foreach(cell, tles)
4374 : {
4375 1867 : TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
4376 1867 : TimeLineID tli = hent->tli;
4377 :
4378 1867 : if (tli < curFileTLI)
4379 6 : break; /* don't bother looking at too-old TLIs */
4380 :
4381 : /*
4382 : * Skip scanning the timeline ID that the logfile segment to read
4383 : * doesn't belong to
4384 : */
4385 1861 : if (XLogRecPtrIsValid(hent->begin))
4386 : {
4387 77 : XLogSegNo beginseg = 0;
4388 :
4389 77 : XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4390 :
4391 : /*
4392 : * The logfile segment that doesn't belong to the timeline is
4393 : * older or newer than the segment that the timeline started or
4394 : * ended at, respectively. It's sufficient to check only the
4395 : * starting segment of the timeline here. Since the timelines are
4396 : * scanned in descending order in this loop, any segments newer
4397 : * than the ending segment should belong to newer timeline and
4398 : * have already been read before. So it's not necessary to check
4399 : * the ending segment of the timeline here.
4400 : */
4401 77 : if (segno < beginseg)
4402 7 : continue;
4403 : }
4404 :
4405 1854 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
4406 : {
4407 855 : fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4408 854 : if (fd != -1)
4409 : {
4410 364 : elog(DEBUG1, "got WAL segment from archive");
4411 364 : if (!expectedTLEs)
4412 19 : expectedTLEs = tles;
4413 1653 : return fd;
4414 : }
4415 : }
4416 :
4417 1489 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
4418 : {
4419 1489 : fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4420 1489 : if (fd != -1)
4421 : {
4422 1289 : if (!expectedTLEs)
4423 987 : expectedTLEs = tles;
4424 1289 : return fd;
4425 : }
4426 : }
4427 : }
4428 :
4429 : /* Couldn't find it. For simplicity, complain about front timeline */
4430 189 : XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
4431 189 : errno = ENOENT;
4432 189 : ereport(DEBUG2,
4433 : (errcode_for_file_access(),
4434 : errmsg("could not open file \"%s\": %m", path)));
4435 189 : return -1;
4436 : }
4437 :
4438 : /*
4439 : * Set flag to signal the walreceiver to restart. (The startup process calls
4440 : * this on noticing a relevant configuration change.)
4441 : */
4442 : void
4443 11 : StartupRequestWalReceiverRestart(void)
4444 : {
4445 11 : if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
4446 : {
4447 7 : ereport(LOG,
4448 : (errmsg("WAL receiver process shutdown requested")));
4449 :
4450 7 : pendingWalRcvRestart = true;
4451 : }
4452 11 : }
4453 :
4454 :
4455 : /*
4456 : * Has a standby promotion already been triggered?
4457 : *
4458 : * Unlike CheckForStandbyTrigger(), this works in any process
4459 : * that's connected to shared memory.
4460 : */
4461 : bool
4462 72 : PromoteIsTriggered(void)
4463 : {
4464 : /*
4465 : * We check shared state each time only until a standby promotion is
4466 : * triggered. We can't trigger a promotion again, so there's no need to
4467 : * keep checking after the shared variable has once been seen true.
4468 : */
4469 72 : if (LocalPromoteIsTriggered)
4470 51 : return true;
4471 :
4472 21 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4473 21 : LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
4474 21 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4475 :
4476 21 : return LocalPromoteIsTriggered;
4477 : }
4478 :
4479 : static void
4480 48 : SetPromoteIsTriggered(void)
4481 : {
4482 48 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4483 48 : XLogRecoveryCtl->SharedPromoteIsTriggered = true;
4484 48 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4485 :
4486 : /*
4487 : * Mark the recovery pause state as 'not paused' because the paused state
4488 : * ends and promotion continues if a promotion is triggered while recovery
4489 : * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4490 : * return 'paused' while a promotion is ongoing.
4491 : */
4492 48 : SetRecoveryPause(false);
4493 :
4494 48 : LocalPromoteIsTriggered = true;
4495 48 : }
4496 :
4497 : /*
4498 : * Check whether a promote request has arrived.
4499 : */
4500 : static bool
4501 6166 : CheckForStandbyTrigger(void)
4502 : {
4503 6166 : if (LocalPromoteIsTriggered)
4504 57 : return true;
4505 :
4506 6109 : if (IsPromoteSignaled() && CheckPromoteSignal())
4507 : {
4508 48 : ereport(LOG, (errmsg("received promote request")));
4509 48 : RemovePromoteSignalFiles();
4510 48 : ResetPromoteSignaled();
4511 48 : SetPromoteIsTriggered();
4512 48 : return true;
4513 : }
4514 :
4515 6061 : return false;
4516 : }
4517 :
4518 : /*
4519 : * Remove the files signaling a standby promotion request.
4520 : */
4521 : void
4522 970 : RemovePromoteSignalFiles(void)
4523 : {
4524 970 : unlink(PROMOTE_SIGNAL_FILE);
4525 970 : }
4526 :
4527 : /*
4528 : * Check to see if a promote request has arrived.
4529 : */
4530 : bool
4531 747 : CheckPromoteSignal(void)
4532 : {
4533 : struct stat stat_buf;
4534 :
4535 747 : if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4536 96 : return true;
4537 :
4538 651 : return false;
4539 : }
4540 :
4541 : /*
4542 : * Wake up startup process to replay newly arrived WAL, or to notice that
4543 : * failover has been requested.
4544 : */
4545 : void
4546 24435 : WakeupRecovery(void)
4547 : {
4548 24435 : SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4549 24435 : }
4550 :
4551 : /*
4552 : * Schedule a walreceiver wakeup in the main recovery loop.
4553 : */
4554 : void
4555 2 : XLogRequestWalReceiverReply(void)
4556 : {
4557 2 : doRequestWalReceiverReply = true;
4558 2 : }
4559 :
4560 : /*
4561 : * Is HotStandby active yet? This is only important in special backends
4562 : * since normal backends won't ever be able to connect until this returns
4563 : * true. Postmaster knows this by way of signal, not via shared memory.
4564 : *
4565 : * Unlike testing standbyState, this works in any process that's connected to
4566 : * shared memory. (And note that standbyState alone doesn't tell the truth
4567 : * anyway.)
4568 : */
4569 : bool
4570 166 : HotStandbyActive(void)
4571 : {
4572 : /*
4573 : * We check shared state each time only until Hot Standby is active. We
4574 : * can't de-activate Hot Standby, so there's no need to keep checking
4575 : * after the shared variable has once been seen true.
4576 : */
4577 166 : if (LocalHotStandbyActive)
4578 23 : return true;
4579 : else
4580 : {
4581 : /* spinlock is essential on machines with weak memory ordering! */
4582 143 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4583 143 : LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
4584 143 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4585 :
4586 143 : return LocalHotStandbyActive;
4587 : }
4588 : }
4589 :
4590 : /*
4591 : * Like HotStandbyActive(), but to be used only in WAL replay code,
4592 : * where we don't need to ask any other process what the state is.
4593 : */
4594 : static bool
4595 0 : HotStandbyActiveInReplay(void)
4596 : {
4597 : Assert(AmStartupProcess() || !IsPostmasterEnvironment);
4598 0 : return LocalHotStandbyActive;
4599 : }
4600 :
4601 : /*
4602 : * Get latest redo apply position.
4603 : *
4604 : * Exported to allow WALReceiver to read the pointer directly.
4605 : */
4606 : XLogRecPtr
4607 62212 : GetXLogReplayRecPtr(TimeLineID *replayTLI)
4608 : {
4609 : XLogRecPtr recptr;
4610 : TimeLineID tli;
4611 :
4612 62212 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4613 62212 : recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
4614 62212 : tli = XLogRecoveryCtl->lastReplayedTLI;
4615 62212 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4616 :
4617 62212 : if (replayTLI)
4618 2134 : *replayTLI = tli;
4619 62212 : return recptr;
4620 : }
4621 :
4622 :
4623 : /*
4624 : * Get position of last applied, or the record being applied.
4625 : *
4626 : * This is different from GetXLogReplayRecPtr() in that if a WAL
4627 : * record is currently being applied, this includes that record.
4628 : */
4629 : XLogRecPtr
4630 6388 : GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
4631 : {
4632 : XLogRecPtr recptr;
4633 : TimeLineID tli;
4634 :
4635 6388 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4636 6388 : recptr = XLogRecoveryCtl->replayEndRecPtr;
4637 6388 : tli = XLogRecoveryCtl->replayEndTLI;
4638 6388 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4639 :
4640 6388 : if (replayEndTLI)
4641 6388 : *replayEndTLI = tli;
4642 6388 : return recptr;
4643 : }
4644 :
4645 : /*
4646 : * Save timestamp of latest processed commit/abort record.
4647 : *
4648 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4649 : * seen by processes other than the startup process. Note in particular
4650 : * that CreateRestartPoint is executed in the checkpointer.
4651 : */
4652 : static void
4653 22398 : SetLatestXTime(TimestampTz xtime)
4654 : {
4655 22398 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4656 22398 : XLogRecoveryCtl->recoveryLastXTime = xtime;
4657 22398 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4658 22398 : }
4659 :
4660 : /*
4661 : * Fetch timestamp of latest processed commit/abort record.
4662 : */
4663 : TimestampTz
4664 349 : GetLatestXTime(void)
4665 : {
4666 : TimestampTz xtime;
4667 :
4668 349 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4669 349 : xtime = XLogRecoveryCtl->recoveryLastXTime;
4670 349 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4671 :
4672 349 : return xtime;
4673 : }
4674 :
4675 : /*
4676 : * Save timestamp of the next chunk of WAL records to apply.
4677 : *
4678 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4679 : * seen by all backends.
4680 : */
4681 : static void
4682 3632 : SetCurrentChunkStartTime(TimestampTz xtime)
4683 : {
4684 3632 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4685 3632 : XLogRecoveryCtl->currentChunkStartTime = xtime;
4686 3632 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4687 3632 : }
4688 :
4689 : /*
4690 : * Fetch timestamp of latest processed commit/abort record.
4691 : * Startup process maintains an accurate local copy in XLogReceiptTime
4692 : */
4693 : TimestampTz
4694 246 : GetCurrentChunkReplayStartTime(void)
4695 : {
4696 : TimestampTz xtime;
4697 :
4698 246 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4699 246 : xtime = XLogRecoveryCtl->currentChunkStartTime;
4700 246 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4701 :
4702 246 : return xtime;
4703 : }
4704 :
4705 : /*
4706 : * Returns time of receipt of current chunk of XLOG data, as well as
4707 : * whether it was received from streaming replication or from archives.
4708 : */
4709 : void
4710 29 : GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4711 : {
4712 : /*
4713 : * This must be executed in the startup process, since we don't export the
4714 : * relevant state to shared memory.
4715 : */
4716 : Assert(InRecovery);
4717 :
4718 29 : *rtime = XLogReceiptTime;
4719 29 : *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4720 29 : }
4721 :
4722 : /*
4723 : * Note that text field supplied is a parameter name and does not require
4724 : * translation
4725 : */
4726 : void
4727 685 : RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4728 : {
4729 685 : if (currValue < minValue)
4730 : {
4731 0 : if (HotStandbyActiveInReplay())
4732 : {
4733 0 : bool warned_for_promote = false;
4734 :
4735 0 : ereport(WARNING,
4736 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4737 : errmsg("hot standby is not possible because of insufficient parameter settings"),
4738 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4739 : param_name,
4740 : currValue,
4741 : minValue)));
4742 :
4743 0 : SetRecoveryPause(true);
4744 :
4745 0 : ereport(LOG,
4746 : (errmsg("recovery has paused"),
4747 : errdetail("If recovery is unpaused, the server will shut down."),
4748 : errhint("You can then restart the server after making the necessary configuration changes.")));
4749 :
4750 0 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
4751 : {
4752 0 : ProcessStartupProcInterrupts();
4753 :
4754 0 : if (CheckForStandbyTrigger())
4755 : {
4756 0 : if (!warned_for_promote)
4757 0 : ereport(WARNING,
4758 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4759 : errmsg("promotion is not possible because of insufficient parameter settings"),
4760 :
4761 : /*
4762 : * Repeat the detail from above so it's easy to find
4763 : * in the log.
4764 : */
4765 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4766 : param_name,
4767 : currValue,
4768 : minValue),
4769 : errhint("Restart the server after making the necessary configuration changes.")));
4770 0 : warned_for_promote = true;
4771 : }
4772 :
4773 : /*
4774 : * If recovery pause is requested then set it paused. While
4775 : * we are in the loop, user might resume and pause again so
4776 : * set this every time.
4777 : */
4778 0 : ConfirmRecoveryPaused();
4779 :
4780 : /*
4781 : * We wait on a condition variable that will wake us as soon
4782 : * as the pause ends, but we use a timeout so we can check the
4783 : * above conditions periodically too.
4784 : */
4785 0 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
4786 : WAIT_EVENT_RECOVERY_PAUSE);
4787 : }
4788 0 : ConditionVariableCancelSleep();
4789 : }
4790 :
4791 0 : ereport(FATAL,
4792 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4793 : errmsg("recovery aborted because of insufficient parameter settings"),
4794 : /* Repeat the detail from above so it's easy to find in the log. */
4795 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4796 : param_name,
4797 : currValue,
4798 : minValue),
4799 : errhint("You can restart the server after making the necessary configuration changes.")));
4800 : }
4801 685 : }
4802 :
4803 :
4804 : /*
4805 : * GUC check_hook for primary_slot_name
4806 : */
4807 : bool
4808 1393 : check_primary_slot_name(char **newval, void **extra, GucSource source)
4809 : {
4810 : int err_code;
4811 1393 : char *err_msg = NULL;
4812 1393 : char *err_hint = NULL;
4813 :
4814 1393 : if (*newval && strcmp(*newval, "") != 0 &&
4815 203 : !ReplicationSlotValidateNameInternal(*newval, false, &err_code,
4816 : &err_msg, &err_hint))
4817 : {
4818 0 : GUC_check_errcode(err_code);
4819 0 : GUC_check_errdetail("%s", err_msg);
4820 0 : if (err_hint != NULL)
4821 0 : GUC_check_errhint("%s", err_hint);
4822 0 : return false;
4823 : }
4824 :
4825 1393 : return true;
4826 : }
4827 :
4828 : /*
4829 : * Recovery target settings: Only one of the several recovery_target* settings
4830 : * may be set. Setting a second one results in an error. The global variable
4831 : * recoveryTarget tracks which kind of recovery target was chosen. Other
4832 : * variables store the actual target value (for example a string or a xid).
4833 : * The assign functions of the parameters check whether a competing parameter
4834 : * was already set. But we want to allow setting the same parameter multiple
4835 : * times. We also want to allow unsetting a parameter and setting a different
4836 : * one, so we unset recoveryTarget when the parameter is set to an empty
4837 : * string.
4838 : *
4839 : * XXX this code is broken by design. Throwing an error from a GUC assign
4840 : * hook breaks fundamental assumptions of guc.c. So long as all the variables
4841 : * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4842 : * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4843 : * that we have odd behaviors such as unexpected GUC ordering dependencies.
4844 : */
4845 :
4846 : pg_noreturn static void
4847 1 : error_multiple_recovery_targets(void)
4848 : {
4849 1 : ereport(ERROR,
4850 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4851 : errmsg("multiple recovery targets specified"),
4852 : errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4853 : }
4854 :
4855 : /*
4856 : * GUC check_hook for recovery_target
4857 : */
4858 : bool
4859 1190 : check_recovery_target(char **newval, void **extra, GucSource source)
4860 : {
4861 1190 : if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4862 : {
4863 0 : GUC_check_errdetail("The only allowed value is \"immediate\".");
4864 0 : return false;
4865 : }
4866 1190 : return true;
4867 : }
4868 :
4869 : /*
4870 : * GUC assign_hook for recovery_target
4871 : */
4872 : void
4873 1190 : assign_recovery_target(const char *newval, void *extra)
4874 : {
4875 1190 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4876 0 : recoveryTarget != RECOVERY_TARGET_IMMEDIATE)
4877 0 : error_multiple_recovery_targets();
4878 :
4879 1190 : if (newval && strcmp(newval, "") != 0)
4880 1 : recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
4881 : else
4882 1189 : recoveryTarget = RECOVERY_TARGET_UNSET;
4883 1190 : }
4884 :
4885 : /*
4886 : * GUC check_hook for recovery_target_lsn
4887 : */
4888 : bool
4889 1196 : check_recovery_target_lsn(char **newval, void **extra, GucSource source)
4890 : {
4891 1196 : if (strcmp(*newval, "") != 0)
4892 : {
4893 : XLogRecPtr lsn;
4894 : XLogRecPtr *myextra;
4895 8 : ErrorSaveContext escontext = {T_ErrorSaveContext};
4896 :
4897 8 : lsn = pg_lsn_in_safe(*newval, (Node *) &escontext);
4898 8 : if (escontext.error_occurred)
4899 0 : return false;
4900 :
4901 8 : myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
4902 8 : if (!myextra)
4903 0 : return false;
4904 8 : *myextra = lsn;
4905 8 : *extra = myextra;
4906 : }
4907 1196 : return true;
4908 : }
4909 :
4910 : /*
4911 : * GUC assign_hook for recovery_target_lsn
4912 : */
4913 : void
4914 1196 : assign_recovery_target_lsn(const char *newval, void *extra)
4915 : {
4916 1196 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4917 0 : recoveryTarget != RECOVERY_TARGET_LSN)
4918 0 : error_multiple_recovery_targets();
4919 :
4920 1196 : if (newval && strcmp(newval, "") != 0)
4921 : {
4922 8 : recoveryTarget = RECOVERY_TARGET_LSN;
4923 8 : recoveryTargetLSN = *((XLogRecPtr *) extra);
4924 : }
4925 : else
4926 1188 : recoveryTarget = RECOVERY_TARGET_UNSET;
4927 1196 : }
4928 :
4929 : /*
4930 : * GUC check_hook for recovery_target_name
4931 : */
4932 : bool
4933 1196 : check_recovery_target_name(char **newval, void **extra, GucSource source)
4934 : {
4935 : /* Use the value of newval directly */
4936 1196 : if (strlen(*newval) >= MAXFNAMELEN)
4937 : {
4938 0 : GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4939 : "recovery_target_name", MAXFNAMELEN - 1);
4940 0 : return false;
4941 : }
4942 1196 : return true;
4943 : }
4944 :
4945 : /*
4946 : * GUC assign_hook for recovery_target_name
4947 : */
4948 : void
4949 1196 : assign_recovery_target_name(const char *newval, void *extra)
4950 : {
4951 1196 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4952 0 : recoveryTarget != RECOVERY_TARGET_NAME)
4953 0 : error_multiple_recovery_targets();
4954 :
4955 1196 : if (newval && strcmp(newval, "") != 0)
4956 : {
4957 6 : recoveryTarget = RECOVERY_TARGET_NAME;
4958 6 : recoveryTargetName = newval;
4959 : }
4960 : else
4961 1190 : recoveryTarget = RECOVERY_TARGET_UNSET;
4962 1196 : }
4963 :
4964 : /*
4965 : * GUC check_hook for recovery_target_time
4966 : *
4967 : * The interpretation of the recovery_target_time string can depend on the
4968 : * time zone setting, so we need to wait until after all GUC processing is
4969 : * done before we can do the final parsing of the string. This check function
4970 : * only does a parsing pass to catch syntax errors, but we store the string
4971 : * and parse it again when we need to use it.
4972 : */
4973 : bool
4974 1192 : check_recovery_target_time(char **newval, void **extra, GucSource source)
4975 : {
4976 1192 : if (strcmp(*newval, "") != 0)
4977 : {
4978 : /* reject some special values */
4979 3 : if (strcmp(*newval, "now") == 0 ||
4980 3 : strcmp(*newval, "today") == 0 ||
4981 3 : strcmp(*newval, "tomorrow") == 0 ||
4982 3 : strcmp(*newval, "yesterday") == 0)
4983 : {
4984 0 : return false;
4985 : }
4986 :
4987 : /*
4988 : * parse timestamp value (see also timestamptz_in())
4989 : */
4990 : {
4991 3 : char *str = *newval;
4992 : fsec_t fsec;
4993 : struct pg_tm tt,
4994 3 : *tm = &tt;
4995 : int tz;
4996 : int dtype;
4997 : int nf;
4998 : int dterr;
4999 : char *field[MAXDATEFIELDS];
5000 : int ftype[MAXDATEFIELDS];
5001 : char workbuf[MAXDATELEN + MAXDATEFIELDS];
5002 : DateTimeErrorExtra dtextra;
5003 : TimestampTz timestamp;
5004 :
5005 3 : dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
5006 : field, ftype, MAXDATEFIELDS, &nf);
5007 3 : if (dterr == 0)
5008 3 : dterr = DecodeDateTime(field, ftype, nf,
5009 : &dtype, tm, &fsec, &tz, &dtextra);
5010 3 : if (dterr != 0)
5011 0 : return false;
5012 3 : if (dtype != DTK_DATE)
5013 0 : return false;
5014 :
5015 3 : if (tm2timestamp(tm, fsec, &tz, ×tamp) != 0)
5016 : {
5017 0 : GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
5018 0 : return false;
5019 : }
5020 : }
5021 : }
5022 1192 : return true;
5023 : }
5024 :
5025 : /*
5026 : * GUC assign_hook for recovery_target_time
5027 : */
5028 : void
5029 1192 : assign_recovery_target_time(const char *newval, void *extra)
5030 : {
5031 1192 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
5032 1 : recoveryTarget != RECOVERY_TARGET_TIME)
5033 1 : error_multiple_recovery_targets();
5034 :
5035 1191 : if (newval && strcmp(newval, "") != 0)
5036 2 : recoveryTarget = RECOVERY_TARGET_TIME;
5037 : else
5038 1189 : recoveryTarget = RECOVERY_TARGET_UNSET;
5039 1191 : }
5040 :
5041 : /*
5042 : * GUC check_hook for recovery_target_timeline
5043 : */
5044 : bool
5045 1193 : check_recovery_target_timeline(char **newval, void **extra, GucSource source)
5046 : {
5047 : RecoveryTargetTimeLineGoal rttg;
5048 : RecoveryTargetTimeLineGoal *myextra;
5049 :
5050 1193 : if (strcmp(*newval, "current") == 0)
5051 0 : rttg = RECOVERY_TARGET_TIMELINE_CONTROLFILE;
5052 1193 : else if (strcmp(*newval, "latest") == 0)
5053 1190 : rttg = RECOVERY_TARGET_TIMELINE_LATEST;
5054 : else
5055 : {
5056 : char *endp;
5057 : uint64 timeline;
5058 :
5059 3 : rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
5060 :
5061 3 : errno = 0;
5062 3 : timeline = strtou64(*newval, &endp, 0);
5063 :
5064 3 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
5065 : {
5066 1 : GUC_check_errdetail("\"%s\" is not a valid number.",
5067 : "recovery_target_timeline");
5068 3 : return false;
5069 : }
5070 :
5071 2 : if (timeline < 1 || timeline > PG_UINT32_MAX)
5072 : {
5073 2 : GUC_check_errdetail("\"%s\" must be between %u and %u.",
5074 : "recovery_target_timeline", 1, PG_UINT32_MAX);
5075 2 : return false;
5076 : }
5077 : }
5078 :
5079 1190 : myextra = (RecoveryTargetTimeLineGoal *) guc_malloc(LOG, sizeof(RecoveryTargetTimeLineGoal));
5080 1190 : if (!myextra)
5081 0 : return false;
5082 1190 : *myextra = rttg;
5083 1190 : *extra = myextra;
5084 :
5085 1190 : return true;
5086 : }
5087 :
5088 : /*
5089 : * GUC assign_hook for recovery_target_timeline
5090 : */
5091 : void
5092 1190 : assign_recovery_target_timeline(const char *newval, void *extra)
5093 : {
5094 1190 : recoveryTargetTimeLineGoal = *((RecoveryTargetTimeLineGoal *) extra);
5095 1190 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
5096 0 : recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5097 : else
5098 1190 : recoveryTargetTLIRequested = 0;
5099 1190 : }
5100 :
5101 : /*
5102 : * GUC check_hook for recovery_target_xid
5103 : */
5104 : bool
5105 1190 : check_recovery_target_xid(char **newval, void **extra, GucSource source)
5106 : {
5107 1190 : if (strcmp(*newval, "") != 0)
5108 : {
5109 : TransactionId xid;
5110 : TransactionId *myextra;
5111 :
5112 1 : errno = 0;
5113 1 : xid = (TransactionId) strtou64(*newval, NULL, 0);
5114 1 : if (errno == EINVAL || errno == ERANGE)
5115 0 : return false;
5116 :
5117 1 : myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId));
5118 1 : if (!myextra)
5119 0 : return false;
5120 1 : *myextra = xid;
5121 1 : *extra = myextra;
5122 : }
5123 1190 : return true;
5124 : }
5125 :
5126 : /*
5127 : * GUC assign_hook for recovery_target_xid
5128 : */
5129 : void
5130 1190 : assign_recovery_target_xid(const char *newval, void *extra)
5131 : {
5132 1190 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
5133 0 : recoveryTarget != RECOVERY_TARGET_XID)
5134 0 : error_multiple_recovery_targets();
5135 :
5136 1190 : if (newval && strcmp(newval, "") != 0)
5137 : {
5138 1 : recoveryTarget = RECOVERY_TARGET_XID;
5139 1 : recoveryTargetXid = *((TransactionId *) extra);
5140 : }
5141 : else
5142 1189 : recoveryTarget = RECOVERY_TARGET_UNSET;
5143 1190 : }
|