Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * xlogrecovery.c
4 : * Functions for WAL recovery, standby mode
5 : *
6 : * This source file contains functions controlling WAL recovery.
7 : * InitWalRecovery() initializes the system for crash or archive recovery,
8 : * or standby mode, depending on configuration options and the state of
9 : * the control file and possible backup label file. PerformWalRecovery()
10 : * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 : * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 : * and prepares information needed to initialize the WAL for writes. In
13 : * addition to these three main functions, there are a bunch of functions
14 : * for interrogating recovery state and controlling the recovery process.
15 : *
16 : *
17 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
18 : * Portions Copyright (c) 1994, Regents of the University of California
19 : *
20 : * src/backend/access/transam/xlogrecovery.c
21 : *
22 : *-------------------------------------------------------------------------
23 : */
24 :
25 : #include "postgres.h"
26 :
27 : #include <ctype.h>
28 : #include <math.h>
29 : #include <time.h>
30 : #include <sys/stat.h>
31 : #include <sys/time.h>
32 : #include <unistd.h>
33 :
34 : #include "access/timeline.h"
35 : #include "access/transam.h"
36 : #include "access/xact.h"
37 : #include "access/xlog_internal.h"
38 : #include "access/xlogarchive.h"
39 : #include "access/xlogprefetcher.h"
40 : #include "access/xlogreader.h"
41 : #include "access/xlogrecovery.h"
42 : #include "access/xlogutils.h"
43 : #include "backup/basebackup.h"
44 : #include "catalog/pg_control.h"
45 : #include "commands/tablespace.h"
46 : #include "common/file_utils.h"
47 : #include "miscadmin.h"
48 : #include "pgstat.h"
49 : #include "postmaster/bgwriter.h"
50 : #include "postmaster/startup.h"
51 : #include "replication/slot.h"
52 : #include "replication/slotsync.h"
53 : #include "replication/walreceiver.h"
54 : #include "storage/fd.h"
55 : #include "storage/ipc.h"
56 : #include "storage/latch.h"
57 : #include "storage/pmsignal.h"
58 : #include "storage/procarray.h"
59 : #include "storage/spin.h"
60 : #include "utils/datetime.h"
61 : #include "utils/fmgrprotos.h"
62 : #include "utils/guc_hooks.h"
63 : #include "utils/pgstat_internal.h"
64 : #include "utils/pg_lsn.h"
65 : #include "utils/ps_status.h"
66 : #include "utils/pg_rusage.h"
67 :
68 : /* Unsupported old recovery command file names (relative to $PGDATA) */
69 : #define RECOVERY_COMMAND_FILE "recovery.conf"
70 : #define RECOVERY_COMMAND_DONE "recovery.done"
71 :
72 : /*
73 : * GUC support
74 : */
/*
 * Name-to-code table for the recovery target action setting (see
 * recoveryTargetAction below): each entry pairs a user-visible spelling with
 * its RECOVERY_TARGET_ACTION_* value.
 *
 * NOTE(review): the trailing {NULL, 0, false} entry is presumably the list
 * terminator expected by the GUC machinery, and the third member of each
 * entry is not explained by anything in this file -- confirm against the
 * declaration of struct config_enum_entry.
 */
const struct config_enum_entry recovery_target_action_options[] = {
	{"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
	{"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
	{"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
	{NULL, 0, false}
};
81 :
82 : /* options formerly taken from recovery.conf for archive recovery */
83 : char *recoveryRestoreCommand = NULL;
84 : char *recoveryEndCommand = NULL;
85 : char *archiveCleanupCommand = NULL;
86 : RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
87 : bool recoveryTargetInclusive = true;
88 : int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
89 : TransactionId recoveryTargetXid;
90 : char *recovery_target_time_string;
91 : TimestampTz recoveryTargetTime;
92 : const char *recoveryTargetName;
93 : XLogRecPtr recoveryTargetLSN;
94 : int recovery_min_apply_delay = 0;
95 :
96 : /* options formerly taken from recovery.conf for XLOG streaming */
97 : char *PrimaryConnInfo = NULL;
98 : char *PrimarySlotName = NULL;
99 : bool wal_receiver_create_temp_slot = false;
100 :
101 : /*
102 : * recoveryTargetTimeLineGoal: what the user requested, if any
103 : *
104 : * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
105 : *
 * recoveryTargetTLI: the currently understood target timeline; it can change
 * during recovery
107 : *
108 : * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
109 : * the timelines of its known parents, newest first (so recoveryTargetTLI is
110 : * always the first list member). Only these TLIs are expected to be seen in
111 : * the WAL segments we read, and indeed only these TLIs will be considered as
112 : * candidate WAL files to open at all.
113 : *
114 : * curFileTLI: the TLI appearing in the name of the current input WAL file.
115 : * (This is not necessarily the same as the timeline from which we are
116 : * replaying WAL, which StartupXLOG calls replayTLI, because we could be
117 : * scanning data that was copied from an ancestor timeline when the current
118 : * file was created.) During a sequential scan we do not allow this value
119 : * to decrease.
120 : */
121 : RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
122 : TimeLineID recoveryTargetTLIRequested = 0;
123 : TimeLineID recoveryTargetTLI = 0;
124 : static List *expectedTLEs;
125 : static TimeLineID curFileTLI;
126 :
127 : /*
128 : * When ArchiveRecoveryRequested is set, archive recovery was requested,
129 : * ie. signal files were present. When InArchiveRecovery is set, we are
130 : * currently recovering using offline XLOG archives. These variables are only
131 : * valid in the startup process.
132 : *
133 : * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
134 : * currently performing crash recovery using only XLOG files in pg_wal, but
135 : * will switch to using offline XLOG archives as soon as we reach the end of
136 : * WAL in pg_wal.
137 : */
138 : bool ArchiveRecoveryRequested = false;
139 : bool InArchiveRecovery = false;
140 :
141 : /*
142 : * When StandbyModeRequested is set, standby mode was requested, i.e.
143 : * standby.signal file was present. When StandbyMode is set, we are currently
144 : * in standby mode. These variables are only valid in the startup process.
145 : * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
146 : */
147 : static bool StandbyModeRequested = false;
148 : bool StandbyMode = false;
149 :
150 : /* was a signal file present at startup? */
151 : static bool standby_signal_file_found = false;
152 : static bool recovery_signal_file_found = false;
153 :
154 : /*
155 : * CheckPointLoc is the position of the checkpoint record that determines
156 : * where to start the replay. It comes from the backup label file or the
157 : * control file.
158 : *
159 : * RedoStartLSN is the checkpoint's REDO location, also from the backup label
160 : * file or the control file. In standby mode, XLOG streaming usually starts
161 : * from the position where an invalid record was found. But if we fail to
162 : * read even the initial checkpoint record, we use the REDO location instead
163 : * of the checkpoint location as the start position of XLOG streaming.
164 : * Otherwise we would have to jump backwards to the REDO location after
165 : * reading the checkpoint record, because the REDO record can precede the
166 : * checkpoint record.
167 : */
168 : static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
169 : static TimeLineID CheckPointTLI = 0;
170 : static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
171 : static TimeLineID RedoStartTLI = 0;
172 :
173 : /*
174 : * Local copy of SharedHotStandbyActive variable. False actually means "not
175 : * known, need to check the shared state".
176 : */
177 : static bool LocalHotStandbyActive = false;
178 :
179 : /*
180 : * Local copy of SharedPromoteIsTriggered variable. False actually means "not
181 : * known, need to check the shared state".
182 : */
183 : static bool LocalPromoteIsTriggered = false;
184 :
185 : /* Has the recovery code requested a walreceiver wakeup? */
186 : static bool doRequestWalReceiverReply;
187 :
188 : /* XLogReader object used to parse the WAL records */
189 : static XLogReaderState *xlogreader = NULL;
190 :
191 : /* XLogPrefetcher object used to consume WAL records with read-ahead */
192 : static XLogPrefetcher *xlogprefetcher = NULL;
193 :
/*
 * Parameters passed down from ReadRecord to the XLogPageRead callback.
 *
 * InitWalRecovery() allocates a single zero-filled instance with palloc0()
 * and hands it to XLogReaderAllocate() as the reader's private data.
 */
typedef struct XLogPageReadPrivate
{
	int			emode;			/* presumably the error level for reporting
								 * read problems, matching ReadRecord's emode
								 * argument -- TODO confirm */
	bool		fetching_ckpt;	/* are we fetching a checkpoint record? */
	bool		randAccess;		/* NOTE(review): same name as the randAccess
								 * argument of WaitForWALToBecomeAvailable();
								 * exact meaning not visible here */
	TimeLineID	replayTLI;		/* presumably the timeline being replayed,
								 * matching ReadRecord's replayTLI argument
								 * -- TODO confirm */
} XLogPageReadPrivate;
202 :
203 : /* flag to tell XLogPageRead that we have started replaying */
204 : static bool InRedo = false;
205 :
/*
 * Codes indicating where we got a WAL file from during recovery, or where
 * to attempt to get one.
 *
 * Each value is also the index of its entry in xlogSourceNames[] below.
 */
typedef enum
{
	XLOG_FROM_ANY = 0,			/* request to read WAL from any source */
	XLOG_FROM_ARCHIVE,			/* restored using restore_command */
	XLOG_FROM_PG_WAL,			/* existing file in pg_wal */
	XLOG_FROM_STREAM,			/* streamed from primary */
} XLogSource;

/*
 * Human-readable names for XLogSources, for debugging output.
 *
 * xlogSourceNames[source] is the name of "source".  Designated initializers
 * bind every string to its enumerator explicitly, so the table stays correct
 * even if the enum is reordered or extended; a purely positional list would
 * silently mislabel sources in that case.
 */
static const char *const xlogSourceNames[] = {
	[XLOG_FROM_ANY] = "any",
	[XLOG_FROM_ARCHIVE] = "archive",
	[XLOG_FROM_PG_WAL] = "pg_wal",
	[XLOG_FROM_STREAM] = "stream",
};
220 :
221 : /*
222 : * readFile is -1 or a kernel FD for the log file segment that's currently
223 : * open for reading. readSegNo identifies the segment. readOff is the offset
224 : * of the page just read, readLen indicates how much of it has been read into
225 : * readBuf, and readSource indicates where we got the currently open file from.
226 : *
227 : * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
228 : * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
229 : * worthwhile, since the XLOG is not read by general-purpose sessions.
230 : */
231 : static int readFile = -1;
232 : static XLogSegNo readSegNo = 0;
233 : static uint32 readOff = 0;
234 : static uint32 readLen = 0;
235 : static XLogSource readSource = XLOG_FROM_ANY;
236 :
237 : /*
238 : * Keeps track of which source we're currently reading from. This is
239 : * different from readSource in that this is always set, even when we don't
240 : * currently have a WAL file open. If lastSourceFailed is set, our last
241 : * attempt to read from currentSource failed, and we should try another source
242 : * next.
243 : *
244 : * pendingWalRcvRestart is set when a config change occurs that requires a
245 : * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
246 : */
247 : static XLogSource currentSource = XLOG_FROM_ANY;
248 : static bool lastSourceFailed = false;
249 : static bool pendingWalRcvRestart = false;
250 :
251 : /*
252 : * These variables track when we last obtained some WAL data to process,
253 : * and where we got it from. (XLogReceiptSource is initially the same as
254 : * readSource, but readSource gets reset to zero when we don't have data
255 : * to process right now. It is also different from currentSource, which
256 : * also changes when we try to read from a source and fail, while
257 : * XLogReceiptSource tracks where we last successfully read some WAL.)
258 : */
259 : static TimestampTz XLogReceiptTime = 0;
260 : static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
261 :
262 : /* Local copy of WalRcv->flushedUpto */
263 : static XLogRecPtr flushedUpto = 0;
264 : static TimeLineID receiveTLI = 0;
265 :
266 : /*
267 : * Copy of minRecoveryPoint and backupEndPoint from the control file.
268 : *
269 : * In order to reach consistency, we must replay the WAL up to
270 : * minRecoveryPoint. If backupEndRequired is true, we must also reach
271 : * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
272 : * to backupStartPoint.
273 : *
274 : * Note: In archive recovery, after consistency has been reached, the
275 : * functions in xlog.c will start updating minRecoveryPoint in the control
276 : * file. But this copy of minRecoveryPoint variable reflects the value at the
277 : * beginning of recovery, and is *not* updated after consistency is reached.
278 : */
279 : static XLogRecPtr minRecoveryPoint;
280 : static TimeLineID minRecoveryPointTLI;
281 :
282 : static XLogRecPtr backupStartPoint;
283 : static XLogRecPtr backupEndPoint;
284 : static bool backupEndRequired = false;
285 :
286 : /*
287 : * Have we reached a consistent database state? In crash recovery, we have
288 : * to replay all the WAL, so reachedConsistency is never set. During archive
289 : * recovery, the database is consistent once minRecoveryPoint is reached.
290 : *
291 : * Consistent state means that the system is internally consistent, all
292 : * the WAL has been replayed up to a certain point, and importantly, there
293 : * is no trace of later actions on disk.
294 : */
295 : bool reachedConsistency = false;
296 :
297 : /* Buffers dedicated to consistency checks of size BLCKSZ */
298 : static char *replay_image_masked = NULL;
299 : static char *primary_image_masked = NULL;
300 :
301 :
/*
 * Shared-memory state for WAL recovery.
 *
 * The single instance is obtained in XLogRecoveryShmemInit(), which zeroes
 * the whole struct and then initializes info_lck, recoveryWakeupLatch and
 * recoveryNotPausedCV.
 */
typedef struct XLogRecoveryCtlData
{
	/*
	 * SharedHotStandbyActive indicates if we allow hot standby queries to be
	 * run.  Protected by info_lck.
	 */
	bool		SharedHotStandbyActive;

	/*
	 * SharedPromoteIsTriggered indicates if a standby promotion has been
	 * triggered.  Protected by info_lck.
	 */
	bool		SharedPromoteIsTriggered;

	/*
	 * recoveryWakeupLatch is used to wake up the startup process to continue
	 * WAL replay, if it is waiting for WAL to arrive or promotion to be
	 * requested.
	 *
	 * Note that the startup process also uses another latch, its procLatch,
	 * to wait for recovery conflict.  Getting rid of recoveryWakeupLatch and
	 * signaling the startup process through its procLatch instead would
	 * comport better with possible generic signal handlers using that latch.
	 * But we should not do that, because the startup process doesn't assume
	 * that it's woken up by the walreceiver process or the SIGHUP signal
	 * handler while it's waiting for recovery conflict.  The separate
	 * latches, recoveryWakeupLatch and procLatch, should be used for
	 * inter-process communication for WAL replay and recovery conflict,
	 * respectively.
	 */
	Latch		recoveryWakeupLatch;

	/*
	 * Last record successfully replayed.
	 */
	XLogRecPtr	lastReplayedReadRecPtr; /* start position */
	XLogRecPtr	lastReplayedEndRecPtr;	/* end+1 position */
	TimeLineID	lastReplayedTLI;	/* timeline */

	/*
	 * When we're currently replaying a record, ie. in a redo function,
	 * replayEndRecPtr points to the end+1 of the record being replayed,
	 * otherwise it's equal to lastReplayedEndRecPtr.
	 */
	XLogRecPtr	replayEndRecPtr;
	TimeLineID	replayEndTLI;

	/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
	TimestampTz recoveryLastXTime;

	/*
	 * timestamp of when we started replaying the current chunk of WAL data,
	 * only relevant for replication or archive recovery
	 */
	TimestampTz currentChunkStartTime;

	/*
	 * Recovery pause state.
	 *
	 * NOTE(review): going by its name, recoveryNotPausedCV is presumably
	 * signaled when recovery leaves the paused state -- confirm against the
	 * code that waits on and broadcasts it.
	 */
	RecoveryPauseState recoveryPauseState;
	ConditionVariable recoveryNotPausedCV;

	slock_t		info_lck;		/* locks shared variables shown above */
} XLogRecoveryCtlData;
364 :
365 : static XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
366 :
367 : /*
368 : * abortedRecPtr is the start pointer of a broken record at end of WAL when
369 : * recovery completes; missingContrecPtr is the location of the first
370 : * contrecord that went missing. See CreateOverwriteContrecordRecord for
371 : * details.
372 : */
373 : static XLogRecPtr abortedRecPtr;
374 : static XLogRecPtr missingContrecPtr;
375 :
376 : /*
377 : * if recoveryStopsBefore/After returns true, it saves information of the stop
378 : * point here
379 : */
380 : static TransactionId recoveryStopXid;
381 : static TimestampTz recoveryStopTime;
382 : static XLogRecPtr recoveryStopLSN;
383 : static char recoveryStopName[MAXFNAMELEN];
384 : static bool recoveryStopAfter;
385 :
386 : /* prototypes for local functions */
387 : static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
388 :
389 : static void EnableStandbyMode(void);
390 : static void readRecoverySignalFile(void);
391 : static void validateRecoveryParameters(void);
392 : static bool read_backup_label(XLogRecPtr *checkPointLoc,
393 : TimeLineID *backupLabelTLI,
394 : bool *backupEndRequired, bool *backupFromStandby);
395 : static bool read_tablespace_map(List **tablespaces);
396 :
397 : static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
398 : static void CheckRecoveryConsistency(void);
399 : static void rm_redo_error_callback(void *arg);
400 : #ifdef WAL_DEBUG
401 : static void xlog_outrec(StringInfo buf, XLogReaderState *record);
402 : #endif
403 : static void xlog_block_info(StringInfo buf, XLogReaderState *record);
404 : static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
405 : TimeLineID prevTLI, TimeLineID replayTLI);
406 : static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
407 : static void verifyBackupPageConsistency(XLogReaderState *record);
408 :
409 : static bool recoveryStopsBefore(XLogReaderState *record);
410 : static bool recoveryStopsAfter(XLogReaderState *record);
411 : static char *getRecoveryStopReason(void);
412 : static void recoveryPausesHere(bool endOfRecovery);
413 : static bool recoveryApplyDelay(XLogReaderState *record);
414 : static void ConfirmRecoveryPaused(void);
415 :
416 : static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher,
417 : int emode, bool fetching_ckpt,
418 : TimeLineID replayTLI);
419 :
420 : static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
421 : int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
422 : static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
423 : bool randAccess,
424 : bool fetching_ckpt,
425 : XLogRecPtr tliRecPtr,
426 : TimeLineID replayTLI,
427 : XLogRecPtr replayLSN,
428 : bool nonblocking);
429 : static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
430 : static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
431 : XLogRecPtr RecPtr, TimeLineID replayTLI);
432 : static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
433 : static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
434 : XLogSource source, bool notfoundOk);
435 : static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source);
436 :
437 : static bool CheckForStandbyTrigger(void);
438 : static void SetPromoteIsTriggered(void);
439 : static bool HotStandbyActiveInReplay(void);
440 :
441 : static void SetCurrentChunkStartTime(TimestampTz xtime);
442 : static void SetLatestXTime(TimestampTz xtime);
443 :
444 : /*
445 : * Initialization of shared memory for WAL recovery
446 : */
447 : Size
448 5826 : XLogRecoveryShmemSize(void)
449 : {
450 : Size size;
451 :
452 : /* XLogRecoveryCtl */
453 5826 : size = sizeof(XLogRecoveryCtlData);
454 :
455 5826 : return size;
456 : }
457 :
458 : void
459 2032 : XLogRecoveryShmemInit(void)
460 : {
461 : bool found;
462 :
463 2032 : XLogRecoveryCtl = (XLogRecoveryCtlData *)
464 2032 : ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
465 2032 : if (found)
466 0 : return;
467 2032 : memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
468 :
469 2032 : SpinLockInit(&XLogRecoveryCtl->info_lck);
470 2032 : InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
471 2032 : ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
472 : }
473 :
/*
 * A thin wrapper to enable StandbyMode and do other preparatory work as
 * needed.
 *
 * Sets the global StandbyMode flag and turns off the startup progress
 * timeout.  InitWalRecovery() calls this when it enters archive recovery
 * and StandbyModeRequested is set.
 */
static void
EnableStandbyMode(void)
{
	StandbyMode = true;

	/*
	 * To avoid server log bloat, we don't report recovery progress in a
	 * standby as it will always be in recovery unless promoted. We disable
	 * startup progress timeout in standby mode to avoid calling
	 * startup_progress_timeout_handler() unnecessarily.
	 */
	disable_startup_progress_timeout();
}
491 :
492 : /*
493 : * Prepare the system for WAL recovery, if needed.
494 : *
495 : * This is called by StartupXLOG() which coordinates the server startup
496 : * sequence. This function analyzes the control file and the backup label
497 : * file, if any, and figures out whether we need to perform crash recovery or
498 : * archive recovery, and how far we need to replay the WAL to reach a
499 : * consistent state.
500 : *
501 : * This doesn't yet change the on-disk state, except for creating the symlinks
502 : * from table space map file if any, and for fetching WAL files needed to find
503 : * the checkpoint record. On entry, the caller has already read the control
504 : * file into memory, and passes it as argument. This function updates it to
505 : * reflect the recovery state, and the caller is expected to write it back to
 * disk after initializing other subsystems, but before calling
507 : * PerformWalRecovery().
508 : *
 * This initializes some global variables, such as ArchiveRecoveryRequested,
 * StandbyModeRequested and InRecovery.
511 : */
512 : void
513 1762 : InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
514 : bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
515 : {
516 : XLogPageReadPrivate *private;
517 : struct stat st;
518 : bool wasShutdown;
519 : XLogRecord *record;
520 : DBState dbstate_at_startup;
521 1762 : bool haveTblspcMap = false;
522 1762 : bool haveBackupLabel = false;
523 : CheckPoint checkPoint;
524 1762 : bool backupFromStandby = false;
525 :
526 1762 : dbstate_at_startup = ControlFile->state;
527 :
528 : /*
529 : * Initialize on the assumption we want to recover to the latest timeline
530 : * that's active according to pg_control.
531 : */
532 1762 : if (ControlFile->minRecoveryPointTLI >
533 1762 : ControlFile->checkPointCopy.ThisTimeLineID)
534 4 : recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
535 : else
536 1758 : recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
537 :
538 : /*
539 : * Check for signal files, and if so set up state for offline recovery
540 : */
541 1762 : readRecoverySignalFile();
542 1762 : validateRecoveryParameters();
543 :
544 : /*
545 : * Take ownership of the wakeup latch if we're going to sleep during
546 : * recovery, if required.
547 : */
548 1762 : if (ArchiveRecoveryRequested)
549 212 : OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
550 :
551 : /*
552 : * Set the WAL reading processor now, as it will be needed when reading
553 : * the checkpoint record required (backup_label or not).
554 : */
555 1762 : private = palloc0(sizeof(XLogPageReadPrivate));
556 1762 : xlogreader =
557 1762 : XLogReaderAllocate(wal_segment_size, NULL,
558 1762 : XL_ROUTINE(.page_read = &XLogPageRead,
559 : .segment_open = NULL,
560 : .segment_close = wal_segment_close),
561 : private);
562 1762 : if (!xlogreader)
563 0 : ereport(ERROR,
564 : (errcode(ERRCODE_OUT_OF_MEMORY),
565 : errmsg("out of memory"),
566 : errdetail("Failed while allocating a WAL reading processor.")));
567 1762 : xlogreader->system_identifier = ControlFile->system_identifier;
568 :
569 : /*
570 : * Set the WAL decode buffer size. This limits how far ahead we can read
571 : * in the WAL.
572 : */
573 1762 : XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size);
574 :
575 : /* Create a WAL prefetcher. */
576 1762 : xlogprefetcher = XLogPrefetcherAllocate(xlogreader);
577 :
578 : /*
579 : * Allocate two page buffers dedicated to WAL consistency checks. We do
580 : * it this way, rather than just making static arrays, for two reasons:
581 : * (1) no need to waste the storage in most instantiations of the backend;
582 : * (2) a static char array isn't guaranteed to have any particular
583 : * alignment, whereas palloc() will provide MAXALIGN'd storage.
584 : */
585 1762 : replay_image_masked = (char *) palloc(BLCKSZ);
586 1762 : primary_image_masked = (char *) palloc(BLCKSZ);
587 :
588 : /*
589 : * Read the backup_label file. We want to run this part of the recovery
590 : * process after checking for signal files and after performing validation
591 : * of the recovery parameters.
592 : */
593 1762 : if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
594 : &backupFromStandby))
595 : {
596 138 : List *tablespaces = NIL;
597 :
598 : /*
599 : * Archive recovery was requested, and thanks to the backup label
600 : * file, we know how far we need to replay to reach consistency. Enter
601 : * archive recovery directly.
602 : */
603 138 : InArchiveRecovery = true;
604 138 : if (StandbyModeRequested)
605 116 : EnableStandbyMode();
606 :
607 : /*
608 : * Omitting backup_label when creating a new replica, PITR node etc.
609 : * unfortunately is a common cause of corruption. Logging that
610 : * backup_label was used makes it a bit easier to exclude that as the
611 : * cause of observed corruption.
612 : *
613 : * Do so before we try to read the checkpoint record (which can fail),
614 : * as otherwise it can be hard to understand why a checkpoint other
615 : * than ControlFile->checkPoint is used.
616 : */
617 138 : ereport(LOG,
618 : (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
619 : LSN_FORMAT_ARGS(RedoStartLSN),
620 : LSN_FORMAT_ARGS(CheckPointLoc),
621 : CheckPointTLI)));
622 :
623 : /*
624 : * When a backup_label file is present, we want to roll forward from
625 : * the checkpoint it identifies, rather than using pg_control.
626 : */
627 138 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
628 : CheckPointTLI);
629 138 : if (record != NULL)
630 : {
631 138 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
632 138 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
633 138 : ereport(DEBUG1,
634 : (errmsg_internal("checkpoint record is at %X/%X",
635 : LSN_FORMAT_ARGS(CheckPointLoc))));
636 138 : InRecovery = true; /* force recovery even if SHUTDOWNED */
637 :
638 : /*
639 : * Make sure that REDO location exists. This may not be the case
640 : * if there was a crash during an online backup, which left a
641 : * backup_label around that references a WAL segment that's
642 : * already been archived.
643 : */
644 138 : if (checkPoint.redo < CheckPointLoc)
645 : {
646 138 : XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
647 138 : if (!ReadRecord(xlogprefetcher, LOG, false,
648 : checkPoint.ThisTimeLineID))
649 0 : ereport(FATAL,
650 : (errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X",
651 : LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
652 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
653 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
654 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
655 : DataDir, DataDir, DataDir, DataDir)));
656 : }
657 : }
658 : else
659 : {
660 0 : ereport(FATAL,
661 : (errmsg("could not locate required checkpoint record at %X/%X",
662 : LSN_FORMAT_ARGS(CheckPointLoc)),
663 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
664 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
665 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
666 : DataDir, DataDir, DataDir, DataDir)));
667 : wasShutdown = false; /* keep compiler quiet */
668 : }
669 :
670 : /* Read the tablespace_map file if present and create symlinks. */
671 138 : if (read_tablespace_map(&tablespaces))
672 : {
673 : ListCell *lc;
674 :
675 8 : foreach(lc, tablespaces)
676 : {
677 4 : tablespaceinfo *ti = lfirst(lc);
678 : char *linkloc;
679 :
680 4 : linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
681 :
682 : /*
683 : * Remove the existing symlink if any and Create the symlink
684 : * under PGDATA.
685 : */
686 4 : remove_tablespace_symlink(linkloc);
687 :
688 4 : if (symlink(ti->path, linkloc) < 0)
689 0 : ereport(ERROR,
690 : (errcode_for_file_access(),
691 : errmsg("could not create symbolic link \"%s\": %m",
692 : linkloc)));
693 :
694 4 : pfree(ti->path);
695 4 : pfree(ti);
696 : }
697 :
698 : /* tell the caller to delete it later */
699 4 : haveTblspcMap = true;
700 : }
701 :
702 : /* tell the caller to delete it later */
703 138 : haveBackupLabel = true;
704 : }
705 : else
706 : {
707 : /* No backup_label file has been found if we are here. */
708 :
709 : /*
710 : * If tablespace_map file is present without backup_label file, there
711 : * is no use of such file. There is no harm in retaining it, but it
712 : * is better to get rid of the map file so that we don't have any
713 : * redundant file in data directory and it will avoid any sort of
714 : * confusion. It seems prudent though to just rename the file out of
715 : * the way rather than delete it completely, also we ignore any error
716 : * that occurs in rename operation as even if map file is present
717 : * without backup_label file, it is harmless.
718 : */
719 1624 : if (stat(TABLESPACE_MAP, &st) == 0)
720 : {
721 2 : unlink(TABLESPACE_MAP_OLD);
722 2 : if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
723 2 : ereport(LOG,
724 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
725 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
726 : errdetail("File \"%s\" was renamed to \"%s\".",
727 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
728 : else
729 0 : ereport(LOG,
730 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
731 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
732 : errdetail("Could not rename file \"%s\" to \"%s\": %m.",
733 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
734 : }
735 :
736 : /*
737 : * It's possible that archive recovery was requested, but we don't
738 : * know how far we need to replay the WAL before we reach consistency.
739 : * This can happen for example if a base backup is taken from a
740 : * running server using an atomic filesystem snapshot, without calling
741 : * pg_backup_start/stop. Or if you just kill a running primary server
742 : * and put it into archive recovery by creating a recovery signal
743 : * file.
744 : *
745 : * Our strategy in that case is to perform crash recovery first,
746 : * replaying all the WAL present in pg_wal, and only enter archive
747 : * recovery after that.
748 : *
749 : * But usually we already know how far we need to replay the WAL (up
750 : * to minRecoveryPoint, up to backupEndPoint, or until we see an
751 : * end-of-backup record), and we can enter archive recovery directly.
752 : */
753 1624 : if (ArchiveRecoveryRequested &&
754 86 : (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
755 18 : ControlFile->backupEndRequired ||
756 18 : ControlFile->backupEndPoint != InvalidXLogRecPtr ||
757 18 : ControlFile->state == DB_SHUTDOWNED))
758 : {
759 82 : InArchiveRecovery = true;
760 82 : if (StandbyModeRequested)
761 82 : EnableStandbyMode();
762 : }
763 :
764 : /*
765 : * For the same reason as when starting up with backup_label present,
766 : * emit a log message when we continue initializing from a base
767 : * backup.
768 : */
769 1624 : if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
770 0 : ereport(LOG,
771 : (errmsg("restarting backup recovery with redo LSN %X/%X",
772 : LSN_FORMAT_ARGS(ControlFile->backupStartPoint))));
773 :
774 : /* Get the last valid checkpoint record. */
775 1624 : CheckPointLoc = ControlFile->checkPoint;
776 1624 : CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
777 1624 : RedoStartLSN = ControlFile->checkPointCopy.redo;
778 1624 : RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
779 1624 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
780 : CheckPointTLI);
781 1624 : if (record != NULL)
782 : {
783 1624 : ereport(DEBUG1,
784 : (errmsg_internal("checkpoint record is at %X/%X",
785 : LSN_FORMAT_ARGS(CheckPointLoc))));
786 : }
787 : else
788 : {
789 : /*
790 : * We used to attempt to go back to a secondary checkpoint record
791 : * here, but only when not in standby mode. We now just fail if we
792 : * can't read the last checkpoint because this allows us to
793 : * simplify processing around checkpoints.
794 : */
795 0 : ereport(PANIC,
796 : (errmsg("could not locate a valid checkpoint record at %X/%X",
797 : LSN_FORMAT_ARGS(CheckPointLoc))));
798 : }
799 1624 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
800 1624 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
801 : }
802 :
803 1762 : if (ArchiveRecoveryRequested)
804 : {
805 212 : if (StandbyModeRequested)
806 202 : ereport(LOG,
807 : (errmsg("entering standby mode")));
808 10 : else if (recoveryTarget == RECOVERY_TARGET_XID)
809 0 : ereport(LOG,
810 : (errmsg("starting point-in-time recovery to XID %u",
811 : recoveryTargetXid)));
812 10 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
813 0 : ereport(LOG,
814 : (errmsg("starting point-in-time recovery to %s",
815 : timestamptz_to_str(recoveryTargetTime))));
816 10 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
817 6 : ereport(LOG,
818 : (errmsg("starting point-in-time recovery to \"%s\"",
819 : recoveryTargetName)));
820 4 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
821 0 : ereport(LOG,
822 : (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
823 : LSN_FORMAT_ARGS(recoveryTargetLSN))));
824 4 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
825 0 : ereport(LOG,
826 : (errmsg("starting point-in-time recovery to earliest consistent point")));
827 : else
828 4 : ereport(LOG,
829 : (errmsg("starting archive recovery")));
830 : }
831 :
832 : /*
833 : * If the location of the checkpoint record is not on the expected
834 : * timeline in the history of the requested timeline, we cannot proceed:
835 : * the backup is not part of the history of the requested timeline.
836 : */
837 : Assert(expectedTLEs); /* was initialized by reading checkpoint
838 : * record */
839 1762 : if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
840 : CheckPointTLI)
841 : {
842 : XLogRecPtr switchpoint;
843 :
844 : /*
845 : * tliSwitchPoint will throw an error if the checkpoint's timeline is
846 : * not in expectedTLEs at all.
847 : */
848 0 : switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
849 0 : ereport(FATAL,
850 : (errmsg("requested timeline %u is not a child of this server's history",
851 : recoveryTargetTLI),
852 : /* translator: %s is a backup_label file or a pg_control file */
853 : errdetail("Latest checkpoint in file \"%s\" is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
854 : haveBackupLabel ? "backup_label" : "pg_control",
855 : LSN_FORMAT_ARGS(CheckPointLoc),
856 : CheckPointTLI,
857 : LSN_FORMAT_ARGS(switchpoint))));
858 : }
859 :
860 : /*
861 : * The min recovery point should be part of the requested timeline's
862 : * history, too.
863 : */
864 1762 : if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
865 80 : tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
866 80 : ControlFile->minRecoveryPointTLI)
867 0 : ereport(FATAL,
868 : (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
869 : recoveryTargetTLI,
870 : LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
871 : ControlFile->minRecoveryPointTLI)));
872 :
873 1762 : ereport(DEBUG1,
874 : (errmsg_internal("redo record is at %X/%X; shutdown %s",
875 : LSN_FORMAT_ARGS(checkPoint.redo),
876 : wasShutdown ? "true" : "false")));
877 1762 : ereport(DEBUG1,
878 : (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
879 : U64FromFullTransactionId(checkPoint.nextXid),
880 : checkPoint.nextOid)));
881 1762 : ereport(DEBUG1,
882 : (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
883 : checkPoint.nextMulti, checkPoint.nextMultiOffset)));
884 1762 : ereport(DEBUG1,
885 : (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
886 : checkPoint.oldestXid, checkPoint.oldestXidDB)));
887 1762 : ereport(DEBUG1,
888 : (errmsg_internal("oldest MultiXactId: %u, in database %u",
889 : checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
890 1762 : ereport(DEBUG1,
891 : (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
892 : checkPoint.oldestCommitTsXid,
893 : checkPoint.newestCommitTsXid)));
894 1762 : if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
895 0 : ereport(PANIC,
896 : (errmsg("invalid next transaction ID")));
897 :
898 : /* sanity check */
899 1762 : if (checkPoint.redo > CheckPointLoc)
900 0 : ereport(PANIC,
901 : (errmsg("invalid redo in checkpoint record")));
902 :
903 : /*
904 : * Check whether we need to force recovery from WAL. If it appears to
905 : * have been a clean shutdown and we did not have a recovery signal file,
906 : * then assume no recovery needed.
907 : */
908 1762 : if (checkPoint.redo < CheckPointLoc)
909 : {
910 218 : if (wasShutdown)
911 0 : ereport(PANIC,
912 : (errmsg("invalid redo record in shutdown checkpoint")));
913 218 : InRecovery = true;
914 : }
915 1544 : else if (ControlFile->state != DB_SHUTDOWNED)
916 188 : InRecovery = true;
917 1356 : else if (ArchiveRecoveryRequested)
918 : {
919 : /* force recovery due to presence of recovery signal file */
920 14 : InRecovery = true;
921 : }
922 :
923 : /*
924 : * If recovery is needed, update our in-memory copy of pg_control to show
925 : * that we are recovering and to show the selected checkpoint as the place
926 : * we are starting from. We also mark pg_control with any minimum recovery
927 : * stop point obtained from a backup history file.
928 : *
929 : * We don't write the changes to disk yet, though. Only do that after
930 : * initializing various subsystems.
931 : */
932 1762 : if (InRecovery)
933 : {
934 420 : if (InArchiveRecovery)
935 : {
936 220 : ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
937 : }
938 : else
939 : {
940 200 : ereport(LOG,
941 : (errmsg("database system was not properly shut down; "
942 : "automatic recovery in progress")));
943 200 : if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
944 4 : ereport(LOG,
945 : (errmsg("crash recovery starts in timeline %u "
946 : "and has target timeline %u",
947 : ControlFile->checkPointCopy.ThisTimeLineID,
948 : recoveryTargetTLI)));
949 200 : ControlFile->state = DB_IN_CRASH_RECOVERY;
950 : }
951 420 : ControlFile->checkPoint = CheckPointLoc;
952 420 : ControlFile->checkPointCopy = checkPoint;
953 420 : if (InArchiveRecovery)
954 : {
955 : /* initialize minRecoveryPoint if not set yet */
956 220 : if (ControlFile->minRecoveryPoint < checkPoint.redo)
957 : {
958 144 : ControlFile->minRecoveryPoint = checkPoint.redo;
959 144 : ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
960 : }
961 : }
962 :
963 : /*
964 : * Set backupStartPoint if we're starting recovery from a base backup.
965 : *
966 : * Also set backupEndPoint and use minRecoveryPoint as the backup end
967 : * location if we're starting recovery from a base backup which was
968 : * taken from a standby. In this case, the database system status in
969 : * pg_control must indicate that the database was already in recovery.
970 : * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
971 : * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
972 : * before reaching this point; e.g. because restore_command or
973 : * primary_conninfo were faulty.
974 : *
975 : * Any other state indicates that the backup somehow became corrupted
976 : * and we can't sensibly continue with recovery.
977 : */
978 420 : if (haveBackupLabel)
979 : {
980 138 : ControlFile->backupStartPoint = checkPoint.redo;
981 138 : ControlFile->backupEndRequired = backupEndRequired;
982 :
983 138 : if (backupFromStandby)
984 : {
985 8 : if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
986 : dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
987 0 : ereport(FATAL,
988 : (errmsg("backup_label contains data inconsistent with control file"),
989 : errhint("This means that the backup is corrupted and you will "
990 : "have to use another backup for recovery.")));
991 8 : ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
992 : }
993 : }
994 : }
995 :
996 : /* remember these, so that we know when we have reached consistency */
997 1762 : backupStartPoint = ControlFile->backupStartPoint;
998 1762 : backupEndRequired = ControlFile->backupEndRequired;
999 1762 : backupEndPoint = ControlFile->backupEndPoint;
1000 1762 : if (InArchiveRecovery)
1001 : {
1002 220 : minRecoveryPoint = ControlFile->minRecoveryPoint;
1003 220 : minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
1004 : }
1005 : else
1006 : {
1007 1542 : minRecoveryPoint = InvalidXLogRecPtr;
1008 1542 : minRecoveryPointTLI = 0;
1009 : }
1010 :
1011 : /*
1012 : * Start recovery assuming that the final record isn't lost.
1013 : */
1014 1762 : abortedRecPtr = InvalidXLogRecPtr;
1015 1762 : missingContrecPtr = InvalidXLogRecPtr;
1016 :
1017 1762 : *wasShutdown_ptr = wasShutdown;
1018 1762 : *haveBackupLabel_ptr = haveBackupLabel;
1019 1762 : *haveTblspcMap_ptr = haveTblspcMap;
1020 1762 : }
1021 :
1022 : /*
1023 : * See if there are any recovery signal files and if so, set state for
1024 : * recovery.
1025 : *
1026 : * See if there is a recovery command file (recovery.conf), and if so
1027 : * throw an ERROR since as of PG12 we no longer recognize that.
1028 : */
1029 : static void
1030 1762 : readRecoverySignalFile(void)
1031 : {
1032 : struct stat stat_buf;
1033 :
1034 1762 : if (IsBootstrapProcessingMode())
1035 1550 : return;
1036 :
1037 : /*
1038 : * Check for old recovery API file: recovery.conf
1039 : */
1040 1672 : if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1041 0 : ereport(FATAL,
1042 : (errcode_for_file_access(),
1043 : errmsg("using recovery command file \"%s\" is not supported",
1044 : RECOVERY_COMMAND_FILE)));
1045 :
1046 : /*
1047 : * Remove unused .done file, if present. Ignore if absent.
1048 : */
1049 1672 : unlink(RECOVERY_COMMAND_DONE);
1050 :
1051 : /*
1052 : * Check for recovery signal files and if found, fsync them since they
1053 : * represent server state information. We don't sweat too much about the
1054 : * possibility of fsync failure, however.
1055 : *
1056 : * If present, standby signal file takes precedence. If neither is present
1057 : * then we won't enter archive recovery.
1058 : */
1059 1672 : if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1060 : {
1061 : int fd;
1062 :
1063 202 : fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1064 : S_IRUSR | S_IWUSR);
1065 202 : if (fd >= 0)
1066 : {
1067 202 : (void) pg_fsync(fd);
1068 202 : close(fd);
1069 : }
1070 202 : standby_signal_file_found = true;
1071 : }
1072 1470 : else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1073 : {
1074 : int fd;
1075 :
1076 10 : fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1077 : S_IRUSR | S_IWUSR);
1078 10 : if (fd >= 0)
1079 : {
1080 10 : (void) pg_fsync(fd);
1081 10 : close(fd);
1082 : }
1083 10 : recovery_signal_file_found = true;
1084 : }
1085 :
1086 1672 : StandbyModeRequested = false;
1087 1672 : ArchiveRecoveryRequested = false;
1088 1672 : if (standby_signal_file_found)
1089 : {
1090 202 : StandbyModeRequested = true;
1091 202 : ArchiveRecoveryRequested = true;
1092 : }
1093 1470 : else if (recovery_signal_file_found)
1094 : {
1095 10 : StandbyModeRequested = false;
1096 10 : ArchiveRecoveryRequested = true;
1097 : }
1098 : else
1099 1460 : return;
1100 :
1101 : /*
1102 : * We don't support standby mode in standalone backends; that requires
1103 : * other processes such as the WAL receiver to be alive.
1104 : */
1105 212 : if (StandbyModeRequested && !IsUnderPostmaster)
1106 0 : ereport(FATAL,
1107 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1108 : errmsg("standby mode is not supported by single-user servers")));
1109 : }
1110 :
1111 : static void
1112 1762 : validateRecoveryParameters(void)
1113 : {
1114 1762 : if (!ArchiveRecoveryRequested)
1115 1550 : return;
1116 :
1117 : /*
1118 : * Check for compulsory parameters
1119 : */
1120 212 : if (StandbyModeRequested)
1121 : {
1122 202 : if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1123 20 : (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1124 4 : ereport(WARNING,
1125 : (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1126 : errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1127 : }
1128 : else
1129 : {
1130 10 : if (recoveryRestoreCommand == NULL ||
1131 10 : strcmp(recoveryRestoreCommand, "") == 0)
1132 0 : ereport(FATAL,
1133 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1134 : errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1135 : }
1136 :
1137 : /*
1138 : * Override any inconsistent requests. Note that this is a change of
1139 : * behaviour in 9.5; prior to this we simply ignored a request to pause if
1140 : * hot_standby = off, which was surprising behaviour.
1141 : */
1142 212 : if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
1143 198 : !EnableHotStandby)
1144 6 : recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
1145 :
1146 : /*
1147 : * Final parsing of recovery_target_time string; see also
1148 : * check_recovery_target_time().
1149 : */
1150 212 : if (recoveryTarget == RECOVERY_TARGET_TIME)
1151 : {
1152 0 : recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
1153 : CStringGetDatum(recovery_target_time_string),
1154 : ObjectIdGetDatum(InvalidOid),
1155 : Int32GetDatum(-1)));
1156 : }
1157 :
1158 : /*
1159 : * If user specified recovery_target_timeline, validate it or compute the
1160 : * "latest" value. We can't do this until after we've gotten the restore
1161 : * command and set InArchiveRecovery, because we need to fetch timeline
1162 : * history files from the archive.
1163 : */
1164 212 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
1165 : {
1166 0 : TimeLineID rtli = recoveryTargetTLIRequested;
1167 :
1168 : /* Timeline 1 does not have a history file, all else should */
1169 0 : if (rtli != 1 && !existsTimeLineHistory(rtli))
1170 0 : ereport(FATAL,
1171 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1172 : errmsg("recovery target timeline %u does not exist",
1173 : rtli)));
1174 0 : recoveryTargetTLI = rtli;
1175 : }
1176 212 : else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
1177 : {
1178 : /* We start the "latest" search from pg_control's timeline */
1179 212 : recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
1180 : }
1181 : else
1182 : {
1183 : /*
1184 : * else we just use the recoveryTargetTLI as already read from
1185 : * ControlFile
1186 : */
1187 : Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
1188 : }
1189 : }
1190 :
1191 : /*
1192 : * read_backup_label: check to see if a backup_label file is present
1193 : *
1194 : * If we see a backup_label during recovery, we assume that we are recovering
1195 : * from a backup dump file, and we therefore roll forward from the checkpoint
1196 : * identified by the label file, NOT what pg_control says. This avoids the
1197 : * problem that pg_control might have been archived one or more checkpoints
1198 : * later than the start of the dump, and so if we rely on it as the start
1199 : * point, we will fail to restore a consistent database state.
1200 : *
1201 : * Returns true if a backup_label was found (and fills the checkpoint
1202 : * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1203 : * returns false if not. If this backup_label came from a streamed backup,
1204 : * *backupEndRequired is set to true. If this backup_label was created during
1205 : * recovery, *backupFromStandby is set to true.
1206 : *
1207 : * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1208 : * and TLI read from the backup file.
1209 : */
1210 : static bool
1211 1762 : read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1212 : bool *backupEndRequired, bool *backupFromStandby)
1213 : {
1214 : char startxlogfilename[MAXFNAMELEN];
1215 : TimeLineID tli_from_walseg,
1216 : tli_from_file;
1217 : FILE *lfp;
1218 : char ch;
1219 : char backuptype[20];
1220 : char backupfrom[20];
1221 : char backuplabel[MAXPGPATH];
1222 : char backuptime[128];
1223 : uint32 hi,
1224 : lo;
1225 :
1226 : /* suppress possible uninitialized-variable warnings */
1227 1762 : *checkPointLoc = InvalidXLogRecPtr;
1228 1762 : *backupLabelTLI = 0;
1229 1762 : *backupEndRequired = false;
1230 1762 : *backupFromStandby = false;
1231 :
1232 : /*
1233 : * See if label file is present
1234 : */
1235 1762 : lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1236 1762 : if (!lfp)
1237 : {
1238 1624 : if (errno != ENOENT)
1239 0 : ereport(FATAL,
1240 : (errcode_for_file_access(),
1241 : errmsg("could not read file \"%s\": %m",
1242 : BACKUP_LABEL_FILE)));
1243 1624 : return false; /* it's not there, all is fine */
1244 : }
1245 :
1246 : /*
1247 : * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1248 : * is pretty crude, but we are not expecting any variability in the file
1249 : * format).
1250 : */
1251 138 : if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1252 138 : &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1253 0 : ereport(FATAL,
1254 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1255 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1256 138 : RedoStartLSN = ((uint64) hi) << 32 | lo;
1257 138 : RedoStartTLI = tli_from_walseg;
1258 138 : if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
1259 138 : &hi, &lo, &ch) != 3 || ch != '\n')
1260 0 : ereport(FATAL,
1261 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1262 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1263 138 : *checkPointLoc = ((uint64) hi) << 32 | lo;
1264 138 : *backupLabelTLI = tli_from_walseg;
1265 :
1266 : /*
1267 : * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1268 : * which could mean either pg_basebackup or the pg_backup_start/stop
1269 : * method was used) or if this label came from somewhere else (the only
1270 : * other option today being from pg_rewind). If this was a streamed
1271 : * backup then we know that we need to play through until we get to the
1272 : * end of the WAL which was generated during the backup (at which point we
1273 : * will have reached consistency and backupEndRequired will be reset to be
1274 : * false).
1275 : */
1276 138 : if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1277 : {
1278 138 : if (strcmp(backuptype, "streamed") == 0)
1279 136 : *backupEndRequired = true;
1280 : }
1281 :
1282 : /*
1283 : * BACKUP FROM lets us know if this was from a primary or a standby. If
1284 : * it was from a standby, we'll double-check that the control file state
1285 : * matches that of a standby.
1286 : */
1287 138 : if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1288 : {
1289 138 : if (strcmp(backupfrom, "standby") == 0)
1290 8 : *backupFromStandby = true;
1291 : }
1292 :
1293 : /*
1294 : * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1295 : * but checking for their presence is useful for debugging and the next
1296 : * sanity checks. Cope also with the fact that the result buffers have a
1297 : * pre-allocated size, hence if the backup_label file has been generated
1298 : * with strings longer than the maximum assumed here an incorrect parsing
1299 : * happens. That's fine as only minor consistency checks are done
1300 : * afterwards.
1301 : */
1302 138 : if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1303 138 : ereport(DEBUG1,
1304 : (errmsg_internal("backup time %s in file \"%s\"",
1305 : backuptime, BACKUP_LABEL_FILE)));
1306 :
1307 138 : if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1308 136 : ereport(DEBUG1,
1309 : (errmsg_internal("backup label %s in file \"%s\"",
1310 : backuplabel, BACKUP_LABEL_FILE)));
1311 :
1312 : /*
1313 : * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1314 : * it as a sanity check if present.
1315 : */
1316 138 : if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1317 : {
1318 136 : if (tli_from_walseg != tli_from_file)
1319 0 : ereport(FATAL,
1320 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1321 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1322 : errdetail("Timeline ID parsed is %u, but expected %u.",
1323 : tli_from_file, tli_from_walseg)));
1324 :
1325 136 : ereport(DEBUG1,
1326 : (errmsg_internal("backup timeline %u in file \"%s\"",
1327 : tli_from_file, BACKUP_LABEL_FILE)));
1328 : }
1329 :
1330 138 : if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0)
1331 0 : ereport(FATAL,
1332 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1333 : errmsg("this is an incremental backup, not a data directory"),
1334 : errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1335 :
1336 138 : if (ferror(lfp) || FreeFile(lfp))
1337 0 : ereport(FATAL,
1338 : (errcode_for_file_access(),
1339 : errmsg("could not read file \"%s\": %m",
1340 : BACKUP_LABEL_FILE)));
1341 :
1342 138 : return true;
1343 : }
1344 :
1345 : /*
1346 : * read_tablespace_map: check to see if a tablespace_map file is present
1347 : *
1348 : * If we see a tablespace_map file during recovery, we assume that we are
1349 : * recovering from a backup dump file, and we therefore need to create symlinks
1350 : * as per the information present in tablespace_map file.
1351 : *
1352 : * Returns true if a tablespace_map file was found (and fills *tablespaces
1353 : * with a tablespaceinfo struct for each tablespace listed in the file);
1354 : * returns false if not.
1355 : */
1356 : static bool
1357 138 : read_tablespace_map(List **tablespaces)
1358 : {
1359 : tablespaceinfo *ti;
1360 : FILE *lfp;
1361 : char str[MAXPGPATH];
1362 : int ch,
1363 : i,
1364 : n;
1365 : bool was_backslash;
1366 :
1367 : /*
1368 : * See if tablespace_map file is present
1369 : */
1370 138 : lfp = AllocateFile(TABLESPACE_MAP, "r");
1371 138 : if (!lfp)
1372 : {
1373 134 : if (errno != ENOENT)
1374 0 : ereport(FATAL,
1375 : (errcode_for_file_access(),
1376 : errmsg("could not read file \"%s\": %m",
1377 : TABLESPACE_MAP)));
1378 134 : return false; /* it's not there, all is fine */
1379 : }
1380 :
1381 : /*
1382 : * Read and parse the link name and path lines from tablespace_map file
1383 : * (this code is pretty crude, but we are not expecting any variability in
1384 : * the file format). De-escape any backslashes that were inserted.
1385 : */
1386 4 : i = 0;
1387 4 : was_backslash = false;
1388 154 : while ((ch = fgetc(lfp)) != EOF)
1389 : {
1390 150 : if (!was_backslash && (ch == '\n' || ch == '\r'))
1391 : {
1392 : char *endp;
1393 :
1394 4 : if (i == 0)
1395 0 : continue; /* \r immediately followed by \n */
1396 :
1397 : /*
1398 : * The de-escaped line should contain an OID followed by exactly
1399 : * one space followed by a path. The path might start with
1400 : * spaces, so don't be too liberal about parsing.
1401 : */
1402 4 : str[i] = '\0';
1403 4 : n = 0;
1404 24 : while (str[n] && str[n] != ' ')
1405 20 : n++;
1406 4 : if (n < 1 || n >= i - 1)
1407 0 : ereport(FATAL,
1408 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1409 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1410 4 : str[n++] = '\0';
1411 :
1412 4 : ti = palloc0(sizeof(tablespaceinfo));
1413 4 : errno = 0;
1414 4 : ti->oid = strtoul(str, &endp, 10);
1415 4 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1416 0 : ereport(FATAL,
1417 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1418 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1419 4 : ti->path = pstrdup(str + n);
1420 4 : *tablespaces = lappend(*tablespaces, ti);
1421 :
1422 4 : i = 0;
1423 4 : continue;
1424 : }
1425 146 : else if (!was_backslash && ch == '\\')
1426 0 : was_backslash = true;
1427 : else
1428 : {
1429 146 : if (i < sizeof(str) - 1)
1430 146 : str[i++] = ch;
1431 146 : was_backslash = false;
1432 : }
1433 : }
1434 :
1435 4 : if (i != 0 || was_backslash) /* last line not terminated? */
1436 0 : ereport(FATAL,
1437 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1438 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1439 :
1440 4 : if (ferror(lfp) || FreeFile(lfp))
1441 0 : ereport(FATAL,
1442 : (errcode_for_file_access(),
1443 : errmsg("could not read file \"%s\": %m",
1444 : TABLESPACE_MAP)));
1445 :
1446 4 : return true;
1447 : }
1448 :
1449 : /*
1450 : * Finish WAL recovery.
1451 : *
1452 : * This does not close the 'xlogreader' yet, because in some cases the caller
1453 : * still wants to re-read the last checkpoint record by calling
1454 : * ReadCheckpointRecord().
1455 : *
1456 : * Returns the position of the last valid or applied record, after which new
1457 : * WAL should be appended, information about why recovery was ended, and some
1458 : * other things. See the EndOfWalRecoveryInfo struct for details.
1459 : */
1460 : EndOfWalRecoveryInfo *
1461 1648 : FinishWalRecovery(void)
1462 : {
1463 1648 : EndOfWalRecoveryInfo *result = palloc(sizeof(EndOfWalRecoveryInfo));
1464 : XLogRecPtr lastRec;
1465 : TimeLineID lastRecTLI;
1466 : XLogRecPtr endOfLog;
1467 :
1468 : /*
1469 : * Kill WAL receiver, if it's still running, before we continue to write
1470 : * the startup checkpoint and aborted-contrecord records. It will trump
1471 : * over these records and subsequent ones if it's still alive when we
1472 : * start writing WAL.
1473 : */
1474 1648 : XLogShutdownWalRcv();
1475 :
1476 : /*
1477 : * Shutdown the slot sync worker to drop any temporary slots acquired by
1478 : * it and to prevent it from keep trying to fetch the failover slots.
1479 : *
1480 : * We do not update the 'synced' column in 'pg_replication_slots' system
1481 : * view from true to false here, as any failed update could leave 'synced'
1482 : * column false for some slots. This could cause issues during slot sync
1483 : * after restarting the server as a standby. While updating the 'synced'
1484 : * column after switching to the new timeline is an option, it does not
1485 : * simplify the handling for the 'synced' column. Therefore, we retain the
1486 : * 'synced' column as true after promotion as it may provide useful
1487 : * information about the slot origin.
1488 : */
1489 1648 : ShutDownSlotSync();
1490 :
1491 : /*
1492 : * We are now done reading the xlog from stream. Turn off streaming
1493 : * recovery to force fetching the files (which would be required at end of
1494 : * recovery, e.g., timeline history file) from archive or pg_wal.
1495 : *
1496 : * Note that standby mode must be turned off after killing WAL receiver,
1497 : * i.e., calling XLogShutdownWalRcv().
1498 : */
1499 : Assert(!WalRcvStreaming());
1500 1648 : StandbyMode = false;
1501 :
1502 : /*
1503 : * Determine where to start writing WAL next.
1504 : *
1505 : * Re-fetch the last valid or last applied record, so we can identify the
1506 : * exact endpoint of what we consider the valid portion of WAL. There may
1507 : * be an incomplete continuation record after that, in which case
1508 : * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1509 : * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1510 : * it is intentionally missing. See CreateOverwriteContrecordRecord().
1511 : *
1512 : * An important side-effect of this is to load the last page into
1513 : * xlogreader. The caller uses it to initialize the WAL for writing.
1514 : */
1515 1648 : if (!InRecovery)
1516 : {
1517 1342 : lastRec = CheckPointLoc;
1518 1342 : lastRecTLI = CheckPointTLI;
1519 : }
1520 : else
1521 : {
1522 306 : lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
1523 306 : lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1524 : }
1525 1648 : XLogPrefetcherBeginRead(xlogprefetcher, lastRec);
1526 1648 : (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1527 1648 : endOfLog = xlogreader->EndRecPtr;
1528 :
1529 : /*
1530 : * Remember the TLI in the filename of the XLOG segment containing the
1531 : * end-of-log. It could be different from the timeline that endOfLog
1532 : * nominally belongs to, if there was a timeline switch in that segment,
1533 : * and we were reading the old WAL from a segment belonging to a higher
1534 : * timeline.
1535 : */
1536 1648 : result->endOfLogTLI = xlogreader->seg.ws_tli;
1537 :
1538 1648 : if (ArchiveRecoveryRequested)
1539 : {
1540 : /*
1541 : * We are no longer in archive recovery state.
1542 : *
1543 : * We are now done reading the old WAL. Turn off archive fetching if
1544 : * it was active.
1545 : */
1546 : Assert(InArchiveRecovery);
1547 98 : InArchiveRecovery = false;
1548 :
1549 : /*
1550 : * If the ending log segment is still open, close it (to avoid
1551 : * problems on Windows with trying to rename or delete an open file).
1552 : */
1553 98 : if (readFile >= 0)
1554 : {
1555 98 : close(readFile);
1556 98 : readFile = -1;
1557 : }
1558 : }
1559 :
1560 : /*
1561 : * Copy the last partial block to the caller, for initializing the WAL
1562 : * buffer for appending new WAL.
1563 : */
1564 1648 : if (endOfLog % XLOG_BLCKSZ != 0)
1565 : {
1566 : char *page;
1567 : int len;
1568 : XLogRecPtr pageBeginPtr;
1569 :
1570 1606 : pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1571 : Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1572 :
1573 : /* Copy the valid part of the last block */
1574 1606 : len = endOfLog % XLOG_BLCKSZ;
1575 1606 : page = palloc(len);
1576 1606 : memcpy(page, xlogreader->readBuf, len);
1577 :
1578 1606 : result->lastPageBeginPtr = pageBeginPtr;
1579 1606 : result->lastPage = page;
1580 : }
1581 : else
1582 : {
1583 : /* There is no partial block to copy. */
1584 42 : result->lastPageBeginPtr = endOfLog;
1585 42 : result->lastPage = NULL;
1586 : }
1587 :
1588 : /*
1589 : * Create a comment for the history file to explain why and where timeline
1590 : * changed.
1591 : */
1592 1648 : result->recoveryStopReason = getRecoveryStopReason();
1593 :
1594 1648 : result->lastRec = lastRec;
1595 1648 : result->lastRecTLI = lastRecTLI;
1596 1648 : result->endOfLog = endOfLog;
1597 :
1598 1648 : result->abortedRecPtr = abortedRecPtr;
1599 1648 : result->missingContrecPtr = missingContrecPtr;
1600 :
1601 1648 : result->standby_signal_file_found = standby_signal_file_found;
1602 1648 : result->recovery_signal_file_found = recovery_signal_file_found;
1603 :
1604 1648 : return result;
1605 : }
1606 :
1607 : /*
1608 : * Clean up the WAL reader and leftovers from restoring WAL from archive
1609 : */
1610 : void
1611 1648 : ShutdownWalRecovery(void)
1612 : {
1613 : char recoveryPath[MAXPGPATH];
1614 :
1615 : /* Final update of pg_stat_recovery_prefetch. */
1616 1648 : XLogPrefetcherComputeStats(xlogprefetcher);
1617 :
1618 : /* Shut down xlogreader */
1619 1648 : if (readFile >= 0)
1620 : {
1621 1550 : close(readFile);
1622 1550 : readFile = -1;
1623 : }
1624 1648 : XLogReaderFree(xlogreader);
1625 1648 : XLogPrefetcherFree(xlogprefetcher);
1626 :
1627 1648 : if (ArchiveRecoveryRequested)
1628 : {
1629 : /*
1630 : * Since there might be a partial WAL segment named RECOVERYXLOG, get
1631 : * rid of it.
1632 : */
1633 98 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1634 98 : unlink(recoveryPath); /* ignore any error */
1635 :
1636 : /* Get rid of any remaining recovered timeline-history file, too */
1637 98 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1638 98 : unlink(recoveryPath); /* ignore any error */
1639 : }
1640 :
1641 : /*
1642 : * We don't need the latch anymore. It's not strictly necessary to disown
1643 : * it, but let's do it for the sake of tidiness.
1644 : */
1645 1648 : if (ArchiveRecoveryRequested)
1646 98 : DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
1647 1648 : }
1648 :
1649 : /*
1650 : * Perform WAL recovery.
1651 : *
1652 : * If the system was shut down cleanly, this is never called.
1653 : */
1654 : void
1655 418 : PerformWalRecovery(void)
1656 : {
1657 : XLogRecord *record;
1658 418 : bool reachedRecoveryTarget = false;
1659 : TimeLineID replayTLI;
1660 :
1661 : /*
1662 : * Initialize shared variables for tracking progress of WAL replay, as if
1663 : * we had just replayed the record before the REDO location (or the
1664 : * checkpoint record itself, if it's a shutdown checkpoint).
1665 : */
1666 418 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1667 418 : if (RedoStartLSN < CheckPointLoc)
1668 : {
1669 216 : XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
1670 216 : XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
1671 216 : XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
1672 : }
1673 : else
1674 : {
1675 202 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
1676 202 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
1677 202 : XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
1678 : }
1679 418 : XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
1680 418 : XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
1681 418 : XLogRecoveryCtl->recoveryLastXTime = 0;
1682 418 : XLogRecoveryCtl->currentChunkStartTime = 0;
1683 418 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
1684 418 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1685 :
1686 : /* Also ensure XLogReceiptTime has a sane value */
1687 418 : XLogReceiptTime = GetCurrentTimestamp();
1688 :
1689 : /*
1690 : * Let postmaster know we've started redo now, so that it can launch the
1691 : * archiver if necessary.
1692 : */
1693 418 : if (IsUnderPostmaster)
1694 400 : SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
1695 :
1696 : /*
1697 : * Allow read-only connections immediately if we're consistent already.
1698 : */
1699 418 : CheckRecoveryConsistency();
1700 :
1701 : /*
1702 : * Find the first record that logically follows the checkpoint --- it
1703 : * might physically precede it, though.
1704 : */
1705 418 : if (RedoStartLSN < CheckPointLoc)
1706 : {
1707 : /* back up to find the record */
1708 216 : replayTLI = RedoStartTLI;
1709 216 : XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
1710 216 : record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1711 :
1712 : /*
1713 : * If a checkpoint record's redo pointer points back to an earlier
1714 : * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1715 : * record.
1716 : */
1717 216 : if (record->xl_rmid != RM_XLOG_ID ||
1718 216 : (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1719 0 : ereport(FATAL,
1720 : (errmsg("unexpected record type found at redo point %X/%X",
1721 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
1722 : }
1723 : else
1724 : {
1725 : /* just have to read next record after CheckPoint */
1726 : Assert(xlogreader->ReadRecPtr == CheckPointLoc);
1727 202 : replayTLI = CheckPointTLI;
1728 202 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1729 : }
1730 :
1731 418 : if (record != NULL)
1732 : {
1733 : TimestampTz xtime;
1734 : PGRUsage ru0;
1735 :
1736 400 : pg_rusage_init(&ru0);
1737 :
1738 400 : InRedo = true;
1739 :
1740 400 : RmgrStartup();
1741 :
1742 400 : ereport(LOG,
1743 : (errmsg("redo starts at %X/%X",
1744 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
1745 :
1746 : /* Prepare to report progress of the redo phase. */
1747 400 : if (!StandbyMode)
1748 210 : begin_startup_progress_phase();
1749 :
1750 : /*
1751 : * main redo apply loop
1752 : */
1753 : do
1754 : {
1755 5420846 : if (!StandbyMode)
1756 527922 : ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
1757 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
1758 :
1759 : #ifdef WAL_DEBUG
1760 : if (XLOG_DEBUG)
1761 : {
1762 : StringInfoData buf;
1763 :
1764 : initStringInfo(&buf);
1765 : appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
1766 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1767 : LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
1768 : xlog_outrec(&buf, xlogreader);
1769 : appendStringInfoString(&buf, " - ");
1770 : xlog_outdesc(&buf, xlogreader);
1771 : elog(LOG, "%s", buf.data);
1772 : pfree(buf.data);
1773 : }
1774 : #endif
1775 :
1776 : /* Handle interrupt signals of startup process */
1777 5420846 : ProcessStartupProcInterrupts();
1778 :
1779 : /*
1780 : * Pause WAL replay, if requested by a hot-standby session via
1781 : * SetRecoveryPause().
1782 : *
1783 : * Note that we intentionally don't take the info_lck spinlock
1784 : * here. We might therefore read a slightly stale value of the
1785 : * recoveryPause flag, but it can't be very stale (no worse than
1786 : * the last spinlock we did acquire). Since a pause request is a
1787 : * pretty asynchronous thing anyway, possibly responding to it one
1788 : * WAL record later than we otherwise would is a minor issue, so
1789 : * it doesn't seem worth adding another spinlock cycle to prevent
1790 : * that.
1791 : */
1792 5420846 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1793 : RECOVERY_NOT_PAUSED)
1794 0 : recoveryPausesHere(false);
1795 :
1796 : /*
1797 : * Have we reached our recovery target?
1798 : */
1799 5420846 : if (recoveryStopsBefore(xlogreader))
1800 : {
1801 2 : reachedRecoveryTarget = true;
1802 2 : break;
1803 : }
1804 :
1805 : /*
1806 : * If we've been asked to lag the primary, wait on latch until
1807 : * enough time has passed.
1808 : */
1809 5420844 : if (recoveryApplyDelay(xlogreader))
1810 : {
1811 : /*
1812 : * We test for paused recovery again here. If user sets
1813 : * delayed apply, it may be because they expect to pause
1814 : * recovery in case of problems, so we must test again here
1815 : * otherwise pausing during the delay-wait wouldn't work.
1816 : */
1817 0 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1818 : RECOVERY_NOT_PAUSED)
1819 0 : recoveryPausesHere(false);
1820 : }
1821 :
1822 : /*
1823 : * Apply the record
1824 : */
1825 5420844 : ApplyWalRecord(xlogreader, record, &replayTLI);
1826 :
1827 : /* Exit loop if we reached inclusive recovery target */
1828 5420840 : if (recoveryStopsAfter(xlogreader))
1829 : {
1830 12 : reachedRecoveryTarget = true;
1831 12 : break;
1832 : }
1833 :
1834 : /* Else, try to fetch the next WAL record */
1835 5420828 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1836 5420722 : } while (record != NULL);
1837 :
1838 : /*
1839 : * end of main redo apply loop
1840 : */
1841 :
1842 290 : if (reachedRecoveryTarget)
1843 : {
1844 14 : if (!reachedConsistency)
1845 0 : ereport(FATAL,
1846 : (errmsg("requested recovery stop point is before consistent recovery point")));
1847 :
1848 : /*
1849 : * This is the last point where we can restart recovery with a new
1850 : * recovery target, if we shutdown and begin again. After this,
1851 : * Resource Managers may choose to do permanent corrective actions
1852 : * at end of recovery.
1853 : */
1854 14 : switch (recoveryTargetAction)
1855 : {
1856 0 : case RECOVERY_TARGET_ACTION_SHUTDOWN:
1857 :
1858 : /*
1859 : * exit with special return code to request shutdown of
1860 : * postmaster. Log messages issued from postmaster.
1861 : */
1862 0 : proc_exit(3);
1863 :
1864 2 : case RECOVERY_TARGET_ACTION_PAUSE:
1865 2 : SetRecoveryPause(true);
1866 2 : recoveryPausesHere(true);
1867 :
1868 : /* drop into promote */
1869 :
1870 14 : case RECOVERY_TARGET_ACTION_PROMOTE:
1871 14 : break;
1872 : }
1873 276 : }
1874 :
1875 290 : RmgrCleanup();
1876 :
1877 290 : ereport(LOG,
1878 : (errmsg("redo done at %X/%X system usage: %s",
1879 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1880 : pg_rusage_show(&ru0))));
1881 290 : xtime = GetLatestXTime();
1882 290 : if (xtime)
1883 70 : ereport(LOG,
1884 : (errmsg("last completed transaction was at log time %s",
1885 : timestamptz_to_str(xtime))));
1886 :
1887 290 : InRedo = false;
1888 : }
1889 : else
1890 : {
1891 : /* there are no WAL records following the checkpoint */
1892 18 : ereport(LOG,
1893 : (errmsg("redo is not required")));
1894 : }
1895 :
1896 : /*
1897 : * This check is intentionally after the above log messages that indicate
1898 : * how far recovery went.
1899 : */
1900 308 : if (ArchiveRecoveryRequested &&
1901 100 : recoveryTarget != RECOVERY_TARGET_UNSET &&
1902 16 : !reachedRecoveryTarget)
1903 2 : ereport(FATAL,
1904 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
1905 : errmsg("recovery ended before configured recovery target was reached")));
1906 306 : }
1907 :
1908 : /*
1909 : * Subroutine of PerformWalRecovery, to apply one WAL record.
1910 : */
1911 : static void
1912 5420844 : ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
1913 : {
1914 : ErrorContextCallback errcallback;
1915 5420844 : bool switchedTLI = false;
1916 :
1917 : /* Setup error traceback support for ereport() */
1918 5420844 : errcallback.callback = rm_redo_error_callback;
1919 5420844 : errcallback.arg = xlogreader;
1920 5420844 : errcallback.previous = error_context_stack;
1921 5420844 : error_context_stack = &errcallback;
1922 :
1923 : /*
1924 : * TransamVariables->nextXid must be beyond record's xid.
1925 : */
1926 5420844 : AdvanceNextFullTransactionIdPastXid(record->xl_xid);
1927 :
1928 : /*
1929 : * Before replaying this record, check if this record causes the current
1930 : * timeline to change. The record is already considered to be part of the
1931 : * new timeline, so we update replayTLI before replaying it. That's
1932 : * important so that replayEndTLI, which is recorded as the minimum
1933 : * recovery point's TLI if recovery stops after this record, is set
1934 : * correctly.
1935 : */
1936 5420844 : if (record->xl_rmid == RM_XLOG_ID)
1937 : {
1938 79542 : TimeLineID newReplayTLI = *replayTLI;
1939 79542 : TimeLineID prevReplayTLI = *replayTLI;
1940 79542 : uint8 info = record->xl_info & ~XLR_INFO_MASK;
1941 :
1942 79542 : if (info == XLOG_CHECKPOINT_SHUTDOWN)
1943 : {
1944 : CheckPoint checkPoint;
1945 :
1946 68 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1947 68 : newReplayTLI = checkPoint.ThisTimeLineID;
1948 68 : prevReplayTLI = checkPoint.PrevTimeLineID;
1949 : }
1950 79474 : else if (info == XLOG_END_OF_RECOVERY)
1951 : {
1952 : xl_end_of_recovery xlrec;
1953 :
1954 20 : memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1955 20 : newReplayTLI = xlrec.ThisTimeLineID;
1956 20 : prevReplayTLI = xlrec.PrevTimeLineID;
1957 : }
1958 :
1959 79542 : if (newReplayTLI != *replayTLI)
1960 : {
1961 : /* Check that it's OK to switch to this TLI */
1962 22 : checkTimeLineSwitch(xlogreader->EndRecPtr,
1963 : newReplayTLI, prevReplayTLI, *replayTLI);
1964 :
1965 : /* Following WAL records should be run with new TLI */
1966 22 : *replayTLI = newReplayTLI;
1967 22 : switchedTLI = true;
1968 : }
1969 : }
1970 :
1971 : /*
1972 : * Update shared replayEndRecPtr before replaying this record, so that
1973 : * XLogFlush will update minRecoveryPoint correctly.
1974 : */
1975 5420844 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1976 5420844 : XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
1977 5420844 : XLogRecoveryCtl->replayEndTLI = *replayTLI;
1978 5420844 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1979 :
1980 : /*
1981 : * If we are attempting to enter Hot Standby mode, process XIDs we see
1982 : */
1983 5420844 : if (standbyState >= STANDBY_INITIALIZED &&
1984 4933022 : TransactionIdIsValid(record->xl_xid))
1985 4841074 : RecordKnownAssignedTransactionIds(record->xl_xid);
1986 :
1987 : /*
1988 : * Some XLOG record types that are related to recovery are processed
1989 : * directly here, rather than in xlog_redo()
1990 : */
1991 5420844 : if (record->xl_rmid == RM_XLOG_ID)
1992 79542 : xlogrecovery_redo(xlogreader, *replayTLI);
1993 :
1994 : /* Now apply the WAL record itself */
1995 5420844 : GetRmgr(record->xl_rmid).rm_redo(xlogreader);
1996 :
1997 : /*
1998 : * After redo, check whether the backup pages associated with the WAL
1999 : * record are consistent with the existing pages. This check is done only
2000 : * if consistency check is enabled for this record.
2001 : */
2002 5420840 : if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2003 4275932 : verifyBackupPageConsistency(xlogreader);
2004 :
2005 : /* Pop the error context stack */
2006 5420840 : error_context_stack = errcallback.previous;
2007 :
2008 : /*
2009 : * Update lastReplayedEndRecPtr after this record has been successfully
2010 : * replayed.
2011 : */
2012 5420840 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2013 5420840 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
2014 5420840 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
2015 5420840 : XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2016 5420840 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2017 :
2018 : /* ------
2019 : * Wakeup walsenders:
2020 : *
2021 : * On the standby, the WAL is flushed first (which will only wake up
2022 : * physical walsenders) and then applied, which will only wake up logical
2023 : * walsenders.
2024 : *
2025 : * Indeed, logical walsenders on standby can't decode and send data until
2026 : * it's been applied.
2027 : *
2028 : * Physical walsenders don't need to be woken up during replay unless
2029 : * cascading replication is allowed and time line change occurred (so that
2030 : * they can notice that they are on a new time line).
2031 : *
2032 : * That's why the wake up conditions are for:
2033 : *
2034 : * - physical walsenders in case of new time line and cascade
2035 : * replication is allowed
2036 : * - logical walsenders in case cascade replication is allowed (could not
2037 : * be created otherwise)
2038 : * ------
2039 : */
2040 5420840 : if (AllowCascadeReplication())
2041 5042554 : WalSndWakeup(switchedTLI, true);
2042 :
2043 : /*
2044 : * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2045 : * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2046 : * a reply to the primary.
2047 : */
2048 5420840 : if (doRequestWalReceiverReply)
2049 : {
2050 4 : doRequestWalReceiverReply = false;
2051 4 : WalRcvForceReply();
2052 : }
2053 :
2054 : /* Allow read-only connections if we're consistent now */
2055 5420840 : CheckRecoveryConsistency();
2056 :
2057 : /* Is this a timeline switch? */
2058 5420840 : if (switchedTLI)
2059 : {
2060 : /*
2061 : * Before we continue on the new timeline, clean up any (possibly
2062 : * bogus) future WAL segments on the old timeline.
2063 : */
2064 22 : RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
2065 :
2066 : /* Reset the prefetcher. */
2067 22 : XLogPrefetchReconfigure();
2068 : }
2069 5420840 : }
2070 :
2071 : /*
2072 : * Some XLOG RM record types that are directly related to WAL recovery are
2073 : * handled here rather than in the xlog_redo()
2074 : */
2075 : static void
2076 79542 : xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
2077 : {
2078 79542 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2079 79542 : XLogRecPtr lsn = record->EndRecPtr;
2080 :
2081 : Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2082 :
2083 79542 : if (info == XLOG_OVERWRITE_CONTRECORD)
2084 : {
2085 : /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2086 : xl_overwrite_contrecord xlrec;
2087 :
2088 2 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2089 2 : if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2090 0 : elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
2091 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2092 : LSN_FORMAT_ARGS(record->overwrittenRecPtr));
2093 :
2094 : /* We have safely skipped the aborted record */
2095 2 : abortedRecPtr = InvalidXLogRecPtr;
2096 2 : missingContrecPtr = InvalidXLogRecPtr;
2097 :
2098 2 : ereport(LOG,
2099 : (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
2100 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2101 : timestamptz_to_str(xlrec.overwrite_time))));
2102 :
2103 : /* Verifying the record should only happen once */
2104 2 : record->overwrittenRecPtr = InvalidXLogRecPtr;
2105 : }
2106 79540 : else if (info == XLOG_BACKUP_END)
2107 : {
2108 : XLogRecPtr startpoint;
2109 :
2110 166 : memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2111 :
2112 166 : if (backupStartPoint == startpoint)
2113 : {
2114 : /*
2115 : * We have reached the end of base backup, the point where
2116 : * pg_backup_stop() was done. The data on disk is now consistent
2117 : * (assuming we have also reached minRecoveryPoint). Set
2118 : * backupEndPoint to the current LSN, so that the next call to
2119 : * CheckRecoveryConsistency() will notice it and do the
2120 : * end-of-backup processing.
2121 : */
2122 134 : elog(DEBUG1, "end of backup record reached");
2123 :
2124 134 : backupEndPoint = lsn;
2125 : }
2126 : else
2127 32 : elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
2128 : LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
2129 : }
2130 79542 : }
2131 :
2132 : /*
2133 : * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2134 : * directories.
2135 : *
2136 : * Replay of database creation XLOG records for databases that were later
2137 : * dropped can create fake directories in pg_tblspc. By the time consistency
2138 : * is reached these directories should have been removed; here we verify
2139 : * that this did indeed happen. This is to be called at the point where
2140 : * consistent state is reached.
2141 : *
2142 : * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2143 : * useful for testing purposes, and also allows for an escape hatch in case
2144 : * things go south.
2145 : */
2146 : static void
2147 222 : CheckTablespaceDirectory(void)
2148 : {
2149 : DIR *dir;
2150 : struct dirent *de;
2151 :
2152 222 : dir = AllocateDir(PG_TBLSPC_DIR);
2153 680 : while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2154 : {
2155 : char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2156 :
2157 : /* Skip entries of non-oid names */
2158 458 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2159 444 : continue;
2160 :
2161 14 : snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2162 :
2163 14 : if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2164 8 : ereport(allow_in_place_tablespaces ? WARNING : PANIC,
2165 : (errcode(ERRCODE_DATA_CORRUPTED),
2166 : errmsg("unexpected directory entry \"%s\" found in %s",
2167 : de->d_name, PG_TBLSPC_DIR),
2168 : errdetail("All directory entries in %s/ should be symbolic links.",
2169 : PG_TBLSPC_DIR),
2170 : errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2171 : }
2172 222 : }
2173 :
2174 : /*
2175 : * Checks if recovery has reached a consistent state. When consistency is
2176 : * reached and we have a valid starting standby snapshot, tell postmaster
2177 : * that it can start accepting read-only connections.
2178 : */
2179 : static void
2180 5421262 : CheckRecoveryConsistency(void)
2181 : {
2182 : XLogRecPtr lastReplayedEndRecPtr;
2183 : TimeLineID lastReplayedTLI;
2184 :
2185 : /*
2186 : * During crash recovery, we don't reach a consistent state until we've
2187 : * replayed all the WAL.
2188 : */
2189 5421262 : if (XLogRecPtrIsInvalid(minRecoveryPoint))
2190 517692 : return;
2191 :
2192 : Assert(InArchiveRecovery);
2193 :
2194 : /*
2195 : * assume that we are called in the startup process, and hence don't need
2196 : * a lock to read lastReplayedEndRecPtr
2197 : */
2198 4903570 : lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2199 4903570 : lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2200 :
2201 : /*
2202 : * Have we reached the point where our base backup was completed?
2203 : */
2204 4903570 : if (!XLogRecPtrIsInvalid(backupEndPoint) &&
2205 200 : backupEndPoint <= lastReplayedEndRecPtr)
2206 : {
2207 138 : XLogRecPtr saveBackupStartPoint = backupStartPoint;
2208 138 : XLogRecPtr saveBackupEndPoint = backupEndPoint;
2209 :
2210 138 : elog(DEBUG1, "end of backup reached");
2211 :
2212 : /*
2213 : * We have reached the end of base backup, as indicated by pg_control.
2214 : * Update the control file accordingly.
2215 : */
2216 138 : ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2217 138 : backupStartPoint = InvalidXLogRecPtr;
2218 138 : backupEndPoint = InvalidXLogRecPtr;
2219 138 : backupEndRequired = false;
2220 :
2221 138 : ereport(LOG,
2222 : (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
2223 : LSN_FORMAT_ARGS(saveBackupStartPoint),
2224 : LSN_FORMAT_ARGS(saveBackupEndPoint))));
2225 : }
2226 :
2227 : /*
2228 : * Have we passed our safe starting point? Note that minRecoveryPoint is
2229 : * known to be incorrectly set if recovering from a backup, until the
2230 : * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2231 : * All we know prior to that is that we're not consistent yet.
2232 : */
2233 4903570 : if (!reachedConsistency && !backupEndRequired &&
2234 14808 : minRecoveryPoint <= lastReplayedEndRecPtr)
2235 : {
2236 : /*
2237 : * Check to see if the XLOG sequence contained any unresolved
2238 : * references to uninitialized pages.
2239 : */
2240 222 : XLogCheckInvalidPages();
2241 :
2242 : /*
2243 : * Check that pg_tblspc doesn't contain any real directories. Replay
2244 : * of Database/CREATE_* records may have created fictitious tablespace
2245 : * directories that should have been removed by the time consistency
2246 : * was reached.
2247 : */
2248 222 : CheckTablespaceDirectory();
2249 :
2250 222 : reachedConsistency = true;
2251 222 : ereport(LOG,
2252 : (errmsg("consistent recovery state reached at %X/%X",
2253 : LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
2254 : }
2255 :
2256 : /*
2257 : * Have we got a valid starting snapshot that will allow queries to be
2258 : * run? If so, we can tell postmaster that the database is consistent now,
2259 : * enabling connections.
2260 : */
2261 4903570 : if (standbyState == STANDBY_SNAPSHOT_READY &&
2262 4903134 : !LocalHotStandbyActive &&
2263 206 : reachedConsistency &&
2264 : IsUnderPostmaster)
2265 : {
2266 206 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2267 206 : XLogRecoveryCtl->SharedHotStandbyActive = true;
2268 206 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2269 :
2270 206 : LocalHotStandbyActive = true;
2271 :
2272 206 : SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
2273 : }
2274 : }
2275 :
2276 : /*
2277 : * Error context callback for errors occurring during rm_redo().
2278 : */
2279 : static void
2280 204 : rm_redo_error_callback(void *arg)
2281 : {
2282 204 : XLogReaderState *record = (XLogReaderState *) arg;
2283 : StringInfoData buf;
2284 :
2285 204 : initStringInfo(&buf);
2286 204 : xlog_outdesc(&buf, record);
2287 204 : xlog_block_info(&buf, record);
2288 :
2289 : /* translator: %s is a WAL record description */
2290 204 : errcontext("WAL redo at %X/%X for %s",
2291 204 : LSN_FORMAT_ARGS(record->ReadRecPtr),
2292 : buf.data);
2293 :
2294 204 : pfree(buf.data);
2295 204 : }
2296 :
2297 : /*
2298 : * Returns a string describing an XLogRecord, consisting of its identity
2299 : * optionally followed by a colon, a space, and a further description.
2300 : */
2301 : void
2302 204 : xlog_outdesc(StringInfo buf, XLogReaderState *record)
2303 : {
2304 204 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2305 204 : uint8 info = XLogRecGetInfo(record);
2306 : const char *id;
2307 :
2308 204 : appendStringInfoString(buf, rmgr.rm_name);
2309 204 : appendStringInfoChar(buf, '/');
2310 :
2311 204 : id = rmgr.rm_identify(info);
2312 204 : if (id == NULL)
2313 0 : appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2314 : else
2315 204 : appendStringInfo(buf, "%s: ", id);
2316 :
2317 204 : rmgr.rm_desc(buf, record);
2318 204 : }
2319 :
#ifdef WAL_DEBUG

/*
 * Append the record's header details to 'buf': previous-record LSN, xid and
 * data length, followed by its block references.  Only compiled when
 * WAL_DEBUG is defined.
 */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	XLogRecPtr	prevLsn = XLogRecGetPrev(record);

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 LSN_FORMAT_ARGS(prevLsn),
					 XLogRecGetXid(record));

	appendStringInfo(buf, "; len %u",
					 XLogRecGetDataLen(record));

	xlog_block_info(buf, record);
}
#endif							/* WAL_DEBUG */
2335 :
2336 : /*
2337 : * Returns a string giving information about all the blocks in an
2338 : * XLogRecord.
2339 : */
2340 : static void
2341 204 : xlog_block_info(StringInfo buf, XLogReaderState *record)
2342 : {
2343 : int block_id;
2344 :
2345 : /* decode block references */
2346 320 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2347 : {
2348 : RelFileLocator rlocator;
2349 : ForkNumber forknum;
2350 : BlockNumber blk;
2351 :
2352 116 : if (!XLogRecGetBlockTagExtended(record, block_id,
2353 : &rlocator, &forknum, &blk, NULL))
2354 0 : continue;
2355 :
2356 116 : if (forknum != MAIN_FORKNUM)
2357 6 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2358 : block_id,
2359 : rlocator.spcOid, rlocator.dbOid,
2360 : rlocator.relNumber,
2361 : forknum,
2362 : blk);
2363 : else
2364 110 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2365 : block_id,
2366 : rlocator.spcOid, rlocator.dbOid,
2367 : rlocator.relNumber,
2368 : blk);
2369 116 : if (XLogRecHasBlockImage(record, block_id))
2370 72 : appendStringInfoString(buf, " FPW");
2371 : }
2372 204 : }
2373 :
2374 :
2375 : /*
2376 : * Check that it's OK to switch to new timeline during recovery.
2377 : *
2378 : * 'lsn' is the address of the shutdown checkpoint record we're about to
2379 : * replay. (Currently, timeline can only change at a shutdown checkpoint).
2380 : */
2381 : static void
2382 22 : checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
2383 : TimeLineID replayTLI)
2384 : {
2385 : /* Check that the record agrees on what the current (old) timeline is */
2386 22 : if (prevTLI != replayTLI)
2387 0 : ereport(PANIC,
2388 : (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2389 : prevTLI, replayTLI)));
2390 :
2391 : /*
2392 : * The new timeline better be in the list of timelines we expect to see,
2393 : * according to the timeline history. It should also not decrease.
2394 : */
2395 22 : if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2396 0 : ereport(PANIC,
2397 : (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2398 : newTLI, replayTLI)));
2399 :
2400 : /*
2401 : * If we have not yet reached min recovery point, and we're about to
2402 : * switch to a timeline greater than the timeline of the min recovery
2403 : * point: trouble. After switching to the new timeline, we could not
2404 : * possibly visit the min recovery point on the correct timeline anymore.
2405 : * This can happen if there is a newer timeline in the archive that
2406 : * branched before the timeline the min recovery point is on, and you
2407 : * attempt to do PITR to the new timeline.
2408 : */
2409 22 : if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
2410 18 : lsn < minRecoveryPoint &&
2411 2 : newTLI > minRecoveryPointTLI)
2412 0 : ereport(PANIC,
2413 : (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2414 : newTLI,
2415 : LSN_FORMAT_ARGS(minRecoveryPoint),
2416 : minRecoveryPointTLI)));
2417 :
2418 : /* Looks good */
2419 22 : }
2420 :
2421 :
2422 : /*
2423 : * Extract timestamp from WAL record.
2424 : *
2425 : * If the record contains a timestamp, returns true, and saves the timestamp
2426 : * in *recordXtime. If the record type has no timestamp, returns false.
2427 : * Currently, only transaction commit/abort records and restore points contain
2428 : * timestamps.
2429 : */
2430 : static bool
2431 83548 : getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
2432 : {
2433 83548 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2434 83548 : uint8 xact_info = info & XLOG_XACT_OPMASK;
2435 83548 : uint8 rmid = XLogRecGetRmid(record);
2436 :
2437 83548 : if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2438 : {
2439 4 : *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2440 4 : return true;
2441 : }
2442 83544 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2443 : xact_info == XLOG_XACT_COMMIT_PREPARED))
2444 : {
2445 76608 : *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2446 76608 : return true;
2447 : }
2448 6936 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2449 : xact_info == XLOG_XACT_ABORT_PREPARED))
2450 : {
2451 6936 : *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2452 6936 : return true;
2453 : }
2454 0 : return false;
2455 : }
2456 :
2457 : /*
2458 : * Checks whether the current buffer page and backup page stored in the
2459 : * WAL record are consistent or not. Before comparing the two pages, a
2460 : * masking can be applied to the pages to ignore certain areas like hint bits,
2461 : * unused space between pd_lower and pd_upper among other things. This
2462 : * function should be called once WAL replay has been completed for a
2463 : * given record.
2464 : */
2465 : static void
2466 4275932 : verifyBackupPageConsistency(XLogReaderState *record)
2467 : {
2468 4275932 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2469 : RelFileLocator rlocator;
2470 : ForkNumber forknum;
2471 : BlockNumber blkno;
2472 : int block_id;
2473 :
2474 : /* Records with no backup blocks have no need for consistency checks. */
2475 4275932 : if (!XLogRecHasAnyBlockRefs(record))
2476 22 : return;
2477 :
2478 : Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2479 :
2480 8881854 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2481 : {
2482 : Buffer buf;
2483 : Page page;
2484 :
2485 4605944 : if (!XLogRecGetBlockTagExtended(record, block_id,
2486 : &rlocator, &forknum, &blkno, NULL))
2487 : {
2488 : /*
2489 : * WAL record doesn't contain a block reference with the given id.
2490 : * Do nothing.
2491 : */
2492 3910 : continue;
2493 : }
2494 :
2495 : Assert(XLogRecHasBlockImage(record, block_id));
2496 :
2497 4602034 : if (XLogRecBlockImageApply(record, block_id))
2498 : {
2499 : /*
2500 : * WAL record has already applied the page, so bypass the
2501 : * consistency check as that would result in comparing the full
2502 : * page stored in the record with itself.
2503 : */
2504 42888 : continue;
2505 : }
2506 :
2507 : /*
2508 : * Read the contents from the current buffer and store it in a
2509 : * temporary page.
2510 : */
2511 4559146 : buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2512 : RBM_NORMAL_NO_LOG,
2513 : InvalidBuffer);
2514 4559146 : if (!BufferIsValid(buf))
2515 0 : continue;
2516 :
2517 4559146 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2518 4559146 : page = BufferGetPage(buf);
2519 :
2520 : /*
2521 : * Take a copy of the local page where WAL has been applied to have a
2522 : * comparison base before masking it...
2523 : */
2524 4559146 : memcpy(replay_image_masked, page, BLCKSZ);
2525 :
2526 : /* No need for this page anymore now that a copy is in. */
2527 4559146 : UnlockReleaseBuffer(buf);
2528 :
2529 : /*
2530 : * If the block LSN is already ahead of this WAL record, we can't
2531 : * expect contents to match. This can happen if recovery is
2532 : * restarted.
2533 : */
2534 4559146 : if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2535 0 : continue;
2536 :
2537 : /*
2538 : * Read the contents from the backup copy, stored in WAL record and
2539 : * store it in a temporary page. There is no need to allocate a new
2540 : * page here, a local buffer is fine to hold its contents and a mask
2541 : * can be directly applied on it.
2542 : */
2543 4559146 : if (!RestoreBlockImage(record, block_id, primary_image_masked))
2544 0 : ereport(ERROR,
2545 : (errcode(ERRCODE_INTERNAL_ERROR),
2546 : errmsg_internal("%s", record->errormsg_buf)));
2547 :
2548 : /*
2549 : * If masking function is defined, mask both the primary and replay
2550 : * images
2551 : */
2552 4559146 : if (rmgr.rm_mask != NULL)
2553 : {
2554 4559146 : rmgr.rm_mask(replay_image_masked, blkno);
2555 4559146 : rmgr.rm_mask(primary_image_masked, blkno);
2556 : }
2557 :
2558 : /* Time to compare the primary and replay images. */
2559 4559146 : if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2560 : {
2561 0 : elog(FATAL,
2562 : "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2563 : rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2564 : forknum, blkno);
2565 : }
2566 : }
2567 : }
2568 :
2569 : /*
2570 : * For point-in-time recovery, this function decides whether we want to
2571 : * stop applying the XLOG before the current record.
2572 : *
2573 : * Returns true if we are stopping, false otherwise. If stopping, some
2574 : * information is saved in recoveryStopXid et al for use in annotating the
2575 : * new timeline's history file.
2576 : */
2577 : static bool
2578 5420846 : recoveryStopsBefore(XLogReaderState *record)
2579 : {
2580 5420846 : bool stopsHere = false;
2581 : uint8 xact_info;
2582 : bool isCommit;
2583 5420846 : TimestampTz recordXtime = 0;
2584 : TransactionId recordXid;
2585 :
2586 : /*
2587 : * Ignore recovery target settings when not in archive recovery (meaning
2588 : * we are in crash recovery).
2589 : */
2590 5420846 : if (!ArchiveRecoveryRequested)
2591 487794 : return false;
2592 :
2593 : /* Check if we should stop as soon as reaching consistency */
2594 4933052 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2595 : {
2596 0 : ereport(LOG,
2597 : (errmsg("recovery stopping after reaching consistency")));
2598 :
2599 0 : recoveryStopAfter = false;
2600 0 : recoveryStopXid = InvalidTransactionId;
2601 0 : recoveryStopLSN = InvalidXLogRecPtr;
2602 0 : recoveryStopTime = 0;
2603 0 : recoveryStopName[0] = '\0';
2604 0 : return true;
2605 : }
2606 :
2607 : /* Check if target LSN has been reached */
2608 4933052 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2609 14318 : !recoveryTargetInclusive &&
2610 722 : record->ReadRecPtr >= recoveryTargetLSN)
2611 : {
2612 2 : recoveryStopAfter = false;
2613 2 : recoveryStopXid = InvalidTransactionId;
2614 2 : recoveryStopLSN = record->ReadRecPtr;
2615 2 : recoveryStopTime = 0;
2616 2 : recoveryStopName[0] = '\0';
2617 2 : ereport(LOG,
2618 : (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2619 : LSN_FORMAT_ARGS(recoveryStopLSN))));
2620 2 : return true;
2621 : }
2622 :
2623 : /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2624 4933050 : if (XLogRecGetRmid(record) != RM_XACT_ID)
2625 4890760 : return false;
2626 :
2627 42290 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2628 :
2629 42290 : if (xact_info == XLOG_XACT_COMMIT)
2630 : {
2631 38258 : isCommit = true;
2632 38258 : recordXid = XLogRecGetXid(record);
2633 : }
2634 4032 : else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2635 : {
2636 48 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2637 : xl_xact_parsed_commit parsed;
2638 :
2639 48 : isCommit = true;
2640 48 : ParseCommitRecord(XLogRecGetInfo(record),
2641 : xlrec,
2642 : &parsed);
2643 48 : recordXid = parsed.twophase_xid;
2644 : }
2645 3984 : else if (xact_info == XLOG_XACT_ABORT)
2646 : {
2647 3446 : isCommit = false;
2648 3446 : recordXid = XLogRecGetXid(record);
2649 : }
2650 538 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2651 : {
2652 22 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2653 : xl_xact_parsed_abort parsed;
2654 :
2655 22 : isCommit = false;
2656 22 : ParseAbortRecord(XLogRecGetInfo(record),
2657 : xlrec,
2658 : &parsed);
2659 22 : recordXid = parsed.twophase_xid;
2660 : }
2661 : else
2662 516 : return false;
2663 :
2664 41774 : if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
2665 : {
2666 : /*
2667 : * There can be only one transaction end record with this exact
2668 : * transactionid
2669 : *
2670 : * when testing for an xid, we MUST test for equality only, since
2671 : * transactions are numbered in the order they start, not the order
2672 : * they complete. A higher numbered xid will complete before you about
2673 : * 50% of the time...
2674 : */
2675 0 : stopsHere = (recordXid == recoveryTargetXid);
2676 : }
2677 :
2678 : /*
2679 : * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2680 : * We don't expect getRecordTimestamp ever to fail, since we already know
2681 : * this is a commit or abort record; but test its result anyway.
2682 : */
2683 41774 : if (getRecordTimestamp(record, &recordXtime) &&
2684 41774 : recoveryTarget == RECOVERY_TARGET_TIME)
2685 : {
2686 : /*
2687 : * There can be many transactions that share the same commit time, so
2688 : * we stop after the last one, if we are inclusive, or stop at the
2689 : * first one if we are exclusive
2690 : */
2691 0 : if (recoveryTargetInclusive)
2692 0 : stopsHere = (recordXtime > recoveryTargetTime);
2693 : else
2694 0 : stopsHere = (recordXtime >= recoveryTargetTime);
2695 : }
2696 :
2697 41774 : if (stopsHere)
2698 : {
2699 0 : recoveryStopAfter = false;
2700 0 : recoveryStopXid = recordXid;
2701 0 : recoveryStopTime = recordXtime;
2702 0 : recoveryStopLSN = InvalidXLogRecPtr;
2703 0 : recoveryStopName[0] = '\0';
2704 :
2705 0 : if (isCommit)
2706 : {
2707 0 : ereport(LOG,
2708 : (errmsg("recovery stopping before commit of transaction %u, time %s",
2709 : recoveryStopXid,
2710 : timestamptz_to_str(recoveryStopTime))));
2711 : }
2712 : else
2713 : {
2714 0 : ereport(LOG,
2715 : (errmsg("recovery stopping before abort of transaction %u, time %s",
2716 : recoveryStopXid,
2717 : timestamptz_to_str(recoveryStopTime))));
2718 : }
2719 : }
2720 :
2721 41774 : return stopsHere;
2722 : }
2723 :
2724 : /*
2725 : * Same as recoveryStopsBefore, but called after applying the record.
2726 : *
2727 : * We also track the timestamp of the latest applied COMMIT/ABORT
2728 : * record in XLogRecoveryCtl->recoveryLastXTime.
2729 : */
2730 : static bool
2731 5420840 : recoveryStopsAfter(XLogReaderState *record)
2732 : {
2733 : uint8 info;
2734 : uint8 xact_info;
2735 : uint8 rmid;
2736 5420840 : TimestampTz recordXtime = 0;
2737 :
2738 : /*
2739 : * Ignore recovery target settings when not in archive recovery (meaning
2740 : * we are in crash recovery).
2741 : */
2742 5420840 : if (!ArchiveRecoveryRequested)
2743 487794 : return false;
2744 :
2745 4933046 : info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2746 4933046 : rmid = XLogRecGetRmid(record);
2747 :
2748 : /*
2749 : * There can be many restore points that share the same name; we stop at
2750 : * the first one.
2751 : */
2752 4933046 : if (recoveryTarget == RECOVERY_TARGET_NAME &&
2753 40 : rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2754 : {
2755 : xl_restore_point *recordRestorePointData;
2756 :
2757 6 : recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2758 :
2759 6 : if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2760 : {
2761 4 : recoveryStopAfter = true;
2762 4 : recoveryStopXid = InvalidTransactionId;
2763 4 : recoveryStopLSN = InvalidXLogRecPtr;
2764 4 : (void) getRecordTimestamp(record, &recoveryStopTime);
2765 4 : strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2766 :
2767 4 : ereport(LOG,
2768 : (errmsg("recovery stopping at restore point \"%s\", time %s",
2769 : recoveryStopName,
2770 : timestamptz_to_str(recoveryStopTime))));
2771 4 : return true;
2772 : }
2773 : }
2774 :
2775 : /* Check if the target LSN has been reached */
2776 4933042 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2777 13596 : recoveryTargetInclusive &&
2778 13596 : record->ReadRecPtr >= recoveryTargetLSN)
2779 : {
2780 8 : recoveryStopAfter = true;
2781 8 : recoveryStopXid = InvalidTransactionId;
2782 8 : recoveryStopLSN = record->ReadRecPtr;
2783 8 : recoveryStopTime = 0;
2784 8 : recoveryStopName[0] = '\0';
2785 8 : ereport(LOG,
2786 : (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2787 : LSN_FORMAT_ARGS(recoveryStopLSN))));
2788 8 : return true;
2789 : }
2790 :
2791 4933034 : if (rmid != RM_XACT_ID)
2792 4890748 : return false;
2793 :
2794 42286 : xact_info = info & XLOG_XACT_OPMASK;
2795 :
2796 42286 : if (xact_info == XLOG_XACT_COMMIT ||
2797 3984 : xact_info == XLOG_XACT_COMMIT_PREPARED ||
2798 538 : xact_info == XLOG_XACT_ABORT ||
2799 : xact_info == XLOG_XACT_ABORT_PREPARED)
2800 : {
2801 : TransactionId recordXid;
2802 :
2803 : /* Update the last applied transaction timestamp */
2804 41770 : if (getRecordTimestamp(record, &recordXtime))
2805 41770 : SetLatestXTime(recordXtime);
2806 :
2807 : /* Extract the XID of the committed/aborted transaction */
2808 41770 : if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2809 : {
2810 48 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2811 : xl_xact_parsed_commit parsed;
2812 :
2813 48 : ParseCommitRecord(XLogRecGetInfo(record),
2814 : xlrec,
2815 : &parsed);
2816 48 : recordXid = parsed.twophase_xid;
2817 : }
2818 41722 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2819 : {
2820 22 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2821 : xl_xact_parsed_abort parsed;
2822 :
2823 22 : ParseAbortRecord(XLogRecGetInfo(record),
2824 : xlrec,
2825 : &parsed);
2826 22 : recordXid = parsed.twophase_xid;
2827 : }
2828 : else
2829 41700 : recordXid = XLogRecGetXid(record);
2830 :
2831 : /*
2832 : * There can be only one transaction end record with this exact
2833 : * transactionid
2834 : *
2835 : * when testing for an xid, we MUST test for equality only, since
2836 : * transactions are numbered in the order they start, not the order
2837 : * they complete. A higher numbered xid will complete before you about
2838 : * 50% of the time...
2839 : */
2840 41770 : if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
2841 0 : recordXid == recoveryTargetXid)
2842 : {
2843 0 : recoveryStopAfter = true;
2844 0 : recoveryStopXid = recordXid;
2845 0 : recoveryStopTime = recordXtime;
2846 0 : recoveryStopLSN = InvalidXLogRecPtr;
2847 0 : recoveryStopName[0] = '\0';
2848 :
2849 0 : if (xact_info == XLOG_XACT_COMMIT ||
2850 : xact_info == XLOG_XACT_COMMIT_PREPARED)
2851 : {
2852 0 : ereport(LOG,
2853 : (errmsg("recovery stopping after commit of transaction %u, time %s",
2854 : recoveryStopXid,
2855 : timestamptz_to_str(recoveryStopTime))));
2856 : }
2857 0 : else if (xact_info == XLOG_XACT_ABORT ||
2858 : xact_info == XLOG_XACT_ABORT_PREPARED)
2859 : {
2860 0 : ereport(LOG,
2861 : (errmsg("recovery stopping after abort of transaction %u, time %s",
2862 : recoveryStopXid,
2863 : timestamptz_to_str(recoveryStopTime))));
2864 : }
2865 0 : return true;
2866 : }
2867 : }
2868 :
2869 : /* Check if we should stop as soon as reaching consistency */
2870 42286 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2871 : {
2872 0 : ereport(LOG,
2873 : (errmsg("recovery stopping after reaching consistency")));
2874 :
2875 0 : recoveryStopAfter = true;
2876 0 : recoveryStopXid = InvalidTransactionId;
2877 0 : recoveryStopTime = 0;
2878 0 : recoveryStopLSN = InvalidXLogRecPtr;
2879 0 : recoveryStopName[0] = '\0';
2880 0 : return true;
2881 : }
2882 :
2883 42286 : return false;
2884 : }
2885 :
2886 : /*
2887 : * Create a comment for the history file to explain why and where
2888 : * timeline changed.
2889 : */
2890 : static char *
2891 1648 : getRecoveryStopReason(void)
2892 : {
2893 : char reason[200];
2894 :
2895 1648 : if (recoveryTarget == RECOVERY_TARGET_XID)
2896 0 : snprintf(reason, sizeof(reason),
2897 : "%s transaction %u",
2898 0 : recoveryStopAfter ? "after" : "before",
2899 : recoveryStopXid);
2900 1648 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
2901 0 : snprintf(reason, sizeof(reason),
2902 : "%s %s\n",
2903 0 : recoveryStopAfter ? "after" : "before",
2904 : timestamptz_to_str(recoveryStopTime));
2905 1648 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
2906 14 : snprintf(reason, sizeof(reason),
2907 : "%s LSN %X/%X\n",
2908 14 : recoveryStopAfter ? "after" : "before",
2909 14 : LSN_FORMAT_ARGS(recoveryStopLSN));
2910 1634 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
2911 6 : snprintf(reason, sizeof(reason),
2912 : "at restore point \"%s\"",
2913 : recoveryStopName);
2914 1628 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
2915 0 : snprintf(reason, sizeof(reason), "reached consistency");
2916 : else
2917 1628 : snprintf(reason, sizeof(reason), "no recovery target specified");
2918 :
2919 1648 : return pstrdup(reason);
2920 : }
2921 :
2922 : /*
2923 : * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2924 : *
2925 : * endOfRecovery is true if the recovery target is reached and
2926 : * the paused state starts at the end of recovery because of
2927 : * recovery_target_action=pause, and false otherwise.
2928 : */
2929 : static void
2930 6 : recoveryPausesHere(bool endOfRecovery)
2931 : {
2932 : /* Don't pause unless users can connect! */
2933 6 : if (!LocalHotStandbyActive)
2934 0 : return;
2935 :
2936 : /* Don't pause after standby promotion has been triggered */
2937 6 : if (LocalPromoteIsTriggered)
2938 0 : return;
2939 :
2940 6 : if (endOfRecovery)
2941 2 : ereport(LOG,
2942 : (errmsg("pausing at the end of recovery"),
2943 : errhint("Execute pg_wal_replay_resume() to promote.")));
2944 : else
2945 4 : ereport(LOG,
2946 : (errmsg("recovery has paused"),
2947 : errhint("Execute pg_wal_replay_resume() to continue.")));
2948 :
2949 : /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2950 18 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
2951 : {
2952 16 : ProcessStartupProcInterrupts();
2953 16 : if (CheckForStandbyTrigger())
2954 4 : return;
2955 :
2956 : /*
2957 : * If recovery pause is requested then set it paused. While we are in
2958 : * the loop, user might resume and pause again so set this every time.
2959 : */
2960 12 : ConfirmRecoveryPaused();
2961 :
2962 : /*
2963 : * We wait on a condition variable that will wake us as soon as the
2964 : * pause ends, but we use a timeout so we can check the above exit
2965 : * condition periodically too.
2966 : */
2967 12 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
2968 : WAIT_EVENT_RECOVERY_PAUSE);
2969 : }
2970 2 : ConditionVariableCancelSleep();
2971 : }
2972 :
2973 : /*
2974 : * When recovery_min_apply_delay is set, we wait long enough to make sure
2975 : * certain record types are applied at least that interval behind the primary.
2976 : *
2977 : * Returns true if we waited.
2978 : *
2979 : * Note that the delay is calculated between the WAL record log time and
2980 : * the current time on standby. We would prefer to keep track of when this
2981 : * standby received each WAL record, which would allow a more consistent
2982 : * approach and one not affected by time synchronisation issues, but that
2983 : * is significantly more effort and complexity for little actual gain in
2984 : * usability.
2985 : */
2986 : static bool
2987 5420844 : recoveryApplyDelay(XLogReaderState *record)
2988 : {
2989 : uint8 xact_info;
2990 : TimestampTz xtime;
2991 : TimestampTz delayUntil;
2992 : long msecs;
2993 :
2994 : /* nothing to do if no delay configured */
2995 5420844 : if (recovery_min_apply_delay <= 0)
2996 5420844 : return false;
2997 :
2998 : /* no delay is applied on a database not yet consistent */
2999 0 : if (!reachedConsistency)
3000 0 : return false;
3001 :
3002 : /* nothing to do if crash recovery is requested */
3003 0 : if (!ArchiveRecoveryRequested)
3004 0 : return false;
3005 :
3006 : /*
3007 : * Is it a COMMIT record?
3008 : *
3009 : * We deliberately choose not to delay aborts since they have no effect on
3010 : * MVCC. We already allow replay of records that don't have a timestamp,
3011 : * so there is already opportunity for issues caused by early conflicts on
3012 : * standbys.
3013 : */
3014 0 : if (XLogRecGetRmid(record) != RM_XACT_ID)
3015 0 : return false;
3016 :
3017 0 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3018 :
3019 0 : if (xact_info != XLOG_XACT_COMMIT &&
3020 : xact_info != XLOG_XACT_COMMIT_PREPARED)
3021 0 : return false;
3022 :
3023 0 : if (!getRecordTimestamp(record, &xtime))
3024 0 : return false;
3025 :
3026 0 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3027 :
3028 : /*
3029 : * Exit without arming the latch if it's already past time to apply this
3030 : * record
3031 : */
3032 0 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
3033 0 : if (msecs <= 0)
3034 0 : return false;
3035 :
3036 : while (true)
3037 : {
3038 0 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3039 :
3040 : /* This might change recovery_min_apply_delay. */
3041 0 : ProcessStartupProcInterrupts();
3042 :
3043 0 : if (CheckForStandbyTrigger())
3044 0 : break;
3045 :
3046 : /*
3047 : * Recalculate delayUntil as recovery_min_apply_delay could have
3048 : * changed while waiting in this loop.
3049 : */
3050 0 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3051 :
3052 : /*
3053 : * Wait for difference between GetCurrentTimestamp() and delayUntil.
3054 : */
3055 0 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
3056 : delayUntil);
3057 :
3058 0 : if (msecs <= 0)
3059 0 : break;
3060 :
3061 0 : elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3062 :
3063 0 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3064 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
3065 : msecs,
3066 : WAIT_EVENT_RECOVERY_APPLY_DELAY);
3067 : }
3068 0 : return true;
3069 : }
3070 :
3071 : /*
3072 : * Get the current state of the recovery pause request.
3073 : */
3074 : RecoveryPauseState
3075 28 : GetRecoveryPauseState(void)
3076 : {
3077 : RecoveryPauseState state;
3078 :
3079 28 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3080 28 : state = XLogRecoveryCtl->recoveryPauseState;
3081 28 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3082 :
3083 28 : return state;
3084 : }
3085 :
3086 : /*
3087 : * Set the recovery pause state.
3088 : *
3089 : * If recovery pause is requested then sets the recovery pause state to
3090 : * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3091 : * to 'not paused' to resume the recovery. The recovery pause will be
3092 : * confirmed by the ConfirmRecoveryPaused.
3093 : */
3094 : void
3095 92 : SetRecoveryPause(bool recoveryPause)
3096 : {
3097 92 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3098 :
3099 92 : if (!recoveryPause)
3100 86 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
3101 6 : else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
3102 6 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
3103 :
3104 92 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3105 :
3106 92 : if (!recoveryPause)
3107 86 : ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
3108 92 : }
3109 :
3110 : /*
3111 : * Confirm the recovery pause by setting the recovery pause state to
3112 : * RECOVERY_PAUSED.
3113 : */
3114 : static void
3115 12 : ConfirmRecoveryPaused(void)
3116 : {
3117 : /* If recovery pause is requested then set it paused */
3118 12 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3119 12 : if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
3120 6 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
3121 12 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3122 12 : }
3123 :
3124 :
3125 : /*
3126 : * Attempt to read the next XLOG record.
3127 : *
3128 : * Before first call, the reader needs to be positioned to the first record
3129 : * by calling XLogPrefetcherBeginRead().
3130 : *
3131 : * If no valid record is available, returns NULL, or fails if emode is PANIC.
3132 : * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3133 : * record is available.
3134 : */
3135 : static XLogRecord *
3136 5424794 : ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
3137 : bool fetching_ckpt, TimeLineID replayTLI)
3138 : {
3139 : XLogRecord *record;
3140 5424794 : XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher);
3141 5424794 : XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3142 :
3143 : /* Pass through parameters to XLogPageRead */
3144 5424794 : private->fetching_ckpt = fetching_ckpt;
3145 5424794 : private->emode = emode;
3146 5424794 : private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3147 5424794 : private->replayTLI = replayTLI;
3148 :
3149 : /* This is the first attempt to read this page. */
3150 5424794 : lastSourceFailed = false;
3151 :
3152 : for (;;)
3153 206 : {
3154 : char *errormsg;
3155 :
3156 5425000 : record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3157 5424894 : if (record == NULL)
3158 : {
3159 : /*
3160 : * When we find that WAL ends in an incomplete record, keep track
3161 : * of that record. After recovery is done, we'll write a record
3162 : * to indicate to downstream WAL readers that that portion is to
3163 : * be ignored.
3164 : *
3165 : * However, when ArchiveRecoveryRequested = true, we're going to
3166 : * switch to a new timeline at the end of recovery. We will only
3167 : * copy WAL over to the new timeline up to the end of the last
3168 : * complete record, so if we did this, we would later create an
3169 : * overwrite contrecord in the wrong place, breaking everything.
3170 : */
3171 500 : if (!ArchiveRecoveryRequested &&
3172 208 : !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
3173 : {
3174 22 : abortedRecPtr = xlogreader->abortedRecPtr;
3175 22 : missingContrecPtr = xlogreader->missingContrecPtr;
3176 : }
3177 :
3178 500 : if (readFile >= 0)
3179 : {
3180 454 : close(readFile);
3181 454 : readFile = -1;
3182 : }
3183 :
3184 : /*
3185 : * We only end up here without a message when XLogPageRead()
3186 : * failed - in that case we already logged something. In
3187 : * StandbyMode that only happens if we have been triggered, so we
3188 : * shouldn't loop anymore in that case.
3189 : */
3190 500 : if (errormsg)
3191 454 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3192 : (errmsg_internal("%s", errormsg) /* already translated */ ));
3193 : }
3194 :
3195 : /*
3196 : * Check page TLI is one of the expected values.
3197 : */
3198 5424394 : else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3199 : {
3200 : char fname[MAXFNAMELEN];
3201 : XLogSegNo segno;
3202 : int32 offset;
3203 :
3204 0 : XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
3205 0 : offset = XLogSegmentOffset(xlogreader->latestPagePtr,
3206 : wal_segment_size);
3207 0 : XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3208 : wal_segment_size);
3209 0 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3210 : (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
3211 : xlogreader->latestPageTLI,
3212 : fname,
3213 : LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
3214 : offset)));
3215 0 : record = NULL;
3216 : }
3217 :
3218 5424894 : if (record)
3219 : {
3220 : /* Great, got a record */
3221 5424688 : return record;
3222 : }
3223 : else
3224 : {
3225 : /* No valid record available from this source */
3226 500 : lastSourceFailed = true;
3227 :
3228 : /*
3229 : * If archive recovery was requested, but we were still doing
3230 : * crash recovery, switch to archive recovery and retry using the
3231 : * offline archive. We have now replayed all the valid WAL in
3232 : * pg_wal, so we are presumably now consistent.
3233 : *
3234 : * We require that there's at least some valid WAL present in
3235 : * pg_wal, however (!fetching_ckpt). We could recover using the
3236 : * WAL from the archive, even if pg_wal is completely empty, but
3237 : * we'd have no idea how far we'd have to replay to reach
3238 : * consistency. So err on the safe side and give up.
3239 : */
3240 500 : if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3241 4 : !fetching_ckpt)
3242 : {
3243 4 : ereport(DEBUG1,
3244 : (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3245 4 : InArchiveRecovery = true;
3246 4 : if (StandbyModeRequested)
3247 4 : EnableStandbyMode();
3248 :
3249 4 : SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
3250 4 : minRecoveryPoint = xlogreader->EndRecPtr;
3251 4 : minRecoveryPointTLI = replayTLI;
3252 :
3253 4 : CheckRecoveryConsistency();
3254 :
3255 : /*
3256 : * Before we retry, reset lastSourceFailed and currentSource
3257 : * so that we will check the archive next.
3258 : */
3259 4 : lastSourceFailed = false;
3260 4 : currentSource = XLOG_FROM_ANY;
3261 :
3262 206 : continue;
3263 : }
3264 :
3265 : /* In standby mode, loop back to retry. Otherwise, give up. */
3266 496 : if (StandbyMode && !CheckForStandbyTrigger())
3267 202 : continue;
3268 : else
3269 294 : return NULL;
3270 : }
3271 : }
3272 : }
3273 :
3274 : /*
3275 : * Read the XLOG page containing targetPagePtr into readBuf (if not read
3276 : * already). Returns number of bytes read, if the page is read successfully,
3277 : * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3278 : * but only if they have not been previously reported.
3279 : *
3280 : * See XLogReaderRoutine.page_read for more details.
3281 : *
3282 : * While prefetching, xlogreader->nonblocking may be set. In that case,
3283 : * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3284 : *
3285 : * This is responsible for restoring files from archive as needed, as well
3286 : * as for waiting for the requested WAL record to arrive in standby mode.
3287 : *
3288 : * xlogreader->private_data->emode specifies the log level used for reporting
3289 : * "file not found" or "end of WAL" situations in archive recovery, or in
3290 : * standby mode when promotion is triggered. If set to WARNING or below,
3291 : * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3292 : * levels the ereport() won't return.
3293 : *
3294 : * In standby mode, if after a successful return of XLogPageRead() the
3295 : * caller finds the record it's interested in to be broken, it should
3296 : * ereport the error with the level determined by
3297 : * emode_for_corrupt_record(), and then set lastSourceFailed
3298 : * and call XLogPageRead() again with the same arguments. This lets
3299 : * XLogPageRead() try fetching the record from another source, or
3300 : * sleep and retry.
3301 : */
3302 : static int
3303 2786908 : XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
3304 : XLogRecPtr targetRecPtr, char *readBuf)
3305 : {
3306 2786908 : XLogPageReadPrivate *private =
3307 : (XLogPageReadPrivate *) xlogreader->private_data;
3308 2786908 : int emode = private->emode;
3309 : uint32 targetPageOff;
3310 : XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3311 : int r;
3312 : instr_time io_start;
3313 :
3314 2786908 : XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3315 2786908 : targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3316 :
3317 : /*
3318 : * See if we need to switch to a new segment because the requested record
3319 : * is not in the currently open one.
3320 : */
3321 2786908 : if (readFile >= 0 &&
3322 2783466 : !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3323 : {
3324 : /*
3325 : * Request a restartpoint if we've replayed too much xlog since the
3326 : * last one.
3327 : */
3328 2532 : if (ArchiveRecoveryRequested && IsUnderPostmaster)
3329 : {
3330 2502 : if (XLogCheckpointNeeded(readSegNo))
3331 : {
3332 2298 : (void) GetRedoRecPtr();
3333 2298 : if (XLogCheckpointNeeded(readSegNo))
3334 2284 : RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
3335 : }
3336 : }
3337 :
3338 2532 : close(readFile);
3339 2532 : readFile = -1;
3340 2532 : readSource = XLOG_FROM_ANY;
3341 : }
3342 :
3343 2786908 : XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3344 :
3345 2786912 : retry:
3346 : /* See if we need to retrieve more data */
3347 2786912 : if (readFile < 0 ||
3348 2780934 : (readSource == XLOG_FROM_STREAM &&
3349 2757250 : flushedUpto < targetPagePtr + reqLen))
3350 : {
3351 22244 : if (readFile >= 0 &&
3352 16266 : xlogreader->nonblocking &&
3353 7984 : readSource == XLOG_FROM_STREAM &&
3354 7984 : flushedUpto < targetPagePtr + reqLen)
3355 7984 : return XLREAD_WOULDBLOCK;
3356 :
3357 14154 : switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3358 14260 : private->randAccess,
3359 14260 : private->fetching_ckpt,
3360 : targetRecPtr,
3361 : private->replayTLI,
3362 : xlogreader->EndRecPtr,
3363 14260 : xlogreader->nonblocking))
3364 : {
3365 1140 : case XLREAD_WOULDBLOCK:
3366 1140 : return XLREAD_WOULDBLOCK;
3367 88 : case XLREAD_FAIL:
3368 88 : if (readFile >= 0)
3369 0 : close(readFile);
3370 88 : readFile = -1;
3371 88 : readLen = 0;
3372 88 : readSource = XLOG_FROM_ANY;
3373 88 : return XLREAD_FAIL;
3374 12926 : case XLREAD_SUCCESS:
3375 12926 : break;
3376 : }
3377 2764668 : }
3378 :
3379 : /*
3380 : * At this point, we have the right segment open and if we're streaming we
3381 : * know the requested record is in it.
3382 : */
3383 : Assert(readFile != -1);
3384 :
3385 : /*
3386 : * If the current segment is being streamed from the primary, calculate
3387 : * how much of the current page we have received already. We know the
3388 : * requested record has been received, but this is for the benefit of
3389 : * future calls, to allow quick exit at the top of this function.
3390 : */
3391 2777594 : if (readSource == XLOG_FROM_STREAM)
3392 : {
3393 2751604 : if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3394 2745072 : readLen = XLOG_BLCKSZ;
3395 : else
3396 6532 : readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
3397 : targetPageOff;
3398 : }
3399 : else
3400 25990 : readLen = XLOG_BLCKSZ;
3401 :
3402 : /* Read the requested page */
3403 2777594 : readOff = targetPageOff;
3404 :
3405 : /* Measure I/O timing when reading segment */
3406 2777594 : io_start = pgstat_prepare_io_time(track_wal_io_timing);
3407 :
3408 2777594 : pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3409 2777594 : r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3410 2777594 : if (r != XLOG_BLCKSZ)
3411 : {
3412 : char fname[MAXFNAMELEN];
3413 0 : int save_errno = errno;
3414 :
3415 0 : pgstat_report_wait_end();
3416 :
3417 0 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3418 : io_start, 1, r);
3419 :
3420 0 : XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
3421 0 : if (r < 0)
3422 : {
3423 0 : errno = save_errno;
3424 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3425 : (errcode_for_file_access(),
3426 : errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
3427 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3428 : readOff)));
3429 : }
3430 : else
3431 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3432 : (errcode(ERRCODE_DATA_CORRUPTED),
3433 : errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
3434 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3435 : readOff, r, (Size) XLOG_BLCKSZ)));
3436 0 : goto next_record_is_invalid;
3437 : }
3438 2777594 : pgstat_report_wait_end();
3439 :
3440 2777594 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3441 : io_start, 1, r);
3442 :
3443 : Assert(targetSegNo == readSegNo);
3444 : Assert(targetPageOff == readOff);
3445 : Assert(reqLen <= readLen);
3446 :
3447 2777594 : xlogreader->seg.ws_tli = curFileTLI;
3448 :
3449 : /*
3450 : * Check the page header immediately, so that we can retry immediately if
3451 : * it's not valid. This may seem unnecessary, because ReadPageInternal()
3452 : * validates the page header anyway, and would propagate the failure up to
3453 : * ReadRecord(), which would retry. However, there's a corner case with
3454 : * continuation records, if a record is split across two pages such that
3455 : * we would need to read the two pages from different sources across two
3456 : * WAL segments.
3457 : *
3458 : * The first page is only available locally, in pg_wal, because it's
3459 : * already been recycled on the primary. The second page, however, is not
3460 : * present in pg_wal, and we should stream it from the primary. There is a
3461 : * recycled WAL segment present in pg_wal, with garbage contents, however.
3462 : * We would read the first page from the local WAL segment, but when
3463 : * reading the second page, we would read the bogus, recycled, WAL
3464 : * segment. If we didn't catch that case here, we would never recover,
3465 : * because ReadRecord() would retry reading the whole record from the
3466 : * beginning.
3467 : *
3468 : * Of course, this only catches errors in the page header, which is what
3469 : * happens in the case of a recycled WAL segment. Other kinds of errors or
3470 : * corruption still has the same problem. But this at least fixes the
3471 : * common case, which can happen as part of normal operation.
3472 : *
3473 : * Validating the page header is cheap enough that doing it twice
3474 : * shouldn't be a big deal from a performance point of view.
3475 : *
3476 : * When not in standby mode, an invalid page header should cause recovery
3477 : * to end, not retry reading the page, so we don't need to validate the
3478 : * page header here for the retry. Instead, ReadPageInternal() is
3479 : * responsible for the validation.
3480 : */
3481 2777594 : if (StandbyMode &&
3482 2758148 : (targetPagePtr % wal_segment_size) == 0 &&
3483 1986 : !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3484 : {
3485 : /*
3486 : * Emit this error right now then retry this page immediately. Use
3487 : * errmsg_internal() because the message was already translated.
3488 : */
3489 6 : if (xlogreader->errormsg_buf[0])
3490 6 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3491 : (errmsg_internal("%s", xlogreader->errormsg_buf)));
3492 :
3493 : /* reset any error XLogReaderValidatePageHeader() might have set */
3494 6 : XLogReaderResetError(xlogreader);
3495 6 : goto next_record_is_invalid;
3496 : }
3497 :
3498 2777588 : return readLen;
3499 :
3500 6 : next_record_is_invalid:
3501 :
3502 : /*
3503 : * If we're reading ahead, give up fast. Retries and error reporting will
3504 : * be handled by a later read when recovery catches up to this point.
3505 : */
3506 6 : if (xlogreader->nonblocking)
3507 2 : return XLREAD_WOULDBLOCK;
3508 :
3509 4 : lastSourceFailed = true;
3510 :
3511 4 : if (readFile >= 0)
3512 4 : close(readFile);
3513 4 : readFile = -1;
3514 4 : readLen = 0;
3515 4 : readSource = XLOG_FROM_ANY;
3516 :
3517 : /* In standby-mode, keep trying */
3518 4 : if (StandbyMode)
3519 4 : goto retry;
3520 : else
3521 0 : return XLREAD_FAIL;
3522 : }
3523 :
3524 : /*
3525 : * Open the WAL segment containing WAL location 'RecPtr'.
3526 : *
3527 : * The segment can be fetched via restore_command, or via walreceiver having
3528 : * streamed the record, or it can already be present in pg_wal. Checking
3529 : * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3530 : * too, in case someone copies a new segment directly to pg_wal. That is not
3531 : * documented or recommended, though.
3532 : *
3533 : * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3534 : * prepare to read WAL starting from RedoStartLSN after this.
3535 : *
3536 : * 'RecPtr' might not point to the beginning of the record we're interested
3537 : * in, it might also point to the page or segment header. In that case,
3538 : * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3539 : * used to decide which timeline to stream the requested WAL from.
3540 : *
3541 : * 'replayLSN' is the current replay LSN, so that if we scan for new
3542 : * timelines, we can reject a switch to a timeline that branched off before
3543 : * this point.
 *
 * 'replayTLI' is passed, together with 'replayLSN', to rescanLatestTimeLine()
 * whenever this function rescans for new timelines.
 *
 * If 'randAccess' is true, curFileTLI is reset to 0 before each attempt to
 * locate the segment in the archive or pg_wal.
3544 : *
3545 : * If the record is not immediately available, the function returns
3546 : * XLREAD_FAIL if we're not in standby mode. In standby mode, waits for it
3547 : * to become available.
3548 : *
3549 : * When the requested record becomes available, the function opens the file
3550 : * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3551 : * of standby mode is triggered by the user, and there is no more WAL
3552 : * available, returns XLREAD_FAIL.
3553 : *
3554 : * If nonblocking is true, then give up immediately if we can't satisfy the
3555 : * request, returning XLREAD_WOULDBLOCK instead of waiting.
3556 : */
3557 : static XLogPageReadResult
3558 14260 : WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3559 : bool fetching_ckpt, XLogRecPtr tliRecPtr,
3560 : TimeLineID replayTLI, XLogRecPtr replayLSN,
3561 : bool nonblocking)
3562 : {
/*
 * last_fail_time is static, so it survives across calls: it is set at the
 * end of the XLOG_FROM_STREAM failure path below and compared against the
 * current time there, to decide whether (and for how long) to sleep before
 * retrying.
 */
3563 : static TimestampTz last_fail_time = 0;
3564 : TimestampTz now;
/* Limits WalRcvForceReply() to at most one call per invocation. */
3565 14260 : bool streaming_reply_sent = false;
3566 :
3567 : /*-------
3568 : * Standby mode is implemented by a state machine:
3569 : *
3570 : * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3571 : * pg_wal (XLOG_FROM_PG_WAL)
3572 : * 2. Check for promotion trigger request
3573 : * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3574 : * 4. Rescan timelines
3575 : * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3576 : *
3577 : * Failure to read from the current source advances the state machine to
3578 : * the next state.
3579 : *
3580 : * 'currentSource' indicates the current state. There are no currentSource
3581 : * values for "check trigger", "rescan timelines", and "sleep" states,
3582 : * those actions are taken when reading from the previous source fails, as
3583 : * part of advancing to the next state.
3584 : *
3585 : * If standby mode is turned off while reading WAL from stream, we move
3586 : * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3587 : * the files (which would be required at end of recovery, e.g., timeline
3588 : * history file) from archive or pg_wal. We don't need to kill WAL receiver
3589 : * here because it's already stopped when standby mode is turned off at
3590 : * the end of recovery.
3591 : *-------
3592 : */
3593 14260 : if (!InArchiveRecovery)
3594 1776 : currentSource = XLOG_FROM_PG_WAL;
3595 12484 : else if (currentSource == XLOG_FROM_ANY ||
3596 12260 : (!StandbyMode && currentSource == XLOG_FROM_STREAM))
3597 : {
3598 224 : lastSourceFailed = false;
3599 224 : currentSource = XLOG_FROM_ARCHIVE;
3600 : }
3601 :
3602 : for (;;)
3603 11860 : {
/* oldSource is remembered only for the DEBUG2 "switched WAL source" message. */
3604 26120 : XLogSource oldSource = currentSource;
/* Set when this iteration decides a walreceiver should be (re)launched. */
3605 26120 : bool startWalReceiver = false;
3606 :
3607 : /*
3608 : * First check if we failed to read from the current source, and
3609 : * advance the state machine if so. The failure to read might've
3610 : * happened outside this function, e.g when a CRC check fails on a
3611 : * record, or within this loop.
3612 : */
3613 26120 : if (lastSourceFailed)
3614 : {
3615 : /*
3616 : * Don't allow any retry loops to occur during nonblocking
3617 : * readahead. Let the caller process everything that has been
3618 : * decoded already first.
3619 : */
3620 856 : if (nonblocking)
3621 142 : return XLREAD_WOULDBLOCK;
3622 :
3623 714 : switch (currentSource)
3624 : {
3625 444 : case XLOG_FROM_ARCHIVE:
3626 : case XLOG_FROM_PG_WAL:
3627 :
3628 : /*
3629 : * Check to see if promotion is requested. Note that we do
3630 : * this only after failure, so when you promote, we still
3631 : * finish replaying as much as we can from archive and
3632 : * pg_wal before failover.
3633 : */
3634 444 : if (StandbyMode && CheckForStandbyTrigger())
3635 : {
3636 40 : XLogShutdownWalRcv();
3637 40 : return XLREAD_FAIL;
3638 : }
3639 :
3640 : /*
3641 : * Not in standby mode, and we've now tried the archive
3642 : * and pg_wal.
3643 : */
3644 404 : if (!StandbyMode)
3645 48 : return XLREAD_FAIL;
3646 :
3647 : /*
3648 : * Move to XLOG_FROM_STREAM state, and set to start a
3649 : * walreceiver if necessary.
3650 : */
3651 356 : currentSource = XLOG_FROM_STREAM;
3652 356 : startWalReceiver = true;
3653 356 : break;
3654 :
3655 270 : case XLOG_FROM_STREAM:
3656 :
3657 : /*
3658 : * Failure while streaming. Most likely, we got here
3659 : * because streaming replication was terminated, or
3660 : * promotion was triggered. But we also get here if we
3661 : * find an invalid record in the WAL streamed from the
3662 : * primary, in which case something is seriously wrong.
3663 : * There's little chance that the problem will just go
3664 : * away, but PANIC is not good for availability either,
3665 : * especially in hot standby mode. So, we treat that the
3666 : * same as disconnection, and retry from archive/pg_wal
3667 : * again. The WAL in the archive should be identical to
3668 : * what was streamed, so it's unlikely that it helps, but
3669 : * one can hope...
3670 : */
3671 :
3672 : /*
3673 : * We should be able to move to XLOG_FROM_STREAM only in
3674 : * standby mode.
3675 : */
3676 : Assert(StandbyMode);
3677 :
3678 : /*
3679 : * Before we leave XLOG_FROM_STREAM state, make sure that
3680 : * walreceiver is not active, so that it won't overwrite
3681 : * WAL that we restore from archive.
3682 : */
3683 270 : XLogShutdownWalRcv();
3684 :
3685 : /*
3686 : * Before we sleep, re-scan for possible new timelines if
3687 : * we were requested to recover to the latest timeline.
3688 : */
3689 270 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
3690 : {
3691 270 : if (rescanLatestTimeLine(replayTLI, replayLSN))
3692 : {
3693 12 : currentSource = XLOG_FROM_ARCHIVE;
3694 12 : break;
3695 : }
3696 : }
3697 :
3698 : /*
3699 : * XLOG_FROM_STREAM is the last state in our state
3700 : * machine, so we've exhausted all the options for
3701 : * obtaining the requested WAL. We're going to loop back
3702 : * and retry from the archive, but if it hasn't been long
3703 : * since last attempt, sleep wal_retrieve_retry_interval
3704 : * milliseconds to avoid busy-waiting.
3705 : */
3706 258 : now = GetCurrentTimestamp();
3707 258 : if (!TimestampDifferenceExceeds(last_fail_time, now,
3708 : wal_retrieve_retry_interval))
3709 : {
3710 : long wait_time;
3711 :
/* Sleep only for whatever is left of the retry interval. */
3712 268 : wait_time = wal_retrieve_retry_interval -
3713 134 : TimestampDifferenceMilliseconds(last_fail_time, now);
3714 :
3715 134 : elog(LOG, "waiting for WAL to become available at %X/%X",
3716 : LSN_FORMAT_ARGS(RecPtr));
3717 :
3718 : /* Do background tasks that might benefit us later. */
3719 134 : KnownAssignedTransactionIdsIdleMaintenance();
3720 :
3721 134 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3722 : WL_LATCH_SET | WL_TIMEOUT |
3723 : WL_EXIT_ON_PM_DEATH,
3724 : wait_time,
3725 : WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3726 134 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
/* Re-read the clock so last_fail_time reflects the time after the wait. */
3727 134 : now = GetCurrentTimestamp();
3728 :
3729 : /* Handle interrupt signals of startup process */
3730 134 : ProcessStartupProcInterrupts();
3731 : }
3732 234 : last_fail_time = now;
3733 234 : currentSource = XLOG_FROM_ARCHIVE;
3734 234 : break;
3735 :
3736 0 : default:
3737 0 : elog(ERROR, "unexpected WAL source %d", currentSource);
3738 : }
3739 : }
3740 25264 : else if (currentSource == XLOG_FROM_PG_WAL)
3741 : {
3742 : /*
3743 : * We just successfully read a file in pg_wal. We prefer files in
3744 : * the archive over ones in pg_wal, so try the next file again
3745 : * from the archive first.
3746 : */
3747 1768 : if (InArchiveRecovery)
3748 0 : currentSource = XLOG_FROM_ARCHIVE;
3749 : }
3750 :
3751 25866 : if (currentSource != oldSource)
3752 602 : elog(DEBUG2, "switched WAL source from %s to %s after %s",
3753 : xlogSourceNames[oldSource], xlogSourceNames[currentSource],
3754 : lastSourceFailed ? "failure" : "success");
3755 :
3756 : /*
3757 : * We've now handled possible failure. Try to read from the chosen
3758 : * source.
3759 : */
3760 25866 : lastSourceFailed = false;
3761 :
3762 25866 : switch (currentSource)
3763 : {
3764 2644 : case XLOG_FROM_ARCHIVE:
3765 : case XLOG_FROM_PG_WAL:
3766 :
3767 : /*
3768 : * WAL receiver must not be running when reading WAL from
3769 : * archive or pg_wal.
3770 : */
3771 : Assert(!WalRcvStreaming());
3772 :
3773 : /* Close any old file we might have open. */
3774 2644 : if (readFile >= 0)
3775 : {
3776 134 : close(readFile);
3777 134 : readFile = -1;
3778 : }
3779 : /* Reset curFileTLI if random fetch. */
3780 2644 : if (randAccess)
3781 2060 : curFileTLI = 0;
3782 :
3783 : /*
3784 : * Try to restore the file from archive, or read an existing
3785 : * file from pg_wal.
3786 : */
/* XLOG_FROM_ARCHIVE is widened to XLOG_FROM_ANY for this lookup. */
3787 2644 : readFile = XLogFileReadAnyTLI(readSegNo,
3788 2644 : currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
3789 : currentSource);
3790 2644 : if (readFile >= 0)
3791 2306 : return XLREAD_SUCCESS; /* success! */
3792 :
3793 : /*
3794 : * Nope, not found in archive or pg_wal.
3795 : */
3796 338 : lastSourceFailed = true;
3797 338 : break;
3798 :
3799 23222 : case XLOG_FROM_STREAM:
3800 : {
3801 : bool havedata;
3802 :
3803 : /*
3804 : * We should be able to move to XLOG_FROM_STREAM only in
3805 : * standby mode.
3806 : */
3807 : Assert(StandbyMode);
3808 :
3809 : /*
3810 : * First, shutdown walreceiver if its restart has been
3811 : * requested -- but no point if we're already slated for
3812 : * starting it.
3813 : */
3814 23222 : if (pendingWalRcvRestart && !startWalReceiver)
3815 : {
3816 6 : XLogShutdownWalRcv();
3817 :
3818 : /*
3819 : * Re-scan for possible new timelines if we were
3820 : * requested to recover to the latest timeline.
3821 : */
3822 6 : if (recoveryTargetTimeLineGoal ==
3823 : RECOVERY_TARGET_TIMELINE_LATEST)
3824 6 : rescanLatestTimeLine(replayTLI, replayLSN);
3825 :
3826 6 : startWalReceiver = true;
3827 : }
3828 23222 : pendingWalRcvRestart = false;
3829 :
3830 : /*
3831 : * Launch walreceiver if needed.
3832 : *
3833 : * If fetching_ckpt is true, RecPtr points to the initial
3834 : * checkpoint location. In that case, we use RedoStartLSN
3835 : * as the streaming start position instead of RecPtr, so
3836 : * that when we later jump backwards to start redo at
3837 : * RedoStartLSN, we will have the logs streamed already.
3838 : */
3839 23222 : if (startWalReceiver &&
3840 362 : PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3841 : {
3842 : XLogRecPtr ptr;
3843 : TimeLineID tli;
3844 :
3845 314 : if (fetching_ckpt)
3846 : {
3847 0 : ptr = RedoStartLSN;
3848 0 : tli = RedoStartTLI;
3849 : }
3850 : else
3851 : {
3852 314 : ptr = RecPtr;
3853 :
3854 : /*
3855 : * Use the record begin position to determine the
3856 : * TLI, rather than the position we're reading.
3857 : */
3858 314 : tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3859 :
3860 314 : if (curFileTLI > 0 && tli < curFileTLI)
3861 0 : elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3862 : LSN_FORMAT_ARGS(tliRecPtr),
3863 : tli, curFileTLI);
3864 : }
3865 314 : curFileTLI = tli;
3866 314 : SetInstallXLogFileSegmentActive();
3867 314 : RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
3868 : PrimarySlotName,
3869 : wal_receiver_create_temp_slot);
/* Zeroing flushedUpto makes the "RecPtr < flushedUpto" shortcut below miss, so the flush position is fetched afresh. */
3870 314 : flushedUpto = 0;
3871 : }
3872 :
3873 : /*
3874 : * Check if WAL receiver is active or wait to start up.
3875 : */
3876 23222 : if (!WalRcvStreaming())
3877 : {
3878 218 : lastSourceFailed = true;
3879 218 : break;
3880 : }
3881 :
3882 : /*
3883 : * Walreceiver is active, so see if new data has arrived.
3884 : *
3885 : * We only advance XLogReceiptTime when we obtain fresh
3886 : * WAL from walreceiver and observe that we had already
3887 : * processed everything before the most recent "chunk"
3888 : * that it flushed to disk. In steady state where we are
3889 : * keeping up with the incoming data, XLogReceiptTime will
3890 : * be updated on each cycle. When we are behind,
3891 : * XLogReceiptTime will not advance, so the grace time
3892 : * allotted to conflicting queries will decrease.
3893 : */
3894 23004 : if (RecPtr < flushedUpto)
3895 3624 : havedata = true;
3896 : else
3897 : {
3898 : XLogRecPtr latestChunkStart;
3899 :
3900 19380 : flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3901 19380 : if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3902 : {
3903 9526 : havedata = true;
3904 9526 : if (latestChunkStart <= RecPtr)
3905 : {
3906 7846 : XLogReceiptTime = GetCurrentTimestamp();
3907 7846 : SetCurrentChunkStartTime(XLogReceiptTime);
3908 : }
3909 : }
3910 : else
3911 9854 : havedata = false;
3912 : }
3913 23004 : if (havedata)
3914 : {
3915 : /*
3916 : * Great, streamed far enough. Open the file if it's
3917 : * not open already. Also read the timeline history
3918 : * file if we haven't initialized timeline history
3919 : * yet; it should be streamed over and present in
3920 : * pg_wal by now. Use XLOG_FROM_STREAM so that source
3921 : * info is set correctly and XLogReceiptTime isn't
3922 : * changed.
3923 : *
3924 : * NB: We must set readTimeLineHistory based on
3925 : * recoveryTargetTLI, not receiveTLI. Normally they'll
3926 : * be the same, but if recovery_target_timeline is
3927 : * 'latest' and archiving is configured, then it's
3928 : * possible that we managed to retrieve one or more
3929 : * new timeline history files from the archive,
3930 : * updating recoveryTargetTLI.
3931 : */
3932 13150 : if (readFile < 0)
3933 : {
3934 2530 : if (!expectedTLEs)
3935 0 : expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
3936 2530 : readFile = XLogFileRead(readSegNo, receiveTLI,
3937 : XLOG_FROM_STREAM, false);
3938 : Assert(readFile >= 0);
3939 : }
3940 : else
3941 : {
3942 : /* just make sure source info is correct... */
3943 10620 : readSource = XLOG_FROM_STREAM;
3944 10620 : XLogReceiptSource = XLOG_FROM_STREAM;
3945 10620 : return XLREAD_SUCCESS;
3946 : }
/* File was just opened: break out of the switch and loop around rather than returning here. */
3947 2530 : break;
3948 : }
3949 :
3950 : /* In nonblocking mode, return rather than sleeping. */
3951 9854 : if (nonblocking)
3952 998 : return XLREAD_WOULDBLOCK;
3953 :
3954 : /*
3955 : * Data not here yet. Check for trigger, then wait for
3956 : * walreceiver to wake us up when new WAL arrives.
3957 : */
3958 8856 : if (CheckForStandbyTrigger())
3959 : {
3960 : /*
3961 : * Note that we don't return XLREAD_FAIL immediately
3962 : * here. After being triggered, we still want to
3963 : * replay all the WAL that was already streamed. It's
3964 : * in pg_wal now, so we just treat this as a failure,
3965 : * and the state machine will move on to replay the
3966 : * streamed WAL from pg_wal, and then recheck the
3967 : * trigger and exit replay.
3968 : */
3969 52 : lastSourceFailed = true;
3970 52 : break;
3971 : }
3972 :
3973 : /*
3974 : * Since we have replayed everything we have received so
3975 : * far and are about to start waiting for more WAL, let's
3976 : * tell the upstream server our replay location now so
3977 : * that pg_stat_replication doesn't show stale
3978 : * information.
3979 : */
3980 8804 : if (!streaming_reply_sent)
3981 : {
3982 7080 : WalRcvForceReply();
3983 7080 : streaming_reply_sent = true;
3984 : }
3985 :
3986 : /* Do any background tasks that might benefit us later. */
3987 8804 : KnownAssignedTransactionIdsIdleMaintenance();
3988 :
3989 : /* Update pg_stat_recovery_prefetch before sleeping. */
3990 8804 : XLogPrefetcherComputeStats(xlogprefetcher);
3991 :
3992 : /*
3993 : * Wait for more WAL to arrive, when we will be woken
3994 : * immediately by the WAL receiver.
3995 : */
/* No WL_TIMEOUT flag and a timeout of -1L: this wait has no time limit of its own. */
3996 8804 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3997 : WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
3998 : -1L,
3999 : WAIT_EVENT_RECOVERY_WAL_STREAM);
4000 8804 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4001 8804 : break;
4002 : }
4003 :
4004 0 : default:
4005 0 : elog(ERROR, "unexpected WAL source %d", currentSource);
4006 : }
4007 :
4008 : /*
4009 : * Check for recovery pause here so that we can confirm more quickly
4010 : * that a requested pause has actually taken effect.
4011 : */
4012 11942 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
4013 : RECOVERY_NOT_PAUSED)
4014 4 : recoveryPausesHere(false);
4015 :
4016 : /*
4017 : * This possibly-long loop needs to handle interrupts of startup
4018 : * process.
4019 : */
4020 11942 : ProcessStartupProcInterrupts();
4021 : }
4022 :
4023 : return XLREAD_FAIL; /* not reached */
4024 : }
4025 :
4026 :
4027 : /*
4028 : * Determine what log level should be used to report a corrupt WAL record
4029 : * in the current WAL page, previously read by XLogPageRead().
4030 : *
4031 : * 'emode' is the error mode that would be used to report a file-not-found
4032 : * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4033 : * we're retrying the exact same record that we've tried previously, only
4034 : * complain the first time to keep the noise down. However, we only do when
4035 : * reading from pg_wal, because we don't expect any invalid records in archive
4036 : * or in records streamed from the primary. Files in the archive should be complete,
4037 : * and we should never hit the end of WAL because we stop and wait for more WAL
4038 : * to arrive before replaying it.
4039 : *
4040 : * NOTE: This function remembers the RecPtr value it was last called with,
4041 : * to suppress repeated messages about the same record. Only call this when
4042 : * you are about to ereport(), or you might cause a later message to be
4043 : * erroneously suppressed.
4044 : */
4045 : static int
4046 460 : emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
4047 : {
4048 : static XLogRecPtr lastComplaint = 0;
4049 :
4050 460 : if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4051 : {
4052 454 : if (RecPtr == lastComplaint)
4053 88 : emode = DEBUG1;
4054 : else
4055 366 : lastComplaint = RecPtr;
4056 : }
4057 460 : return emode;
4058 : }
4059 :
4060 :
4061 : /*
4062 : * Subroutine to try to fetch and validate a prior checkpoint record.
4063 : */
4064 : static XLogRecord *
4065 1762 : ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
4066 : TimeLineID replayTLI)
4067 : {
4068 : XLogRecord *record;
4069 : uint8 info;
4070 :
4071 : Assert(xlogreader != NULL);
4072 :
4073 1762 : if (!XRecOffIsValid(RecPtr))
4074 : {
4075 0 : ereport(LOG,
4076 : (errmsg("invalid checkpoint location")));
4077 0 : return NULL;
4078 : }
4079 :
4080 1762 : XLogPrefetcherBeginRead(xlogprefetcher, RecPtr);
4081 1762 : record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4082 :
4083 1762 : if (record == NULL)
4084 : {
4085 0 : ereport(LOG,
4086 : (errmsg("invalid checkpoint record")));
4087 0 : return NULL;
4088 : }
4089 1762 : if (record->xl_rmid != RM_XLOG_ID)
4090 : {
4091 0 : ereport(LOG,
4092 : (errmsg("invalid resource manager ID in checkpoint record")));
4093 0 : return NULL;
4094 : }
4095 1762 : info = record->xl_info & ~XLR_INFO_MASK;
4096 1762 : if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4097 : info != XLOG_CHECKPOINT_ONLINE)
4098 : {
4099 0 : ereport(LOG,
4100 : (errmsg("invalid xl_info in checkpoint record")));
4101 0 : return NULL;
4102 : }
4103 1762 : if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
4104 : {
4105 0 : ereport(LOG,
4106 : (errmsg("invalid length of checkpoint record")));
4107 0 : return NULL;
4108 : }
4109 1762 : return record;
4110 : }
4111 :
4112 : /*
4113 : * Scan for new timelines that might have appeared in the archive since we
4114 : * started recovery.
4115 : *
4116 : * If there are any, the function changes recovery target TLI to the latest
4117 : * one and returns 'true'.
4118 : */
4119 : static bool
4120 276 : rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
4121 : {
4122 : List *newExpectedTLEs;
4123 : bool found;
4124 : ListCell *cell;
4125 : TimeLineID newtarget;
4126 276 : TimeLineID oldtarget = recoveryTargetTLI;
4127 276 : TimeLineHistoryEntry *currentTle = NULL;
4128 :
4129 276 : newtarget = findNewestTimeLine(recoveryTargetTLI);
4130 276 : if (newtarget == recoveryTargetTLI)
4131 : {
4132 : /* No new timelines found */
4133 264 : return false;
4134 : }
4135 :
4136 : /*
4137 : * Determine the list of expected TLIs for the new TLI
4138 : */
4139 :
4140 12 : newExpectedTLEs = readTimeLineHistory(newtarget);
4141 :
4142 : /*
4143 : * If the current timeline is not part of the history of the new timeline,
4144 : * we cannot proceed to it.
4145 : */
4146 12 : found = false;
4147 24 : foreach(cell, newExpectedTLEs)
4148 : {
4149 24 : currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4150 :
4151 24 : if (currentTle->tli == recoveryTargetTLI)
4152 : {
4153 12 : found = true;
4154 12 : break;
4155 : }
4156 : }
4157 12 : if (!found)
4158 : {
4159 0 : ereport(LOG,
4160 : (errmsg("new timeline %u is not a child of database system timeline %u",
4161 : newtarget,
4162 : replayTLI)));
4163 0 : return false;
4164 : }
4165 :
4166 : /*
4167 : * The current timeline was found in the history file, but check that the
4168 : * next timeline was forked off from it *after* the current recovery
4169 : * location.
4170 : */
4171 12 : if (currentTle->end < replayLSN)
4172 : {
4173 0 : ereport(LOG,
4174 : (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4175 : newtarget,
4176 : replayTLI,
4177 : LSN_FORMAT_ARGS(replayLSN))));
4178 0 : return false;
4179 : }
4180 :
4181 : /* The new timeline history seems valid. Switch target */
4182 12 : recoveryTargetTLI = newtarget;
4183 12 : list_free_deep(expectedTLEs);
4184 12 : expectedTLEs = newExpectedTLEs;
4185 :
4186 : /*
4187 : * As in StartupXLOG(), try to ensure we have all the history files
4188 : * between the old target and new target in pg_wal.
4189 : */
4190 12 : restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4191 :
4192 12 : ereport(LOG,
4193 : (errmsg("new target timeline is %u",
4194 : recoveryTargetTLI)));
4195 :
4196 12 : return true;
4197 : }
4198 :
4199 :
4200 : /*
4201 : * Open a logfile segment for reading (during recovery).
4202 : *
4203 : * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4204 : * Otherwise, it's assumed to be already available in pg_wal.
4205 : */
4206 : static int
4207 6010 : XLogFileRead(XLogSegNo segno, TimeLineID tli,
4208 : XLogSource source, bool notfoundOk)
4209 : {
4210 : char xlogfname[MAXFNAMELEN];
4211 : char activitymsg[MAXFNAMELEN + 16];
4212 : char path[MAXPGPATH];
4213 : int fd;
4214 :
4215 6010 : XLogFileName(xlogfname, tli, segno, wal_segment_size);
4216 :
4217 6010 : switch (source)
4218 : {
4219 896 : case XLOG_FROM_ARCHIVE:
4220 : /* Report recovery progress in PS display */
4221 896 : snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4222 : xlogfname);
4223 896 : set_ps_display(activitymsg);
4224 :
4225 896 : if (!RestoreArchivedFile(path, xlogfname,
4226 : "RECOVERYXLOG",
4227 : wal_segment_size,
4228 : InRedo))
4229 816 : return -1;
4230 80 : break;
4231 :
4232 5114 : case XLOG_FROM_PG_WAL:
4233 : case XLOG_FROM_STREAM:
4234 5114 : XLogFilePath(path, tli, segno, wal_segment_size);
4235 5114 : break;
4236 :
4237 0 : default:
4238 0 : elog(ERROR, "invalid XLogFileRead source %d", source);
4239 : }
4240 :
4241 : /*
4242 : * If the segment was fetched from archival storage, replace the existing
4243 : * xlog segment (if any) with the archival version.
4244 : */
4245 5194 : if (source == XLOG_FROM_ARCHIVE)
4246 : {
4247 : Assert(!IsInstallXLogFileSegmentActive());
4248 80 : KeepFileRestoredFromArchive(path, xlogfname);
4249 :
4250 : /*
4251 : * Set path to point at the new file in pg_wal.
4252 : */
4253 80 : snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4254 : }
4255 :
4256 5194 : fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4257 5194 : if (fd >= 0)
4258 : {
4259 : /* Success! */
4260 4836 : curFileTLI = tli;
4261 :
4262 : /* Report recovery progress in PS display */
4263 4836 : snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4264 : xlogfname);
4265 4836 : set_ps_display(activitymsg);
4266 :
4267 : /* Track source of data in assorted state variables */
4268 4836 : readSource = source;
4269 4836 : XLogReceiptSource = source;
4270 : /* In FROM_STREAM case, caller tracks receipt time, not me */
4271 4836 : if (source != XLOG_FROM_STREAM)
4272 2306 : XLogReceiptTime = GetCurrentTimestamp();
4273 :
4274 4836 : return fd;
4275 : }
4276 358 : if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4277 0 : ereport(PANIC,
4278 : (errcode_for_file_access(),
4279 : errmsg("could not open file \"%s\": %m", path)));
4280 358 : return -1;
4281 : }
4282 :
4283 : /*
4284 : * Open a logfile segment for reading (during recovery).
4285 : *
4286 : * This version searches for the segment with any TLI listed in expectedTLEs.
4287 : */
4288 : static int
4289 2644 : XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
4290 : {
4291 : char path[MAXPGPATH];
4292 : ListCell *cell;
4293 : int fd;
4294 : List *tles;
4295 :
4296 : /*
4297 : * Loop looking for a suitable timeline ID: we might need to read any of
4298 : * the timelines listed in expectedTLEs.
4299 : *
4300 : * We expect curFileTLI on entry to be the TLI of the preceding file in
4301 : * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4302 : * to go backwards; this prevents us from picking up the wrong file when a
4303 : * parent timeline extends to higher segment numbers than the child we
4304 : * want to read.
4305 : *
4306 : * If we haven't read the timeline history file yet, read it now, so that
4307 : * we know which TLIs to scan. We don't save the list in expectedTLEs,
4308 : * however, unless we actually find a valid segment. That way if there is
4309 : * neither a timeline history file nor a WAL segment in the archive, and
4310 : * streaming replication is set up, we'll read the timeline history file
4311 : * streamed from the primary when we start streaming, instead of
4312 : * recovering with a dummy history generated here.
4313 : */
4314 2644 : if (expectedTLEs)
4315 882 : tles = expectedTLEs;
4316 : else
4317 1762 : tles = readTimeLineHistory(recoveryTargetTLI);
4318 :
4319 3016 : foreach(cell, tles)
4320 : {
4321 2686 : TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
4322 2686 : TimeLineID tli = hent->tli;
4323 :
4324 2686 : if (tli < curFileTLI)
4325 8 : break; /* don't bother looking at too-old TLIs */
4326 :
4327 : /*
4328 : * Skip scanning the timeline ID that the logfile segment to read
4329 : * doesn't belong to
4330 : */
4331 2678 : if (hent->begin != InvalidXLogRecPtr)
4332 : {
4333 142 : XLogSegNo beginseg = 0;
4334 :
4335 142 : XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4336 :
4337 : /*
4338 : * The logfile segment that doesn't belong to the timeline is
4339 : * older or newer than the segment that the timeline started or
4340 : * ended at, respectively. It's sufficient to check only the
4341 : * starting segment of the timeline here. Since the timelines are
4342 : * scanned in descending order in this loop, any segments newer
4343 : * than the ending segment should belong to newer timeline and
4344 : * have already been read before. So it's not necessary to check
4345 : * the ending segment of the timeline here.
4346 : */
4347 142 : if (segno < beginseg)
4348 14 : continue;
4349 : }
4350 :
4351 2664 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
4352 : {
4353 896 : fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4354 896 : if (fd != -1)
4355 : {
4356 80 : elog(DEBUG1, "got WAL segment from archive");
4357 80 : if (!expectedTLEs)
4358 34 : expectedTLEs = tles;
4359 2306 : return fd;
4360 : }
4361 : }
4362 :
4363 2584 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
4364 : {
4365 2584 : fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4366 2584 : if (fd != -1)
4367 : {
4368 2226 : if (!expectedTLEs)
4369 1728 : expectedTLEs = tles;
4370 2226 : return fd;
4371 : }
4372 : }
4373 : }
4374 :
4375 : /* Couldn't find it. For simplicity, complain about front timeline */
4376 338 : XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
4377 338 : errno = ENOENT;
4378 338 : ereport(DEBUG2,
4379 : (errcode_for_file_access(),
4380 : errmsg("could not open file \"%s\": %m", path)));
4381 338 : return -1;
4382 : }
4383 :
4384 : /*
4385 : * Set flag to signal the walreceiver to restart. (The startup process calls
4386 : * this on noticing a relevant configuration change.)
4387 : */
4388 : void
4389 6 : StartupRequestWalReceiverRestart(void)
4390 : {
4391 6 : if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
4392 : {
4393 6 : ereport(LOG,
4394 : (errmsg("WAL receiver process shutdown requested")));
4395 :
4396 6 : pendingWalRcvRestart = true;
4397 : }
4398 6 : }
4399 :
4400 :
4401 : /*
4402 : * Has a standby promotion already been triggered?
4403 : *
4404 : * Unlike CheckForStandbyTrigger(), this works in any process
4405 : * that's connected to shared memory.
4406 : */
4407 : bool
4408 104 : PromoteIsTriggered(void)
4409 : {
4410 : /*
4411 : * We check shared state each time only until a standby promotion is
4412 : * triggered. We can't trigger a promotion again, so there's no need to
4413 : * keep checking after the shared variable has once been seen true.
4414 : */
4415 104 : if (LocalPromoteIsTriggered)
4416 84 : return true;
4417 :
4418 20 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4419 20 : LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
4420 20 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4421 :
4422 20 : return LocalPromoteIsTriggered;
4423 : }
4424 :
4425 : static void
4426 84 : SetPromoteIsTriggered(void)
4427 : {
4428 84 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4429 84 : XLogRecoveryCtl->SharedPromoteIsTriggered = true;
4430 84 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4431 :
4432 : /*
4433 : * Mark the recovery pause state as 'not paused' because the paused state
4434 : * ends and promotion continues if a promotion is triggered while recovery
4435 : * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4436 : * return 'paused' while a promotion is ongoing.
4437 : */
4438 84 : SetRecoveryPause(false);
4439 :
4440 84 : LocalPromoteIsTriggered = true;
4441 84 : }
4442 :
4443 : /*
4444 : * Check whether a promote request has arrived.
4445 : */
4446 : static bool
4447 9552 : CheckForStandbyTrigger(void)
4448 : {
4449 9552 : if (LocalPromoteIsTriggered)
4450 94 : return true;
4451 :
4452 9458 : if (IsPromoteSignaled() && CheckPromoteSignal())
4453 : {
4454 84 : ereport(LOG, (errmsg("received promote request")));
4455 84 : RemovePromoteSignalFiles();
4456 84 : ResetPromoteSignaled();
4457 84 : SetPromoteIsTriggered();
4458 84 : return true;
4459 : }
4460 :
4461 9374 : return false;
4462 : }
4463 :
4464 : /*
4465 : * Remove the files signaling a standby promotion request.
4466 : */
4467 : void
4468 1728 : RemovePromoteSignalFiles(void)
4469 : {
4470 1728 : unlink(PROMOTE_SIGNAL_FILE);
4471 1728 : }
4472 :
4473 : /*
4474 : * Check to see if a promote request has arrived.
4475 : */
4476 : bool
4477 1470 : CheckPromoteSignal(void)
4478 : {
4479 : struct stat stat_buf;
4480 :
4481 1470 : if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4482 168 : return true;
4483 :
4484 1302 : return false;
4485 : }
4486 :
4487 : /*
4488 : * Wake up startup process to replay newly arrived WAL, or to notice that
4489 : * failover has been requested.
4490 : */
4491 : void
4492 22024 : WakeupRecovery(void)
4493 : {
4494 22024 : SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4495 22024 : }
4496 :
4497 : /*
4498 : * Schedule a walreceiver wakeup in the main recovery loop.
4499 : */
4500 : void
4501 4 : XLogRequestWalReceiverReply(void)
4502 : {
4503 4 : doRequestWalReceiverReply = true;
4504 4 : }
4505 :
4506 : /*
4507 : * Is HotStandby active yet? This is only important in special backends
4508 : * since normal backends won't ever be able to connect until this returns
4509 : * true. Postmaster knows this by way of signal, not via shared memory.
4510 : *
4511 : * Unlike testing standbyState, this works in any process that's connected to
4512 : * shared memory. (And note that standbyState alone doesn't tell the truth
4513 : * anyway.)
4514 : */
4515 : bool
4516 312 : HotStandbyActive(void)
4517 : {
4518 : /*
4519 : * We check shared state each time only until Hot Standby is active. We
4520 : * can't de-activate Hot Standby, so there's no need to keep checking
4521 : * after the shared variable has once been seen true.
4522 : */
4523 312 : if (LocalHotStandbyActive)
4524 44 : return true;
4525 : else
4526 : {
4527 : /* spinlock is essential on machines with weak memory ordering! */
4528 268 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4529 268 : LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
4530 268 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4531 :
4532 268 : return LocalHotStandbyActive;
4533 : }
4534 : }
4535 :
4536 : /*
4537 : * Like HotStandbyActive(), but to be used only in WAL replay code,
4538 : * where we don't need to ask any other process what the state is.
4539 : */
4540 : static bool
4541 0 : HotStandbyActiveInReplay(void)
4542 : {
4543 : Assert(AmStartupProcess() || !IsPostmasterEnvironment);
4544 0 : return LocalHotStandbyActive;
4545 : }
4546 :
4547 : /*
4548 : * Get latest redo apply position.
4549 : *
4550 : * Exported to allow WALReceiver to read the pointer directly.
4551 : */
4552 : XLogRecPtr
4553 64616 : GetXLogReplayRecPtr(TimeLineID *replayTLI)
4554 : {
4555 : XLogRecPtr recptr;
4556 : TimeLineID tli;
4557 :
4558 64616 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4559 64616 : recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
4560 64616 : tli = XLogRecoveryCtl->lastReplayedTLI;
4561 64616 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4562 :
4563 64616 : if (replayTLI)
4564 4318 : *replayTLI = tli;
4565 64616 : return recptr;
4566 : }
4567 :
4568 :
4569 : /*
4570 : * Get position of last applied, or the record being applied.
4571 : *
4572 : * This is different from GetXLogReplayRecPtr() in that if a WAL
4573 : * record is currently being applied, this includes that record.
4574 : */
4575 : XLogRecPtr
4576 10540 : GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
4577 : {
4578 : XLogRecPtr recptr;
4579 : TimeLineID tli;
4580 :
4581 10540 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4582 10540 : recptr = XLogRecoveryCtl->replayEndRecPtr;
4583 10540 : tli = XLogRecoveryCtl->replayEndTLI;
4584 10540 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4585 :
4586 10540 : if (replayEndTLI)
4587 10540 : *replayEndTLI = tli;
4588 10540 : return recptr;
4589 : }
4590 :
4591 : /*
4592 : * Save timestamp of latest processed commit/abort record.
4593 : *
4594 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4595 : * seen by processes other than the startup process. Note in particular
4596 : * that CreateRestartPoint is executed in the checkpointer.
4597 : */
4598 : static void
4599 41770 : SetLatestXTime(TimestampTz xtime)
4600 : {
4601 41770 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4602 41770 : XLogRecoveryCtl->recoveryLastXTime = xtime;
4603 41770 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4604 41770 : }
4605 :
4606 : /*
4607 : * Fetch timestamp of latest processed commit/abort record.
4608 : */
4609 : TimestampTz
4610 666 : GetLatestXTime(void)
4611 : {
4612 : TimestampTz xtime;
4613 :
4614 666 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4615 666 : xtime = XLogRecoveryCtl->recoveryLastXTime;
4616 666 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4617 :
4618 666 : return xtime;
4619 : }
4620 :
4621 : /*
4622 : * Save timestamp of the next chunk of WAL records to apply.
4623 : *
4624 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4625 : * seen by all backends.
4626 : */
4627 : static void
4628 7846 : SetCurrentChunkStartTime(TimestampTz xtime)
4629 : {
4630 7846 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4631 7846 : XLogRecoveryCtl->currentChunkStartTime = xtime;
4632 7846 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4633 7846 : }
4634 :
4635 : /*
4636 : * Fetch timestamp of latest processed commit/abort record.
4637 : * Startup process maintains an accurate local copy in XLogReceiptTime
4638 : */
4639 : TimestampTz
4640 212 : GetCurrentChunkReplayStartTime(void)
4641 : {
4642 : TimestampTz xtime;
4643 :
4644 212 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4645 212 : xtime = XLogRecoveryCtl->currentChunkStartTime;
4646 212 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4647 :
4648 212 : return xtime;
4649 : }
4650 :
4651 : /*
4652 : * Returns time of receipt of current chunk of XLOG data, as well as
4653 : * whether it was received from streaming replication or from archives.
4654 : */
4655 : void
4656 58 : GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4657 : {
4658 : /*
4659 : * This must be executed in the startup process, since we don't export the
4660 : * relevant state to shared memory.
4661 : */
4662 : Assert(InRecovery);
4663 :
4664 58 : *rtime = XLogReceiptTime;
4665 58 : *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4666 58 : }
4667 :
4668 : /*
4669 : * Note that text field supplied is a parameter name and does not require
4670 : * translation
4671 : */
4672 : void
4673 1200 : RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4674 : {
4675 1200 : if (currValue < minValue)
4676 : {
4677 0 : if (HotStandbyActiveInReplay())
4678 : {
4679 0 : bool warned_for_promote = false;
4680 :
4681 0 : ereport(WARNING,
4682 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4683 : errmsg("hot standby is not possible because of insufficient parameter settings"),
4684 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4685 : param_name,
4686 : currValue,
4687 : minValue)));
4688 :
4689 0 : SetRecoveryPause(true);
4690 :
4691 0 : ereport(LOG,
4692 : (errmsg("recovery has paused"),
4693 : errdetail("If recovery is unpaused, the server will shut down."),
4694 : errhint("You can then restart the server after making the necessary configuration changes.")));
4695 :
4696 0 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
4697 : {
4698 0 : ProcessStartupProcInterrupts();
4699 :
4700 0 : if (CheckForStandbyTrigger())
4701 : {
4702 0 : if (!warned_for_promote)
4703 0 : ereport(WARNING,
4704 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4705 : errmsg("promotion is not possible because of insufficient parameter settings"),
4706 :
4707 : /*
4708 : * Repeat the detail from above so it's easy to find
4709 : * in the log.
4710 : */
4711 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4712 : param_name,
4713 : currValue,
4714 : minValue),
4715 : errhint("Restart the server after making the necessary configuration changes.")));
4716 0 : warned_for_promote = true;
4717 : }
4718 :
4719 : /*
4720 : * If recovery pause is requested then set it paused. While
4721 : * we are in the loop, user might resume and pause again so
4722 : * set this every time.
4723 : */
4724 0 : ConfirmRecoveryPaused();
4725 :
4726 : /*
4727 : * We wait on a condition variable that will wake us as soon
4728 : * as the pause ends, but we use a timeout so we can check the
4729 : * above conditions periodically too.
4730 : */
4731 0 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
4732 : WAIT_EVENT_RECOVERY_PAUSE);
4733 : }
4734 0 : ConditionVariableCancelSleep();
4735 : }
4736 :
4737 0 : ereport(FATAL,
4738 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4739 : errmsg("recovery aborted because of insufficient parameter settings"),
4740 : /* Repeat the detail from above so it's easy to find in the log. */
4741 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4742 : param_name,
4743 : currValue,
4744 : minValue),
4745 : errhint("You can restart the server after making the necessary configuration changes.")));
4746 : }
4747 1200 : }
4748 :
4749 :
4750 : /*
4751 : * GUC check_hook for primary_slot_name
4752 : */
4753 : bool
4754 2392 : check_primary_slot_name(char **newval, void **extra, GucSource source)
4755 : {
4756 2392 : if (*newval && strcmp(*newval, "") != 0 &&
4757 290 : !ReplicationSlotValidateName(*newval, WARNING))
4758 0 : return false;
4759 :
4760 2392 : return true;
4761 : }
4762 :
4763 : /*
4764 : * Recovery target settings: Only one of the several recovery_target* settings
4765 : * may be set. Setting a second one results in an error. The global variable
4766 : * recoveryTarget tracks which kind of recovery target was chosen. Other
4767 : * variables store the actual target value (for example a string or a xid).
4768 : * The assign functions of the parameters check whether a competing parameter
4769 : * was already set. But we want to allow setting the same parameter multiple
4770 : * times. We also want to allow unsetting a parameter and setting a different
4771 : * one, so we unset recoveryTarget when the parameter is set to an empty
4772 : * string.
4773 : *
4774 : * XXX this code is broken by design. Throwing an error from a GUC assign
4775 : * hook breaks fundamental assumptions of guc.c. So long as all the variables
4776 : * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4777 : * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4778 : * that we have odd behaviors such as unexpected GUC ordering dependencies.
4779 : */
4780 :
4781 : pg_noreturn static void
4782 2 : error_multiple_recovery_targets(void)
4783 : {
4784 2 : ereport(ERROR,
4785 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4786 : errmsg("multiple recovery targets specified"),
4787 : errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4788 : }
4789 :
4790 : /*
4791 : * GUC check_hook for recovery_target
4792 : */
4793 : bool
4794 2104 : check_recovery_target(char **newval, void **extra, GucSource source)
4795 : {
4796 2104 : if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4797 : {
4798 0 : GUC_check_errdetail("The only allowed value is \"immediate\".");
4799 0 : return false;
4800 : }
4801 2104 : return true;
4802 : }
4803 :
4804 : /*
4805 : * GUC assign_hook for recovery_target
4806 : */
4807 : void
4808 2104 : assign_recovery_target(const char *newval, void *extra)
4809 : {
4810 2104 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4811 0 : recoveryTarget != RECOVERY_TARGET_IMMEDIATE)
4812 0 : error_multiple_recovery_targets();
4813 :
4814 2104 : if (newval && strcmp(newval, "") != 0)
4815 2 : recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
4816 : else
4817 2102 : recoveryTarget = RECOVERY_TARGET_UNSET;
4818 2104 : }
4819 :
4820 : /*
4821 : * GUC check_hook for recovery_target_lsn
4822 : */
4823 : bool
4824 2114 : check_recovery_target_lsn(char **newval, void **extra, GucSource source)
4825 : {
4826 2114 : if (strcmp(*newval, "") != 0)
4827 : {
4828 : XLogRecPtr lsn;
4829 : XLogRecPtr *myextra;
4830 16 : bool have_error = false;
4831 :
4832 16 : lsn = pg_lsn_in_internal(*newval, &have_error);
4833 16 : if (have_error)
4834 0 : return false;
4835 :
4836 16 : myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
4837 16 : if (!myextra)
4838 0 : return false;
4839 16 : *myextra = lsn;
4840 16 : *extra = myextra;
4841 : }
4842 2114 : return true;
4843 : }
4844 :
4845 : /*
4846 : * GUC assign_hook for recovery_target_lsn
4847 : */
4848 : void
4849 2114 : assign_recovery_target_lsn(const char *newval, void *extra)
4850 : {
4851 2114 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4852 0 : recoveryTarget != RECOVERY_TARGET_LSN)
4853 0 : error_multiple_recovery_targets();
4854 :
4855 2114 : if (newval && strcmp(newval, "") != 0)
4856 : {
4857 16 : recoveryTarget = RECOVERY_TARGET_LSN;
4858 16 : recoveryTargetLSN = *((XLogRecPtr *) extra);
4859 : }
4860 : else
4861 2098 : recoveryTarget = RECOVERY_TARGET_UNSET;
4862 2114 : }
4863 :
4864 : /*
4865 : * GUC check_hook for recovery_target_name
4866 : */
4867 : bool
4868 2116 : check_recovery_target_name(char **newval, void **extra, GucSource source)
4869 : {
4870 : /* Use the value of newval directly */
4871 2116 : if (strlen(*newval) >= MAXFNAMELEN)
4872 : {
4873 0 : GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4874 : "recovery_target_name", MAXFNAMELEN - 1);
4875 0 : return false;
4876 : }
4877 2116 : return true;
4878 : }
4879 :
4880 : /*
4881 : * GUC assign_hook for recovery_target_name
4882 : */
4883 : void
4884 2116 : assign_recovery_target_name(const char *newval, void *extra)
4885 : {
4886 2116 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4887 0 : recoveryTarget != RECOVERY_TARGET_NAME)
4888 0 : error_multiple_recovery_targets();
4889 :
4890 2116 : if (newval && strcmp(newval, "") != 0)
4891 : {
4892 12 : recoveryTarget = RECOVERY_TARGET_NAME;
4893 12 : recoveryTargetName = newval;
4894 : }
4895 : else
4896 2104 : recoveryTarget = RECOVERY_TARGET_UNSET;
4897 2116 : }
4898 :
4899 : /*
4900 : * GUC check_hook for recovery_target_time
4901 : *
4902 : * The interpretation of the recovery_target_time string can depend on the
4903 : * time zone setting, so we need to wait until after all GUC processing is
4904 : * done before we can do the final parsing of the string. This check function
4905 : * only does a parsing pass to catch syntax errors, but we store the string
4906 : * and parse it again when we need to use it.
4907 : */
4908 : bool
4909 2108 : check_recovery_target_time(char **newval, void **extra, GucSource source)
4910 : {
4911 2108 : if (strcmp(*newval, "") != 0)
4912 : {
4913 : /* reject some special values */
4914 6 : if (strcmp(*newval, "now") == 0 ||
4915 6 : strcmp(*newval, "today") == 0 ||
4916 6 : strcmp(*newval, "tomorrow") == 0 ||
4917 6 : strcmp(*newval, "yesterday") == 0)
4918 : {
4919 0 : return false;
4920 : }
4921 :
4922 : /*
4923 : * parse timestamp value (see also timestamptz_in())
4924 : */
4925 : {
4926 6 : char *str = *newval;
4927 : fsec_t fsec;
4928 : struct pg_tm tt,
4929 6 : *tm = &tt;
4930 : int tz;
4931 : int dtype;
4932 : int nf;
4933 : int dterr;
4934 : char *field[MAXDATEFIELDS];
4935 : int ftype[MAXDATEFIELDS];
4936 : char workbuf[MAXDATELEN + MAXDATEFIELDS];
4937 : DateTimeErrorExtra dtextra;
4938 : TimestampTz timestamp;
4939 :
4940 6 : dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4941 : field, ftype, MAXDATEFIELDS, &nf);
4942 6 : if (dterr == 0)
4943 6 : dterr = DecodeDateTime(field, ftype, nf,
4944 : &dtype, tm, &fsec, &tz, &dtextra);
4945 6 : if (dterr != 0)
4946 0 : return false;
4947 6 : if (dtype != DTK_DATE)
4948 0 : return false;
4949 :
4950 6 : if (tm2timestamp(tm, fsec, &tz, ×tamp) != 0)
4951 : {
4952 0 : GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
4953 0 : return false;
4954 : }
4955 : }
4956 : }
4957 2108 : return true;
4958 : }
4959 :
4960 : /*
4961 : * GUC assign_hook for recovery_target_time
4962 : */
4963 : void
4964 2108 : assign_recovery_target_time(const char *newval, void *extra)
4965 : {
4966 2108 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4967 2 : recoveryTarget != RECOVERY_TARGET_TIME)
4968 2 : error_multiple_recovery_targets();
4969 :
4970 2106 : if (newval && strcmp(newval, "") != 0)
4971 4 : recoveryTarget = RECOVERY_TARGET_TIME;
4972 : else
4973 2102 : recoveryTarget = RECOVERY_TARGET_UNSET;
4974 2106 : }
4975 :
4976 : /*
4977 : * GUC check_hook for recovery_target_timeline
4978 : */
4979 : bool
4980 2104 : check_recovery_target_timeline(char **newval, void **extra, GucSource source)
4981 : {
4982 : RecoveryTargetTimeLineGoal rttg;
4983 : RecoveryTargetTimeLineGoal *myextra;
4984 :
4985 2104 : if (strcmp(*newval, "current") == 0)
4986 0 : rttg = RECOVERY_TARGET_TIMELINE_CONTROLFILE;
4987 2104 : else if (strcmp(*newval, "latest") == 0)
4988 2104 : rttg = RECOVERY_TARGET_TIMELINE_LATEST;
4989 : else
4990 : {
4991 0 : rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
4992 :
4993 0 : errno = 0;
4994 0 : strtoul(*newval, NULL, 0);
4995 0 : if (errno == EINVAL || errno == ERANGE)
4996 : {
4997 0 : GUC_check_errdetail("\"recovery_target_timeline\" is not a valid number.");
4998 0 : return false;
4999 : }
5000 : }
5001 :
5002 2104 : myextra = (RecoveryTargetTimeLineGoal *) guc_malloc(LOG, sizeof(RecoveryTargetTimeLineGoal));
5003 2104 : if (!myextra)
5004 0 : return false;
5005 2104 : *myextra = rttg;
5006 2104 : *extra = myextra;
5007 :
5008 2104 : return true;
5009 : }
5010 :
5011 : /*
5012 : * GUC assign_hook for recovery_target_timeline
5013 : */
5014 : void
5015 2104 : assign_recovery_target_timeline(const char *newval, void *extra)
5016 : {
5017 2104 : recoveryTargetTimeLineGoal = *((RecoveryTargetTimeLineGoal *) extra);
5018 2104 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
5019 0 : recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5020 : else
5021 2104 : recoveryTargetTLIRequested = 0;
5022 2104 : }
5023 :
5024 : /*
5025 : * GUC check_hook for recovery_target_xid
5026 : */
5027 : bool
5028 2104 : check_recovery_target_xid(char **newval, void **extra, GucSource source)
5029 : {
5030 2104 : if (strcmp(*newval, "") != 0)
5031 : {
5032 : TransactionId xid;
5033 : TransactionId *myextra;
5034 :
5035 2 : errno = 0;
5036 2 : xid = (TransactionId) strtou64(*newval, NULL, 0);
5037 2 : if (errno == EINVAL || errno == ERANGE)
5038 0 : return false;
5039 :
5040 2 : myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId));
5041 2 : if (!myextra)
5042 0 : return false;
5043 2 : *myextra = xid;
5044 2 : *extra = myextra;
5045 : }
5046 2104 : return true;
5047 : }
5048 :
5049 : /*
5050 : * GUC assign_hook for recovery_target_xid
5051 : */
5052 : void
5053 2104 : assign_recovery_target_xid(const char *newval, void *extra)
5054 : {
5055 2104 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
5056 0 : recoveryTarget != RECOVERY_TARGET_XID)
5057 0 : error_multiple_recovery_targets();
5058 :
5059 2104 : if (newval && strcmp(newval, "") != 0)
5060 : {
5061 2 : recoveryTarget = RECOVERY_TARGET_XID;
5062 2 : recoveryTargetXid = *((TransactionId *) extra);
5063 : }
5064 : else
5065 2102 : recoveryTarget = RECOVERY_TARGET_UNSET;
5066 2104 : }
|