Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * xlogrecovery.c
4 : * Functions for WAL recovery, standby mode
5 : *
6 : * This source file contains functions controlling WAL recovery.
7 : * InitWalRecovery() initializes the system for crash or archive recovery,
8 : * or standby mode, depending on configuration options and the state of
9 : * the control file and possible backup label file. PerformWalRecovery()
10 : * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 : * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 : * and prepares information needed to initialize the WAL for writes. In
13 : * addition to these three main functions, there are a bunch of functions
14 : * for interrogating recovery state and controlling the recovery process.
15 : *
16 : *
17 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
18 : * Portions Copyright (c) 1994, Regents of the University of California
19 : *
20 : * src/backend/access/transam/xlogrecovery.c
21 : *
22 : *-------------------------------------------------------------------------
23 : */
24 :
25 : #include "postgres.h"
26 :
27 : #include <ctype.h>
28 : #include <math.h>
29 : #include <time.h>
30 : #include <sys/stat.h>
31 : #include <sys/time.h>
32 : #include <unistd.h>
33 :
34 : #include "access/timeline.h"
35 : #include "access/transam.h"
36 : #include "access/xact.h"
37 : #include "access/xlog_internal.h"
38 : #include "access/xlogarchive.h"
39 : #include "access/xlogprefetcher.h"
40 : #include "access/xlogreader.h"
41 : #include "access/xlogrecovery.h"
42 : #include "access/xlogutils.h"
43 : #include "backup/basebackup.h"
44 : #include "catalog/pg_control.h"
45 : #include "commands/tablespace.h"
46 : #include "common/file_utils.h"
47 : #include "miscadmin.h"
48 : #include "pgstat.h"
49 : #include "postmaster/bgwriter.h"
50 : #include "postmaster/startup.h"
51 : #include "replication/slot.h"
52 : #include "replication/slotsync.h"
53 : #include "replication/walreceiver.h"
54 : #include "storage/fd.h"
55 : #include "storage/ipc.h"
56 : #include "storage/latch.h"
57 : #include "storage/pmsignal.h"
58 : #include "storage/procarray.h"
59 : #include "storage/spin.h"
60 : #include "utils/datetime.h"
61 : #include "utils/fmgrprotos.h"
62 : #include "utils/guc_hooks.h"
63 : #include "utils/pgstat_internal.h"
64 : #include "utils/pg_lsn.h"
65 : #include "utils/ps_status.h"
66 : #include "utils/pg_rusage.h"
67 :
68 : /* Unsupported old recovery command file names (relative to $PGDATA) */
69 : #define RECOVERY_COMMAND_FILE "recovery.conf"
70 : #define RECOVERY_COMMAND_DONE "recovery.done"
71 :
72 : /*
73 : * GUC support
74 : */
75 : const struct config_enum_entry recovery_target_action_options[] = {
76 : {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
77 : {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
78 : {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
79 : {NULL, 0, false}
80 : };
81 :
82 : /* options formerly taken from recovery.conf for archive recovery */
83 : char *recoveryRestoreCommand = NULL;
84 : char *recoveryEndCommand = NULL;
85 : char *archiveCleanupCommand = NULL;
86 : RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
87 : bool recoveryTargetInclusive = true;
88 : int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
89 : TransactionId recoveryTargetXid;
90 : char *recovery_target_time_string;
91 : TimestampTz recoveryTargetTime;
92 : const char *recoveryTargetName;
93 : XLogRecPtr recoveryTargetLSN;
94 : int recovery_min_apply_delay = 0;
95 :
96 : /* options formerly taken from recovery.conf for XLOG streaming */
97 : char *PrimaryConnInfo = NULL;
98 : char *PrimarySlotName = NULL;
99 : bool wal_receiver_create_temp_slot = false;
100 :
101 : /*
102 : * recoveryTargetTimeLineGoal: what the user requested, if any
103 : *
104 : * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
105 : *
106 : * recoveryTargetTLI: the currently understood target timeline; may change during recovery
107 : *
108 : * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
109 : * the timelines of its known parents, newest first (so recoveryTargetTLI is
110 : * always the first list member). Only these TLIs are expected to be seen in
111 : * the WAL segments we read, and indeed only these TLIs will be considered as
112 : * candidate WAL files to open at all.
113 : *
114 : * curFileTLI: the TLI appearing in the name of the current input WAL file.
115 : * (This is not necessarily the same as the timeline from which we are
116 : * replaying WAL, which StartupXLOG calls replayTLI, because we could be
117 : * scanning data that was copied from an ancestor timeline when the current
118 : * file was created.) During a sequential scan we do not allow this value
119 : * to decrease.
120 : */
121 : RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
122 : TimeLineID recoveryTargetTLIRequested = 0;
123 : TimeLineID recoveryTargetTLI = 0;
124 : static List *expectedTLEs;
125 : static TimeLineID curFileTLI;
126 :
127 : /*
128 : * When ArchiveRecoveryRequested is set, archive recovery was requested,
129 : * ie. signal files were present. When InArchiveRecovery is set, we are
130 : * currently recovering using offline XLOG archives. These variables are only
131 : * valid in the startup process.
132 : *
133 : * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
134 : * currently performing crash recovery using only XLOG files in pg_wal, but
135 : * will switch to using offline XLOG archives as soon as we reach the end of
136 : * WAL in pg_wal.
137 : */
138 : bool ArchiveRecoveryRequested = false;
139 : bool InArchiveRecovery = false;
140 :
141 : /*
142 : * When StandbyModeRequested is set, standby mode was requested, i.e.
143 : * standby.signal file was present. When StandbyMode is set, we are currently
144 : * in standby mode. These variables are only valid in the startup process.
145 : * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
146 : */
147 : static bool StandbyModeRequested = false;
148 : bool StandbyMode = false;
149 :
150 : /* was a signal file present at startup? */
151 : static bool standby_signal_file_found = false;
152 : static bool recovery_signal_file_found = false;
153 :
154 : /*
155 : * CheckPointLoc is the position of the checkpoint record that determines
156 : * where to start the replay. It comes from the backup label file or the
157 : * control file.
158 : *
159 : * RedoStartLSN is the checkpoint's REDO location, also from the backup label
160 : * file or the control file. In standby mode, XLOG streaming usually starts
161 : * from the position where an invalid record was found. But if we fail to
162 : * read even the initial checkpoint record, we use the REDO location instead
163 : * of the checkpoint location as the start position of XLOG streaming.
164 : * Otherwise we would have to jump backwards to the REDO location after
165 : * reading the checkpoint record, because the REDO record can precede the
166 : * checkpoint record.
167 : */
168 : static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
169 : static TimeLineID CheckPointTLI = 0;
170 : static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
171 : static TimeLineID RedoStartTLI = 0;
172 :
173 : /*
174 : * Local copy of SharedHotStandbyActive variable. False actually means "not
175 : * known, need to check the shared state".
176 : */
177 : static bool LocalHotStandbyActive = false;
178 :
179 : /*
180 : * Local copy of SharedPromoteIsTriggered variable. False actually means "not
181 : * known, need to check the shared state".
182 : */
183 : static bool LocalPromoteIsTriggered = false;
184 :
185 : /* Has the recovery code requested a walreceiver wakeup? */
186 : static bool doRequestWalReceiverReply;
187 :
188 : /* XLogReader object used to parse the WAL records */
189 : static XLogReaderState *xlogreader = NULL;
190 :
191 : /* XLogPrefetcher object used to consume WAL records with read-ahead */
192 : static XLogPrefetcher *xlogprefetcher = NULL;
193 :
194 : /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
195 : typedef struct XLogPageReadPrivate
196 : {
197 : int emode;
198 : bool fetching_ckpt; /* are we fetching a checkpoint record? */
199 : bool randAccess;
200 : TimeLineID replayTLI;
201 : } XLogPageReadPrivate;
202 :
203 : /* flag to tell XLogPageRead that we have started replaying */
204 : static bool InRedo = false;
205 :
206 : /*
207 : * Codes indicating where we got a WAL file from during recovery, or where
208 : * to attempt to get one.
209 : */
210 : typedef enum
211 : {
212 : XLOG_FROM_ANY = 0, /* request to read WAL from any source */
213 : XLOG_FROM_ARCHIVE, /* restored using restore_command */
214 : XLOG_FROM_PG_WAL, /* existing file in pg_wal */
215 : XLOG_FROM_STREAM, /* streamed from primary */
216 : } XLogSource;
217 :
218 : /* human-readable names for XLogSources, for debugging output */
219 : static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
220 :
221 : /*
222 : * readFile is -1 or a kernel FD for the log file segment that's currently
223 : * open for reading. readSegNo identifies the segment. readOff is the offset
224 : * of the page just read, readLen indicates how much of it has been read into
225 : * readBuf, and readSource indicates where we got the currently open file from.
226 : *
227 : * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
228 : * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
229 : * worthwhile, since the XLOG is not read by general-purpose sessions.
230 : */
231 : static int readFile = -1;
232 : static XLogSegNo readSegNo = 0;
233 : static uint32 readOff = 0;
234 : static uint32 readLen = 0;
235 : static XLogSource readSource = XLOG_FROM_ANY;
236 :
237 : /*
238 : * Keeps track of which source we're currently reading from. This is
239 : * different from readSource in that this is always set, even when we don't
240 : * currently have a WAL file open. If lastSourceFailed is set, our last
241 : * attempt to read from currentSource failed, and we should try another source
242 : * next.
243 : *
244 : * pendingWalRcvRestart is set when a config change occurs that requires a
245 : * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
246 : */
247 : static XLogSource currentSource = XLOG_FROM_ANY;
248 : static bool lastSourceFailed = false;
249 : static bool pendingWalRcvRestart = false;
250 :
251 : /*
252 : * These variables track when we last obtained some WAL data to process,
253 : * and where we got it from. (XLogReceiptSource is initially the same as
254 : * readSource, but readSource gets reset to zero when we don't have data
255 : * to process right now. It is also different from currentSource, which
256 : * also changes when we try to read from a source and fail, while
257 : * XLogReceiptSource tracks where we last successfully read some WAL.)
258 : */
259 : static TimestampTz XLogReceiptTime = 0;
260 : static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
261 :
262 : /* Local copy of WalRcv->flushedUpto */
263 : static XLogRecPtr flushedUpto = 0;
264 : static TimeLineID receiveTLI = 0;
265 :
266 : /*
267 : * Copy of minRecoveryPoint and backupEndPoint from the control file.
268 : *
269 : * In order to reach consistency, we must replay the WAL up to
270 : * minRecoveryPoint. If backupEndRequired is true, we must also reach
271 : * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
272 : * to backupStartPoint.
273 : *
274 : * Note: In archive recovery, after consistency has been reached, the
275 : * functions in xlog.c will start updating minRecoveryPoint in the control
276 : * file. But this copy of minRecoveryPoint variable reflects the value at the
277 : * beginning of recovery, and is *not* updated after consistency is reached.
278 : */
279 : static XLogRecPtr minRecoveryPoint;
280 : static TimeLineID minRecoveryPointTLI;
281 :
282 : static XLogRecPtr backupStartPoint;
283 : static XLogRecPtr backupEndPoint;
284 : static bool backupEndRequired = false;
285 :
286 : /*
287 : * Have we reached a consistent database state? In crash recovery, we have
288 : * to replay all the WAL, so reachedConsistency is never set. During archive
289 : * recovery, the database is consistent once minRecoveryPoint is reached.
290 : *
291 : * Consistent state means that the system is internally consistent, all
292 : * the WAL has been replayed up to a certain point, and importantly, there
293 : * is no trace of later actions on disk.
294 : */
295 : bool reachedConsistency = false;
296 :
297 : /* Buffers dedicated to consistency checks of size BLCKSZ */
298 : static char *replay_image_masked = NULL;
299 : static char *primary_image_masked = NULL;
300 :
301 :
302 : /*
303 : * Shared-memory state for WAL recovery.
304 : */
305 : typedef struct XLogRecoveryCtlData
306 : {
307 : /*
308 : * SharedHotStandbyActive indicates if we allow hot standby queries to be
309 : * run. Protected by info_lck.
310 : */
311 : bool SharedHotStandbyActive;
312 :
313 : /*
314 : * SharedPromoteIsTriggered indicates if a standby promotion has been
315 : * triggered. Protected by info_lck.
316 : */
317 : bool SharedPromoteIsTriggered;
318 :
319 : /*
320 : * recoveryWakeupLatch is used to wake up the startup process to continue
321 : * WAL replay, if it is waiting for WAL to arrive or promotion to be
322 : * requested.
323 : *
324 : * Note that the startup process also uses another latch, its procLatch,
325 : * to wait for recovery conflict. Getting rid of recoveryWakeupLatch and
326 : * signaling the startup process through its procLatch instead would
327 : * comport better with possible generic signal handlers using that latch.
328 : * But we should not do that because the startup process doesn't assume
329 : * that it's woken up by the walreceiver process or SIGHUP signal handler
330 : * while it's waiting for recovery conflict. The separate latches,
331 : * recoveryWakeupLatch and procLatch, should be used for inter-process
332 : * communication for WAL replay and recovery conflict, respectively.
333 : */
334 : Latch recoveryWakeupLatch;
335 :
336 : /*
337 : * Last record successfully replayed.
338 : */
339 : XLogRecPtr lastReplayedReadRecPtr; /* start position */
340 : XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
341 : TimeLineID lastReplayedTLI; /* timeline */
342 :
343 : /*
344 : * When we're currently replaying a record, ie. in a redo function,
345 : * replayEndRecPtr points to the end+1 of the record being replayed,
346 : * otherwise it's equal to lastReplayedEndRecPtr.
347 : */
348 : XLogRecPtr replayEndRecPtr;
349 : TimeLineID replayEndTLI;
350 : /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
351 : TimestampTz recoveryLastXTime;
352 :
353 : /*
354 : * timestamp of when we started replaying the current chunk of WAL data,
355 : * only relevant for replication or archive recovery
356 : */
357 : TimestampTz currentChunkStartTime;
358 : /* Recovery pause state */
359 : RecoveryPauseState recoveryPauseState;
360 : ConditionVariable recoveryNotPausedCV;
361 :
362 : slock_t info_lck; /* locks shared variables shown above */
363 : } XLogRecoveryCtlData;
364 :
365 : static XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
366 :
367 : /*
368 : * abortedRecPtr is the start pointer of a broken record at end of WAL when
369 : * recovery completes; missingContrecPtr is the location of the first
370 : * contrecord that went missing. See CreateOverwriteContrecordRecord for
371 : * details.
372 : */
373 : static XLogRecPtr abortedRecPtr;
374 : static XLogRecPtr missingContrecPtr;
375 :
376 : /*
377 : * if recoveryStopsBefore/After returns true, it saves information of the stop
378 : * point here
379 : */
380 : static TransactionId recoveryStopXid;
381 : static TimestampTz recoveryStopTime;
382 : static XLogRecPtr recoveryStopLSN;
383 : static char recoveryStopName[MAXFNAMELEN];
384 : static bool recoveryStopAfter;
385 :
386 : /* prototypes for local functions */
387 : static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
388 :
389 : static void EnableStandbyMode(void);
390 : static void readRecoverySignalFile(void);
391 : static void validateRecoveryParameters(void);
392 : static bool read_backup_label(XLogRecPtr *checkPointLoc,
393 : TimeLineID *backupLabelTLI,
394 : bool *backupEndRequired, bool *backupFromStandby);
395 : static bool read_tablespace_map(List **tablespaces);
396 :
397 : static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
398 : static void CheckRecoveryConsistency(void);
399 : static void rm_redo_error_callback(void *arg);
400 : #ifdef WAL_DEBUG
401 : static void xlog_outrec(StringInfo buf, XLogReaderState *record);
402 : #endif
403 : static void xlog_block_info(StringInfo buf, XLogReaderState *record);
404 : static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
405 : TimeLineID prevTLI, TimeLineID replayTLI);
406 : static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
407 : static void verifyBackupPageConsistency(XLogReaderState *record);
408 :
409 : static bool recoveryStopsBefore(XLogReaderState *record);
410 : static bool recoveryStopsAfter(XLogReaderState *record);
411 : static char *getRecoveryStopReason(void);
412 : static void recoveryPausesHere(bool endOfRecovery);
413 : static bool recoveryApplyDelay(XLogReaderState *record);
414 : static void ConfirmRecoveryPaused(void);
415 :
416 : static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher,
417 : int emode, bool fetching_ckpt,
418 : TimeLineID replayTLI);
419 :
420 : static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
421 : int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
422 : static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
423 : bool randAccess,
424 : bool fetching_ckpt,
425 : XLogRecPtr tliRecPtr,
426 : TimeLineID replayTLI,
427 : XLogRecPtr replayLSN,
428 : bool nonblocking);
429 : static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
430 : static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
431 : XLogRecPtr RecPtr, TimeLineID replayTLI);
432 : static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
433 : static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
434 : XLogSource source, bool notfoundOk);
435 : static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source);
436 :
437 : static bool CheckForStandbyTrigger(void);
438 : static void SetPromoteIsTriggered(void);
439 : static bool HotStandbyActiveInReplay(void);
440 :
441 : static void SetCurrentChunkStartTime(TimestampTz xtime);
442 : static void SetLatestXTime(TimestampTz xtime);
443 :
444 : /*
445 : * Initialization of shared memory for WAL recovery
446 : */
447 : Size
448 5544 : XLogRecoveryShmemSize(void)
449 : {
450 : Size size;
451 :
452 : /* XLogRecoveryCtl */
453 5544 : size = sizeof(XLogRecoveryCtlData);
454 :
455 5544 : return size;
456 : }
457 :
458 : void
459 1938 : XLogRecoveryShmemInit(void)
460 : {
461 : bool found;
462 :
463 1938 : XLogRecoveryCtl = (XLogRecoveryCtlData *)
464 1938 : ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
465 1938 : if (found)
466 0 : return;
467 1938 : memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
468 :
469 1938 : SpinLockInit(&XLogRecoveryCtl->info_lck);
470 1938 : InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
471 1938 : ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
472 : }
473 :
474 : /*
475 : * A thin wrapper to enable StandbyMode and do other preparatory work as
476 : * needed.
477 : */
478 : static void
479 194 : EnableStandbyMode(void)
480 : {
481 194 : StandbyMode = true;
482 :
483 : /*
484 : * To avoid server log bloat, we don't report recovery progress in a
485 : * standby as it will always be in recovery unless promoted. We disable
486 : * startup progress timeout in standby mode to avoid calling
487 : * startup_progress_timeout_handler() unnecessarily.
488 : */
489 194 : disable_startup_progress_timeout();
490 194 : }
491 :
492 : /*
493 : * Prepare the system for WAL recovery, if needed.
494 : *
495 : * This is called by StartupXLOG() which coordinates the server startup
496 : * sequence. This function analyzes the control file and the backup label
497 : * file, if any, and figures out whether we need to perform crash recovery or
498 : * archive recovery, and how far we need to replay the WAL to reach a
499 : * consistent state.
500 : *
501 : * This doesn't yet change the on-disk state, except for creating the symlinks
502 : * from table space map file if any, and for fetching WAL files needed to find
503 : * the checkpoint record. On entry, the caller has already read the control
504 : * file into memory, and passes it as argument. This function updates it to
505 : * reflect the recovery state, and the caller is expected to write it back to
506 : * disk after initializing other subsystems, but before calling
507 : * PerformWalRecovery().
508 : *
509 : * This initializes some global variables like ArchiveRecoveryRequested,
510 : * StandbyModeRequested and InRecovery.
511 : */
512 : void
513 1668 : InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
514 : bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
515 : {
516 : XLogPageReadPrivate *private;
517 : struct stat st;
518 : bool wasShutdown;
519 : XLogRecord *record;
520 : DBState dbstate_at_startup;
521 1668 : bool haveTblspcMap = false;
522 1668 : bool haveBackupLabel = false;
523 : CheckPoint checkPoint;
524 1668 : bool backupFromStandby = false;
525 :
526 1668 : dbstate_at_startup = ControlFile->state;
527 :
528 : /*
529 : * Initialize on the assumption we want to recover to the latest timeline
530 : * that's active according to pg_control.
531 : */
532 1668 : if (ControlFile->minRecoveryPointTLI >
533 1668 : ControlFile->checkPointCopy.ThisTimeLineID)
534 4 : recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
535 : else
536 1664 : recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
537 :
538 : /*
539 : * Check for signal files, and if so set up state for offline recovery
540 : */
541 1668 : readRecoverySignalFile();
542 1668 : validateRecoveryParameters();
543 :
544 : /*
545 : * Take ownership of the wakeup latch if we're going to sleep during
546 : * recovery, if required.
547 : */
548 1668 : if (ArchiveRecoveryRequested)
549 204 : OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
550 :
551 : /*
552 : * Set the WAL reading processor now, as it will be needed when reading
553 : * the checkpoint record required (backup_label or not).
554 : */
555 1668 : private = palloc0(sizeof(XLogPageReadPrivate));
556 1668 : xlogreader =
557 1668 : XLogReaderAllocate(wal_segment_size, NULL,
558 1668 : XL_ROUTINE(.page_read = &XLogPageRead,
559 : .segment_open = NULL,
560 : .segment_close = wal_segment_close),
561 : private);
562 1668 : if (!xlogreader)
563 0 : ereport(ERROR,
564 : (errcode(ERRCODE_OUT_OF_MEMORY),
565 : errmsg("out of memory"),
566 : errdetail("Failed while allocating a WAL reading processor.")));
567 1668 : xlogreader->system_identifier = ControlFile->system_identifier;
568 :
569 : /*
570 : * Set the WAL decode buffer size. This limits how far ahead we can read
571 : * in the WAL.
572 : */
573 1668 : XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size);
574 :
575 : /* Create a WAL prefetcher. */
576 1668 : xlogprefetcher = XLogPrefetcherAllocate(xlogreader);
577 :
578 : /*
579 : * Allocate two page buffers dedicated to WAL consistency checks. We do
580 : * it this way, rather than just making static arrays, for two reasons:
581 : * (1) no need to waste the storage in most instantiations of the backend;
582 : * (2) a static char array isn't guaranteed to have any particular
583 : * alignment, whereas palloc() will provide MAXALIGN'd storage.
584 : */
585 1668 : replay_image_masked = (char *) palloc(BLCKSZ);
586 1668 : primary_image_masked = (char *) palloc(BLCKSZ);
587 :
588 : /*
589 : * Read the backup_label file. We want to run this part of the recovery
590 : * process after checking for signal files and after performing validation
591 : * of the recovery parameters.
592 : */
593 1668 : if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
594 : &backupFromStandby))
595 : {
596 138 : List *tablespaces = NIL;
597 :
598 : /*
599 : * Archive recovery was requested, and thanks to the backup label
600 : * file, we know how far we need to replay to reach consistency. Enter
601 : * archive recovery directly.
602 : */
603 138 : InArchiveRecovery = true;
604 138 : if (StandbyModeRequested)
605 116 : EnableStandbyMode();
606 :
607 : /*
608 : * Omitting backup_label when creating a new replica, PITR node etc.
609 : * unfortunately is a common cause of corruption. Logging that
610 : * backup_label was used makes it a bit easier to exclude that as the
611 : * cause of observed corruption.
612 : *
613 : * Do so before we try to read the checkpoint record (which can fail),
614 : * as otherwise it can be hard to understand why a checkpoint other
615 : * than ControlFile->checkPoint is used.
616 : */
617 138 : ereport(LOG,
618 : (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
619 : LSN_FORMAT_ARGS(RedoStartLSN),
620 : LSN_FORMAT_ARGS(CheckPointLoc),
621 : CheckPointTLI)));
622 :
623 : /*
624 : * When a backup_label file is present, we want to roll forward from
625 : * the checkpoint it identifies, rather than using pg_control.
626 : */
627 138 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
628 : CheckPointTLI);
629 138 : if (record != NULL)
630 : {
631 138 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
632 138 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
633 138 : ereport(DEBUG1,
634 : (errmsg_internal("checkpoint record is at %X/%X",
635 : LSN_FORMAT_ARGS(CheckPointLoc))));
636 138 : InRecovery = true; /* force recovery even if SHUTDOWNED */
637 :
638 : /*
639 : * Make sure that REDO location exists. This may not be the case
640 : * if there was a crash during an online backup, which left a
641 : * backup_label around that references a WAL segment that's
642 : * already been archived.
643 : */
644 138 : if (checkPoint.redo < CheckPointLoc)
645 : {
646 138 : XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
647 138 : if (!ReadRecord(xlogprefetcher, LOG, false,
648 : checkPoint.ThisTimeLineID))
649 0 : ereport(FATAL,
650 : (errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X",
651 : LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
652 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
653 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
654 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
655 : DataDir, DataDir, DataDir, DataDir)));
656 : }
657 : }
658 : else
659 : {
660 0 : ereport(FATAL,
661 : (errmsg("could not locate required checkpoint record at %X/%X",
662 : LSN_FORMAT_ARGS(CheckPointLoc)),
663 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
664 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
665 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
666 : DataDir, DataDir, DataDir, DataDir)));
667 : wasShutdown = false; /* keep compiler quiet */
668 : }
669 :
670 : /* Read the tablespace_map file if present and create symlinks. */
671 138 : if (read_tablespace_map(&tablespaces))
672 : {
673 : ListCell *lc;
674 :
675 8 : foreach(lc, tablespaces)
676 : {
677 4 : tablespaceinfo *ti = lfirst(lc);
678 : char *linkloc;
679 :
680 4 : linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
681 :
682 : /*
683 : * Remove the existing symlink if any and Create the symlink
684 : * under PGDATA.
685 : */
686 4 : remove_tablespace_symlink(linkloc);
687 :
688 4 : if (symlink(ti->path, linkloc) < 0)
689 0 : ereport(ERROR,
690 : (errcode_for_file_access(),
691 : errmsg("could not create symbolic link \"%s\": %m",
692 : linkloc)));
693 :
694 4 : pfree(ti->path);
695 4 : pfree(ti);
696 : }
697 :
698 : /* tell the caller to delete it later */
699 4 : haveTblspcMap = true;
700 : }
701 :
702 : /* tell the caller to delete it later */
703 138 : haveBackupLabel = true;
704 : }
705 : else
706 : {
707 : /* No backup_label file has been found if we are here. */
708 :
709 : /*
710 : * If tablespace_map file is present without backup_label file, there
711 : * is no use of such file. There is no harm in retaining it, but it
712 : * is better to get rid of the map file so that we don't have any
713 : * redundant file in data directory and it will avoid any sort of
714 : * confusion. It seems prudent though to just rename the file out of
715 : * the way rather than delete it completely, also we ignore any error
716 : * that occurs in rename operation as even if map file is present
717 : * without backup_label file, it is harmless.
718 : */
719 1530 : if (stat(TABLESPACE_MAP, &st) == 0)
720 : {
721 2 : unlink(TABLESPACE_MAP_OLD);
722 2 : if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
723 2 : ereport(LOG,
724 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
725 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
726 : errdetail("File \"%s\" was renamed to \"%s\".",
727 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
728 : else
729 0 : ereport(LOG,
730 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
731 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
732 : errdetail("Could not rename file \"%s\" to \"%s\": %m.",
733 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
734 : }
735 :
736 : /*
737 : * It's possible that archive recovery was requested, but we don't
738 : * know how far we need to replay the WAL before we reach consistency.
739 : * This can happen for example if a base backup is taken from a
740 : * running server using an atomic filesystem snapshot, without calling
741 : * pg_backup_start/stop. Or if you just kill a running primary server
742 : * and put it into archive recovery by creating a recovery signal
743 : * file.
744 : *
745 : * Our strategy in that case is to perform crash recovery first,
746 : * replaying all the WAL present in pg_wal, and only enter archive
747 : * recovery after that.
748 : *
749 : * But usually we already know how far we need to replay the WAL (up
750 : * to minRecoveryPoint, up to backupEndPoint, or until we see an
751 : * end-of-backup record), and we can enter archive recovery directly.
752 : */
753 1530 : if (ArchiveRecoveryRequested &&
754 78 : (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
755 18 : ControlFile->backupEndRequired ||
756 18 : ControlFile->backupEndPoint != InvalidXLogRecPtr ||
757 18 : ControlFile->state == DB_SHUTDOWNED))
758 : {
759 74 : InArchiveRecovery = true;
760 74 : if (StandbyModeRequested)
761 74 : EnableStandbyMode();
762 : }
763 :
764 : /*
765 : * For the same reason as when starting up with backup_label present,
766 : * emit a log message when we continue initializing from a base
767 : * backup.
768 : */
769 1530 : if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
770 0 : ereport(LOG,
771 : (errmsg("restarting backup recovery with redo LSN %X/%X",
772 : LSN_FORMAT_ARGS(ControlFile->backupStartPoint))));
773 :
774 : /* Get the last valid checkpoint record. */
775 1530 : CheckPointLoc = ControlFile->checkPoint;
776 1530 : CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
777 1530 : RedoStartLSN = ControlFile->checkPointCopy.redo;
778 1530 : RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
779 1530 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
780 : CheckPointTLI);
781 1530 : if (record != NULL)
782 : {
783 1530 : ereport(DEBUG1,
784 : (errmsg_internal("checkpoint record is at %X/%X",
785 : LSN_FORMAT_ARGS(CheckPointLoc))));
786 : }
787 : else
788 : {
789 : /*
790 : * We used to attempt to go back to a secondary checkpoint record
791 : * here, but only when not in standby mode. We now just fail if we
792 : * can't read the last checkpoint because this allows us to
793 : * simplify processing around checkpoints.
794 : */
795 0 : ereport(PANIC,
796 : (errmsg("could not locate a valid checkpoint record at %X/%X",
797 : LSN_FORMAT_ARGS(CheckPointLoc))));
798 : }
799 1530 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
800 1530 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
801 : }
802 :
803 1668 : if (ArchiveRecoveryRequested)
804 : {
805 204 : if (StandbyModeRequested)
806 194 : ereport(LOG,
807 : (errmsg("entering standby mode")));
808 10 : else if (recoveryTarget == RECOVERY_TARGET_XID)
809 0 : ereport(LOG,
810 : (errmsg("starting point-in-time recovery to XID %u",
811 : recoveryTargetXid)));
812 10 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
813 0 : ereport(LOG,
814 : (errmsg("starting point-in-time recovery to %s",
815 : timestamptz_to_str(recoveryTargetTime))));
816 10 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
817 6 : ereport(LOG,
818 : (errmsg("starting point-in-time recovery to \"%s\"",
819 : recoveryTargetName)));
820 4 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
821 0 : ereport(LOG,
822 : (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
823 : LSN_FORMAT_ARGS(recoveryTargetLSN))));
824 4 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
825 0 : ereport(LOG,
826 : (errmsg("starting point-in-time recovery to earliest consistent point")));
827 : else
828 4 : ereport(LOG,
829 : (errmsg("starting archive recovery")));
830 : }
831 :
832 : /*
833 : * If the location of the checkpoint record is not on the expected
834 : * timeline in the history of the requested timeline, we cannot proceed:
835 : * the backup is not part of the history of the requested timeline.
836 : */
837 : Assert(expectedTLEs); /* was initialized by reading checkpoint
838 : * record */
839 1668 : if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
840 : CheckPointTLI)
841 : {
842 : XLogRecPtr switchpoint;
843 :
844 : /*
845 : * tliSwitchPoint will throw an error if the checkpoint's timeline is
846 : * not in expectedTLEs at all.
847 : */
848 0 : switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
849 0 : ereport(FATAL,
850 : (errmsg("requested timeline %u is not a child of this server's history",
851 : recoveryTargetTLI),
852 : errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
853 : LSN_FORMAT_ARGS(CheckPointLoc),
854 : CheckPointTLI,
855 : LSN_FORMAT_ARGS(switchpoint))));
856 : }
857 :
858 : /*
859 : * The min recovery point should be part of the requested timeline's
860 : * history, too.
861 : */
862 1668 : if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
863 72 : tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
864 72 : ControlFile->minRecoveryPointTLI)
865 0 : ereport(FATAL,
866 : (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
867 : recoveryTargetTLI,
868 : LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
869 : ControlFile->minRecoveryPointTLI)));
870 :
871 1668 : ereport(DEBUG1,
872 : (errmsg_internal("redo record is at %X/%X; shutdown %s",
873 : LSN_FORMAT_ARGS(checkPoint.redo),
874 : wasShutdown ? "true" : "false")));
875 1668 : ereport(DEBUG1,
876 : (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
877 : U64FromFullTransactionId(checkPoint.nextXid),
878 : checkPoint.nextOid)));
879 1668 : ereport(DEBUG1,
880 : (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
881 : checkPoint.nextMulti, checkPoint.nextMultiOffset)));
882 1668 : ereport(DEBUG1,
883 : (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
884 : checkPoint.oldestXid, checkPoint.oldestXidDB)));
885 1668 : ereport(DEBUG1,
886 : (errmsg_internal("oldest MultiXactId: %u, in database %u",
887 : checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
888 1668 : ereport(DEBUG1,
889 : (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
890 : checkPoint.oldestCommitTsXid,
891 : checkPoint.newestCommitTsXid)));
892 1668 : if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
893 0 : ereport(PANIC,
894 : (errmsg("invalid next transaction ID")));
895 :
896 : /* sanity check */
897 1668 : if (checkPoint.redo > CheckPointLoc)
898 0 : ereport(PANIC,
899 : (errmsg("invalid redo in checkpoint record")));
900 :
901 : /*
902 : * Check whether we need to force recovery from WAL. If it appears to
903 : * have been a clean shutdown and we did not have a recovery signal file,
904 : * then assume no recovery needed.
905 : */
906 1668 : if (checkPoint.redo < CheckPointLoc)
907 : {
908 214 : if (wasShutdown)
909 0 : ereport(PANIC,
910 : (errmsg("invalid redo record in shutdown checkpoint")));
911 214 : InRecovery = true;
912 : }
913 1454 : else if (ControlFile->state != DB_SHUTDOWNED)
914 184 : InRecovery = true;
915 1270 : else if (ArchiveRecoveryRequested)
916 : {
917 : /* force recovery due to presence of recovery signal file */
918 14 : InRecovery = true;
919 : }
920 :
921 : /*
922 : * If recovery is needed, update our in-memory copy of pg_control to show
923 : * that we are recovering and to show the selected checkpoint as the place
924 : * we are starting from. We also mark pg_control with any minimum recovery
925 : * stop point obtained from a backup history file.
926 : *
927 : * We don't write the changes to disk yet, though. Only do that after
928 : * initializing various subsystems.
929 : */
930 1668 : if (InRecovery)
931 : {
932 412 : if (InArchiveRecovery)
933 : {
934 212 : ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
935 : }
936 : else
937 : {
938 200 : ereport(LOG,
939 : (errmsg("database system was not properly shut down; "
940 : "automatic recovery in progress")));
941 200 : if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
942 4 : ereport(LOG,
943 : (errmsg("crash recovery starts in timeline %u "
944 : "and has target timeline %u",
945 : ControlFile->checkPointCopy.ThisTimeLineID,
946 : recoveryTargetTLI)));
947 200 : ControlFile->state = DB_IN_CRASH_RECOVERY;
948 : }
949 412 : ControlFile->checkPoint = CheckPointLoc;
950 412 : ControlFile->checkPointCopy = checkPoint;
951 412 : if (InArchiveRecovery)
952 : {
953 : /* initialize minRecoveryPoint if not set yet */
954 212 : if (ControlFile->minRecoveryPoint < checkPoint.redo)
955 : {
956 144 : ControlFile->minRecoveryPoint = checkPoint.redo;
957 144 : ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
958 : }
959 : }
960 :
961 : /*
962 : * Set backupStartPoint if we're starting recovery from a base backup.
963 : *
964 : * Also set backupEndPoint and use minRecoveryPoint as the backup end
965 : * location if we're starting recovery from a base backup which was
966 : * taken from a standby. In this case, the database system status in
967 : * pg_control must indicate that the database was already in recovery.
968 : * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
969 : * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
970 : * before reaching this point; e.g. because restore_command or
971 : * primary_conninfo were faulty.
972 : *
973 : * Any other state indicates that the backup somehow became corrupted
974 : * and we can't sensibly continue with recovery.
975 : */
976 412 : if (haveBackupLabel)
977 : {
978 138 : ControlFile->backupStartPoint = checkPoint.redo;
979 138 : ControlFile->backupEndRequired = backupEndRequired;
980 :
981 138 : if (backupFromStandby)
982 : {
983 8 : if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
984 : dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
985 0 : ereport(FATAL,
986 : (errmsg("backup_label contains data inconsistent with control file"),
987 : errhint("This means that the backup is corrupted and you will "
988 : "have to use another backup for recovery.")));
989 8 : ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
990 : }
991 : }
992 : }
993 :
994 : /* remember these, so that we know when we have reached consistency */
995 1668 : backupStartPoint = ControlFile->backupStartPoint;
996 1668 : backupEndRequired = ControlFile->backupEndRequired;
997 1668 : backupEndPoint = ControlFile->backupEndPoint;
998 1668 : if (InArchiveRecovery)
999 : {
1000 212 : minRecoveryPoint = ControlFile->minRecoveryPoint;
1001 212 : minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
1002 : }
1003 : else
1004 : {
1005 1456 : minRecoveryPoint = InvalidXLogRecPtr;
1006 1456 : minRecoveryPointTLI = 0;
1007 : }
1008 :
1009 : /*
1010 : * Start recovery assuming that the final record isn't lost.
1011 : */
1012 1668 : abortedRecPtr = InvalidXLogRecPtr;
1013 1668 : missingContrecPtr = InvalidXLogRecPtr;
1014 :
1015 1668 : *wasShutdown_ptr = wasShutdown;
1016 1668 : *haveBackupLabel_ptr = haveBackupLabel;
1017 1668 : *haveTblspcMap_ptr = haveTblspcMap;
1018 1668 : }
1019 :
1020 : /*
1021 : * See if there are any recovery signal files and if so, set state for
1022 : * recovery.
1023 : *
1024 : * See if there is a recovery command file (recovery.conf), and if so
1025 : * throw an ERROR since as of PG12 we no longer recognize that.
1026 : */
1027 : static void
1028 1668 : readRecoverySignalFile(void)
1029 : {
1030 : struct stat stat_buf;
1031 :
1032 1668 : if (IsBootstrapProcessingMode())
1033 1464 : return;
1034 :
1035 : /*
1036 : * Check for old recovery API file: recovery.conf
1037 : */
1038 1578 : if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1039 0 : ereport(FATAL,
1040 : (errcode_for_file_access(),
1041 : errmsg("using recovery command file \"%s\" is not supported",
1042 : RECOVERY_COMMAND_FILE)));
1043 :
1044 : /*
1045 : * Remove unused .done file, if present. Ignore if absent.
1046 : */
1047 1578 : unlink(RECOVERY_COMMAND_DONE);
1048 :
1049 : /*
1050 : * Check for recovery signal files and if found, fsync them since they
1051 : * represent server state information. We don't sweat too much about the
1052 : * possibility of fsync failure, however.
1053 : *
1054 : * If present, standby signal file takes precedence. If neither is present
1055 : * then we won't enter archive recovery.
1056 : */
1057 1578 : if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1058 : {
1059 : int fd;
1060 :
1061 194 : fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1062 : S_IRUSR | S_IWUSR);
1063 194 : if (fd >= 0)
1064 : {
1065 194 : (void) pg_fsync(fd);
1066 194 : close(fd);
1067 : }
1068 194 : standby_signal_file_found = true;
1069 : }
1070 1384 : else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1071 : {
1072 : int fd;
1073 :
1074 10 : fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1075 : S_IRUSR | S_IWUSR);
1076 10 : if (fd >= 0)
1077 : {
1078 10 : (void) pg_fsync(fd);
1079 10 : close(fd);
1080 : }
1081 10 : recovery_signal_file_found = true;
1082 : }
1083 :
1084 1578 : StandbyModeRequested = false;
1085 1578 : ArchiveRecoveryRequested = false;
1086 1578 : if (standby_signal_file_found)
1087 : {
1088 194 : StandbyModeRequested = true;
1089 194 : ArchiveRecoveryRequested = true;
1090 : }
1091 1384 : else if (recovery_signal_file_found)
1092 : {
1093 10 : StandbyModeRequested = false;
1094 10 : ArchiveRecoveryRequested = true;
1095 : }
1096 : else
1097 1374 : return;
1098 :
1099 : /*
1100 : * We don't support standby mode in standalone backends; that requires
1101 : * other processes such as the WAL receiver to be alive.
1102 : */
1103 204 : if (StandbyModeRequested && !IsUnderPostmaster)
1104 0 : ereport(FATAL,
1105 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1106 : errmsg("standby mode is not supported by single-user servers")));
1107 : }
1108 :
1109 : static void
1110 1668 : validateRecoveryParameters(void)
1111 : {
1112 1668 : if (!ArchiveRecoveryRequested)
1113 1464 : return;
1114 :
1115 : /*
1116 : * Check for compulsory parameters
1117 : */
1118 204 : if (StandbyModeRequested)
1119 : {
1120 194 : if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1121 20 : (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1122 4 : ereport(WARNING,
1123 : (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1124 : errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1125 : }
1126 : else
1127 : {
1128 10 : if (recoveryRestoreCommand == NULL ||
1129 10 : strcmp(recoveryRestoreCommand, "") == 0)
1130 0 : ereport(FATAL,
1131 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1132 : errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1133 : }
1134 :
1135 : /*
1136 : * Override any inconsistent requests. Note that this is a change of
1137 : * behaviour in 9.5; prior to this we simply ignored a request to pause if
1138 : * hot_standby = off, which was surprising behaviour.
1139 : */
1140 204 : if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
1141 190 : !EnableHotStandby)
1142 4 : recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
1143 :
1144 : /*
1145 : * Final parsing of recovery_target_time string; see also
1146 : * check_recovery_target_time().
1147 : */
1148 204 : if (recoveryTarget == RECOVERY_TARGET_TIME)
1149 : {
1150 0 : recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
1151 : CStringGetDatum(recovery_target_time_string),
1152 : ObjectIdGetDatum(InvalidOid),
1153 : Int32GetDatum(-1)));
1154 : }
1155 :
1156 : /*
1157 : * If user specified recovery_target_timeline, validate it or compute the
1158 : * "latest" value. We can't do this until after we've gotten the restore
1159 : * command and set InArchiveRecovery, because we need to fetch timeline
1160 : * history files from the archive.
1161 : */
1162 204 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
1163 : {
1164 0 : TimeLineID rtli = recoveryTargetTLIRequested;
1165 :
1166 : /* Timeline 1 does not have a history file, all else should */
1167 0 : if (rtli != 1 && !existsTimeLineHistory(rtli))
1168 0 : ereport(FATAL,
1169 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1170 : errmsg("recovery target timeline %u does not exist",
1171 : rtli)));
1172 0 : recoveryTargetTLI = rtli;
1173 : }
1174 204 : else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
1175 : {
1176 : /* We start the "latest" search from pg_control's timeline */
1177 204 : recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
1178 : }
1179 : else
1180 : {
1181 : /*
1182 : * else we just use the recoveryTargetTLI as already read from
1183 : * ControlFile
1184 : */
1185 : Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
1186 : }
1187 : }
1188 :
1189 : /*
1190 : * read_backup_label: check to see if a backup_label file is present
1191 : *
1192 : * If we see a backup_label during recovery, we assume that we are recovering
1193 : * from a backup dump file, and we therefore roll forward from the checkpoint
1194 : * identified by the label file, NOT what pg_control says. This avoids the
1195 : * problem that pg_control might have been archived one or more checkpoints
1196 : * later than the start of the dump, and so if we rely on it as the start
1197 : * point, we will fail to restore a consistent database state.
1198 : *
1199 : * Returns true if a backup_label was found (and fills the checkpoint
1200 : * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1201 : * returns false if not. If this backup_label came from a streamed backup,
1202 : * *backupEndRequired is set to true. If this backup_label was created during
1203 : * recovery, *backupFromStandby is set to true.
1204 : *
1205 : * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1206 : * and TLI read from the backup file.
1207 : */
1208 : static bool
1209 1668 : read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1210 : bool *backupEndRequired, bool *backupFromStandby)
1211 : {
1212 : char startxlogfilename[MAXFNAMELEN];
1213 : TimeLineID tli_from_walseg,
1214 : tli_from_file;
1215 : FILE *lfp;
1216 : char ch;
1217 : char backuptype[20];
1218 : char backupfrom[20];
1219 : char backuplabel[MAXPGPATH];
1220 : char backuptime[128];
1221 : uint32 hi,
1222 : lo;
1223 :
1224 : /* suppress possible uninitialized-variable warnings */
1225 1668 : *checkPointLoc = InvalidXLogRecPtr;
1226 1668 : *backupLabelTLI = 0;
1227 1668 : *backupEndRequired = false;
1228 1668 : *backupFromStandby = false;
1229 :
1230 : /*
1231 : * See if label file is present
1232 : */
1233 1668 : lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1234 1668 : if (!lfp)
1235 : {
1236 1530 : if (errno != ENOENT)
1237 0 : ereport(FATAL,
1238 : (errcode_for_file_access(),
1239 : errmsg("could not read file \"%s\": %m",
1240 : BACKUP_LABEL_FILE)));
1241 1530 : return false; /* it's not there, all is fine */
1242 : }
1243 :
1244 : /*
1245 : * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1246 : * is pretty crude, but we are not expecting any variability in the file
1247 : * format).
1248 : */
1249 138 : if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1250 138 : &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1251 0 : ereport(FATAL,
1252 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1253 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1254 138 : RedoStartLSN = ((uint64) hi) << 32 | lo;
1255 138 : RedoStartTLI = tli_from_walseg;
1256 138 : if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
1257 138 : &hi, &lo, &ch) != 3 || ch != '\n')
1258 0 : ereport(FATAL,
1259 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1260 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1261 138 : *checkPointLoc = ((uint64) hi) << 32 | lo;
1262 138 : *backupLabelTLI = tli_from_walseg;
1263 :
1264 : /*
1265 : * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1266 : * which could mean either pg_basebackup or the pg_backup_start/stop
1267 : * method was used) or if this label came from somewhere else (the only
1268 : * other option today being from pg_rewind). If this was a streamed
1269 : * backup then we know that we need to play through until we get to the
1270 : * end of the WAL which was generated during the backup (at which point we
1271 : * will have reached consistency and backupEndRequired will be reset to be
1272 : * false).
1273 : */
1274 138 : if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1275 : {
1276 138 : if (strcmp(backuptype, "streamed") == 0)
1277 136 : *backupEndRequired = true;
1278 : }
1279 :
1280 : /*
1281 : * BACKUP FROM lets us know if this was from a primary or a standby. If
1282 : * it was from a standby, we'll double-check that the control file state
1283 : * matches that of a standby.
1284 : */
1285 138 : if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1286 : {
1287 138 : if (strcmp(backupfrom, "standby") == 0)
1288 8 : *backupFromStandby = true;
1289 : }
1290 :
1291 : /*
1292 : * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1293 : * but checking for their presence is useful for debugging and the next
1294 : * sanity checks. Cope also with the fact that the result buffers have a
1295 : * pre-allocated size, hence if the backup_label file has been generated
1296 : * with strings longer than the maximum assumed here an incorrect parsing
1297 : * happens. That's fine as only minor consistency checks are done
1298 : * afterwards.
1299 : */
1300 138 : if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1301 138 : ereport(DEBUG1,
1302 : (errmsg_internal("backup time %s in file \"%s\"",
1303 : backuptime, BACKUP_LABEL_FILE)));
1304 :
1305 138 : if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1306 136 : ereport(DEBUG1,
1307 : (errmsg_internal("backup label %s in file \"%s\"",
1308 : backuplabel, BACKUP_LABEL_FILE)));
1309 :
1310 : /*
1311 : * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1312 : * it as a sanity check if present.
1313 : */
1314 138 : if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1315 : {
1316 136 : if (tli_from_walseg != tli_from_file)
1317 0 : ereport(FATAL,
1318 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1319 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1320 : errdetail("Timeline ID parsed is %u, but expected %u.",
1321 : tli_from_file, tli_from_walseg)));
1322 :
1323 136 : ereport(DEBUG1,
1324 : (errmsg_internal("backup timeline %u in file \"%s\"",
1325 : tli_from_file, BACKUP_LABEL_FILE)));
1326 : }
1327 :
1328 138 : if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0)
1329 0 : ereport(FATAL,
1330 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1331 : errmsg("this is an incremental backup, not a data directory"),
1332 : errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1333 :
1334 138 : if (ferror(lfp) || FreeFile(lfp))
1335 0 : ereport(FATAL,
1336 : (errcode_for_file_access(),
1337 : errmsg("could not read file \"%s\": %m",
1338 : BACKUP_LABEL_FILE)));
1339 :
1340 138 : return true;
1341 : }
1342 :
1343 : /*
1344 : * read_tablespace_map: check to see if a tablespace_map file is present
1345 : *
1346 : * If we see a tablespace_map file during recovery, we assume that we are
1347 : * recovering from a backup dump file, and we therefore need to create symlinks
1348 : * as per the information present in tablespace_map file.
1349 : *
1350 : * Returns true if a tablespace_map file was found (and fills *tablespaces
1351 : * with a tablespaceinfo struct for each tablespace listed in the file);
1352 : * returns false if not.
1353 : */
1354 : static bool
1355 138 : read_tablespace_map(List **tablespaces)
1356 : {
1357 : tablespaceinfo *ti;
1358 : FILE *lfp;
1359 : char str[MAXPGPATH];
1360 : int ch,
1361 : i,
1362 : n;
1363 : bool was_backslash;
1364 :
1365 : /*
1366 : * See if tablespace_map file is present
1367 : */
1368 138 : lfp = AllocateFile(TABLESPACE_MAP, "r");
1369 138 : if (!lfp)
1370 : {
1371 134 : if (errno != ENOENT)
1372 0 : ereport(FATAL,
1373 : (errcode_for_file_access(),
1374 : errmsg("could not read file \"%s\": %m",
1375 : TABLESPACE_MAP)));
1376 134 : return false; /* it's not there, all is fine */
1377 : }
1378 :
1379 : /*
1380 : * Read and parse the link name and path lines from tablespace_map file
1381 : * (this code is pretty crude, but we are not expecting any variability in
1382 : * the file format). De-escape any backslashes that were inserted.
1383 : */
1384 4 : i = 0;
1385 4 : was_backslash = false;
1386 154 : while ((ch = fgetc(lfp)) != EOF)
1387 : {
1388 150 : if (!was_backslash && (ch == '\n' || ch == '\r'))
1389 : {
1390 : char *endp;
1391 :
1392 4 : if (i == 0)
1393 0 : continue; /* \r immediately followed by \n */
1394 :
1395 : /*
1396 : * The de-escaped line should contain an OID followed by exactly
1397 : * one space followed by a path. The path might start with
1398 : * spaces, so don't be too liberal about parsing.
1399 : */
1400 4 : str[i] = '\0';
1401 4 : n = 0;
1402 24 : while (str[n] && str[n] != ' ')
1403 20 : n++;
1404 4 : if (n < 1 || n >= i - 1)
1405 0 : ereport(FATAL,
1406 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1407 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1408 4 : str[n++] = '\0';
1409 :
1410 4 : ti = palloc0(sizeof(tablespaceinfo));
1411 4 : errno = 0;
1412 4 : ti->oid = strtoul(str, &endp, 10);
1413 4 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1414 0 : ereport(FATAL,
1415 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1416 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1417 4 : ti->path = pstrdup(str + n);
1418 4 : *tablespaces = lappend(*tablespaces, ti);
1419 :
1420 4 : i = 0;
1421 4 : continue;
1422 : }
1423 146 : else if (!was_backslash && ch == '\\')
1424 0 : was_backslash = true;
1425 : else
1426 : {
1427 146 : if (i < sizeof(str) - 1)
1428 146 : str[i++] = ch;
1429 146 : was_backslash = false;
1430 : }
1431 : }
1432 :
1433 4 : if (i != 0 || was_backslash) /* last line not terminated? */
1434 0 : ereport(FATAL,
1435 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1436 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1437 :
1438 4 : if (ferror(lfp) || FreeFile(lfp))
1439 0 : ereport(FATAL,
1440 : (errcode_for_file_access(),
1441 : errmsg("could not read file \"%s\": %m",
1442 : TABLESPACE_MAP)));
1443 :
1444 4 : return true;
1445 : }
1446 :
1447 : /*
1448 : * Finish WAL recovery.
1449 : *
1450 : * This does not close the 'xlogreader' yet, because in some cases the caller
1451 : * still wants to re-read the last checkpoint record by calling
1452 : * ReadCheckpointRecord().
1453 : *
1454 : * Returns the position of the last valid or applied record, after which new
1455 : * WAL should be appended, information about why recovery was ended, and some
1456 : * other things. See the EndOfWalRecoveryInfo struct for details.
1457 : */
1458 : EndOfWalRecoveryInfo *
1459 1562 : FinishWalRecovery(void)
1460 : {
1461 1562 : EndOfWalRecoveryInfo *result = palloc(sizeof(EndOfWalRecoveryInfo));
1462 : XLogRecPtr lastRec;
1463 : TimeLineID lastRecTLI;
1464 : XLogRecPtr endOfLog;
1465 :
1466 : /*
1467 : * Kill WAL receiver, if it's still running, before we continue to write
1468 : * the startup checkpoint and aborted-contrecord records. It will trump
1469 : * over these records and subsequent ones if it's still alive when we
1470 : * start writing WAL.
1471 : */
1472 1562 : XLogShutdownWalRcv();
1473 :
1474 : /*
1475 : * Shutdown the slot sync worker to drop any temporary slots acquired by
1476 : * it and to prevent it from keep trying to fetch the failover slots.
1477 : *
1478 : * We do not update the 'synced' column in 'pg_replication_slots' system
1479 : * view from true to false here, as any failed update could leave 'synced'
1480 : * column false for some slots. This could cause issues during slot sync
1481 : * after restarting the server as a standby. While updating the 'synced'
1482 : * column after switching to the new timeline is an option, it does not
1483 : * simplify the handling for the 'synced' column. Therefore, we retain the
1484 : * 'synced' column as true after promotion as it may provide useful
1485 : * information about the slot origin.
1486 : */
1487 1562 : ShutDownSlotSync();
1488 :
1489 : /*
1490 : * We are now done reading the xlog from stream. Turn off streaming
1491 : * recovery to force fetching the files (which would be required at end of
1492 : * recovery, e.g., timeline history file) from archive or pg_wal.
1493 : *
1494 : * Note that standby mode must be turned off after killing WAL receiver,
1495 : * i.e., calling XLogShutdownWalRcv().
1496 : */
1497 : Assert(!WalRcvStreaming());
1498 1562 : StandbyMode = false;
1499 :
1500 : /*
1501 : * Determine where to start writing WAL next.
1502 : *
1503 : * Re-fetch the last valid or last applied record, so we can identify the
1504 : * exact endpoint of what we consider the valid portion of WAL. There may
1505 : * be an incomplete continuation record after that, in which case
1506 : * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1507 : * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1508 : * it is intentionally missing. See CreateOverwriteContrecordRecord().
1509 : *
1510 : * An important side-effect of this is to load the last page into
1511 : * xlogreader. The caller uses it to initialize the WAL for writing.
1512 : */
1513 1562 : if (!InRecovery)
1514 : {
1515 1256 : lastRec = CheckPointLoc;
1516 1256 : lastRecTLI = CheckPointTLI;
1517 : }
1518 : else
1519 : {
1520 306 : lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
1521 306 : lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1522 : }
1523 1562 : XLogPrefetcherBeginRead(xlogprefetcher, lastRec);
1524 1562 : (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1525 1562 : endOfLog = xlogreader->EndRecPtr;
1526 :
1527 : /*
1528 : * Remember the TLI in the filename of the XLOG segment containing the
1529 : * end-of-log. It could be different from the timeline that endOfLog
1530 : * nominally belongs to, if there was a timeline switch in that segment,
1531 : * and we were reading the old WAL from a segment belonging to a higher
1532 : * timeline.
1533 : */
1534 1562 : result->endOfLogTLI = xlogreader->seg.ws_tli;
1535 :
1536 1562 : if (ArchiveRecoveryRequested)
1537 : {
1538 : /*
1539 : * We are no longer in archive recovery state.
1540 : *
1541 : * We are now done reading the old WAL. Turn off archive fetching if
1542 : * it was active.
1543 : */
1544 : Assert(InArchiveRecovery);
1545 98 : InArchiveRecovery = false;
1546 :
1547 : /*
1548 : * If the ending log segment is still open, close it (to avoid
1549 : * problems on Windows with trying to rename or delete an open file).
1550 : */
1551 98 : if (readFile >= 0)
1552 : {
1553 98 : close(readFile);
1554 98 : readFile = -1;
1555 : }
1556 : }
1557 :
1558 : /*
1559 : * Copy the last partial block to the caller, for initializing the WAL
1560 : * buffer for appending new WAL.
1561 : */
1562 1562 : if (endOfLog % XLOG_BLCKSZ != 0)
1563 : {
1564 : char *page;
1565 : int len;
1566 : XLogRecPtr pageBeginPtr;
1567 :
1568 1524 : pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1569 : Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1570 :
1571 : /* Copy the valid part of the last block */
1572 1524 : len = endOfLog % XLOG_BLCKSZ;
1573 1524 : page = palloc(len);
1574 1524 : memcpy(page, xlogreader->readBuf, len);
1575 :
1576 1524 : result->lastPageBeginPtr = pageBeginPtr;
1577 1524 : result->lastPage = page;
1578 : }
1579 : else
1580 : {
1581 : /* There is no partial block to copy. */
1582 38 : result->lastPageBeginPtr = endOfLog;
1583 38 : result->lastPage = NULL;
1584 : }
1585 :
1586 : /*
1587 : * Create a comment for the history file to explain why and where timeline
1588 : * changed.
1589 : */
1590 1562 : result->recoveryStopReason = getRecoveryStopReason();
1591 :
1592 1562 : result->lastRec = lastRec;
1593 1562 : result->lastRecTLI = lastRecTLI;
1594 1562 : result->endOfLog = endOfLog;
1595 :
1596 1562 : result->abortedRecPtr = abortedRecPtr;
1597 1562 : result->missingContrecPtr = missingContrecPtr;
1598 :
1599 1562 : result->standby_signal_file_found = standby_signal_file_found;
1600 1562 : result->recovery_signal_file_found = recovery_signal_file_found;
1601 :
1602 1562 : return result;
1603 : }
1604 :
1605 : /*
1606 : * Clean up the WAL reader and leftovers from restoring WAL from archive
1607 : */
1608 : void
1609 1562 : ShutdownWalRecovery(void)
1610 : {
1611 : char recoveryPath[MAXPGPATH];
1612 :
1613 : /* Final update of pg_stat_recovery_prefetch. */
1614 1562 : XLogPrefetcherComputeStats(xlogprefetcher);
1615 :
1616 : /* Shut down xlogreader */
1617 1562 : if (readFile >= 0)
1618 : {
1619 1464 : close(readFile);
1620 1464 : readFile = -1;
1621 : }
1622 1562 : XLogReaderFree(xlogreader);
1623 1562 : XLogPrefetcherFree(xlogprefetcher);
1624 :
1625 1562 : if (ArchiveRecoveryRequested)
1626 : {
1627 : /*
1628 : * Since there might be a partial WAL segment named RECOVERYXLOG, get
1629 : * rid of it.
1630 : */
1631 98 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1632 98 : unlink(recoveryPath); /* ignore any error */
1633 :
1634 : /* Get rid of any remaining recovered timeline-history file, too */
1635 98 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1636 98 : unlink(recoveryPath); /* ignore any error */
1637 : }
1638 :
1639 : /*
1640 : * We don't need the latch anymore. It's not strictly necessary to disown
1641 : * it, but let's do it for the sake of tidiness.
1642 : */
1643 1562 : if (ArchiveRecoveryRequested)
1644 98 : DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
1645 1562 : }
1646 :
1647 : /*
1648 : * Perform WAL recovery.
1649 : *
1650 : * If the system was shut down cleanly, this is never called.
1651 : */
1652 : void
1653 412 : PerformWalRecovery(void)
1654 : {
1655 : XLogRecord *record;
1656 412 : bool reachedRecoveryTarget = false;
1657 : TimeLineID replayTLI;
1658 :
1659 : /*
1660 : * Initialize shared variables for tracking progress of WAL replay, as if
1661 : * we had just replayed the record before the REDO location (or the
1662 : * checkpoint record itself, if it's a shutdown checkpoint).
1663 : */
1664 412 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1665 412 : if (RedoStartLSN < CheckPointLoc)
1666 : {
1667 214 : XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
1668 214 : XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
1669 214 : XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
1670 : }
1671 : else
1672 : {
1673 198 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
1674 198 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
1675 198 : XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
1676 : }
1677 412 : XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
1678 412 : XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
1679 412 : XLogRecoveryCtl->recoveryLastXTime = 0;
1680 412 : XLogRecoveryCtl->currentChunkStartTime = 0;
1681 412 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
1682 412 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1683 :
1684 : /* Also ensure XLogReceiptTime has a sane value */
1685 412 : XLogReceiptTime = GetCurrentTimestamp();
1686 :
1687 : /*
1688 : * Let postmaster know we've started redo now, so that it can launch the
1689 : * archiver if necessary.
1690 : */
1691 412 : if (IsUnderPostmaster)
1692 394 : SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
1693 :
1694 : /*
1695 : * Allow read-only connections immediately if we're consistent already.
1696 : */
1697 412 : CheckRecoveryConsistency();
1698 :
1699 : /*
1700 : * Find the first record that logically follows the checkpoint --- it
1701 : * might physically precede it, though.
1702 : */
1703 412 : if (RedoStartLSN < CheckPointLoc)
1704 : {
1705 : /* back up to find the record */
1706 214 : replayTLI = RedoStartTLI;
1707 214 : XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
1708 214 : record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1709 :
1710 : /*
1711 : * If a checkpoint record's redo pointer points back to an earlier
1712 : * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1713 : * record.
1714 : */
1715 214 : if (record->xl_rmid != RM_XLOG_ID ||
1716 214 : (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1717 0 : ereport(FATAL,
1718 : (errmsg("unexpected record type found at redo point %X/%X",
1719 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
1720 : }
1721 : else
1722 : {
1723 : /* just have to read next record after CheckPoint */
1724 : Assert(xlogreader->ReadRecPtr == CheckPointLoc);
1725 198 : replayTLI = CheckPointTLI;
1726 198 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1727 : }
1728 :
1729 412 : if (record != NULL)
1730 : {
1731 : TimestampTz xtime;
1732 : PGRUsage ru0;
1733 :
1734 394 : pg_rusage_init(&ru0);
1735 :
1736 394 : InRedo = true;
1737 :
1738 394 : RmgrStartup();
1739 :
1740 394 : ereport(LOG,
1741 : (errmsg("redo starts at %X/%X",
1742 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
1743 :
1744 : /* Prepare to report progress of the redo phase. */
1745 394 : if (!StandbyMode)
1746 210 : begin_startup_progress_phase();
1747 :
1748 : /*
1749 : * main redo apply loop
1750 : */
1751 : do
1752 : {
1753 5361634 : if (!StandbyMode)
1754 527588 : ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
1755 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
1756 :
1757 : #ifdef WAL_DEBUG
1758 : if (XLOG_DEBUG)
1759 : {
1760 : StringInfoData buf;
1761 :
1762 : initStringInfo(&buf);
1763 : appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
1764 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1765 : LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
1766 : xlog_outrec(&buf, xlogreader);
1767 : appendStringInfoString(&buf, " - ");
1768 : xlog_outdesc(&buf, xlogreader);
1769 : elog(LOG, "%s", buf.data);
1770 : pfree(buf.data);
1771 : }
1772 : #endif
1773 :
1774 : /* Handle interrupt signals of startup process */
1775 5361634 : HandleStartupProcInterrupts();
1776 :
1777 : /*
1778 : * Pause WAL replay, if requested by a hot-standby session via
1779 : * SetRecoveryPause().
1780 : *
1781 : * Note that we intentionally don't take the info_lck spinlock
1782 : * here. We might therefore read a slightly stale value of the
1783 : * recoveryPause flag, but it can't be very stale (no worse than
1784 : * the last spinlock we did acquire). Since a pause request is a
1785 : * pretty asynchronous thing anyway, possibly responding to it one
1786 : * WAL record later than we otherwise would is a minor issue, so
1787 : * it doesn't seem worth adding another spinlock cycle to prevent
1788 : * that.
1789 : */
1790 5361634 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1791 : RECOVERY_NOT_PAUSED)
1792 0 : recoveryPausesHere(false);
1793 :
1794 : /*
1795 : * Have we reached our recovery target?
1796 : */
1797 5361634 : if (recoveryStopsBefore(xlogreader))
1798 : {
1799 2 : reachedRecoveryTarget = true;
1800 2 : break;
1801 : }
1802 :
1803 : /*
1804 : * If we've been asked to lag the primary, wait on latch until
1805 : * enough time has passed.
1806 : */
1807 5361632 : if (recoveryApplyDelay(xlogreader))
1808 : {
1809 : /*
1810 : * We test for paused recovery again here. If user sets
1811 : * delayed apply, it may be because they expect to pause
1812 : * recovery in case of problems, so we must test again here
1813 : * otherwise pausing during the delay-wait wouldn't work.
1814 : */
1815 0 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1816 : RECOVERY_NOT_PAUSED)
1817 0 : recoveryPausesHere(false);
1818 : }
1819 :
1820 : /*
1821 : * Apply the record
1822 : */
1823 5361632 : ApplyWalRecord(xlogreader, record, &replayTLI);
1824 :
1825 : /* Exit loop if we reached inclusive recovery target */
1826 5361628 : if (recoveryStopsAfter(xlogreader))
1827 : {
1828 12 : reachedRecoveryTarget = true;
1829 12 : break;
1830 : }
1831 :
1832 : /* Else, try to fetch the next WAL record */
1833 5361616 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1834 5361516 : } while (record != NULL);
1835 :
1836 : /*
1837 : * end of main redo apply loop
1838 : */
1839 :
1840 290 : if (reachedRecoveryTarget)
1841 : {
1842 14 : if (!reachedConsistency)
1843 0 : ereport(FATAL,
1844 : (errmsg("requested recovery stop point is before consistent recovery point")));
1845 :
1846 : /*
1847 : * This is the last point where we can restart recovery with a new
1848 : * recovery target, if we shutdown and begin again. After this,
1849 : * Resource Managers may choose to do permanent corrective actions
1850 : * at end of recovery.
1851 : */
1852 14 : switch (recoveryTargetAction)
1853 : {
1854 0 : case RECOVERY_TARGET_ACTION_SHUTDOWN:
1855 :
1856 : /*
1857 : * exit with special return code to request shutdown of
1858 : * postmaster. Log messages issued from postmaster.
1859 : */
1860 0 : proc_exit(3);
1861 :
1862 2 : case RECOVERY_TARGET_ACTION_PAUSE:
1863 2 : SetRecoveryPause(true);
1864 2 : recoveryPausesHere(true);
1865 :
1866 : /* drop into promote */
1867 :
1868 14 : case RECOVERY_TARGET_ACTION_PROMOTE:
1869 14 : break;
1870 : }
1871 276 : }
1872 :
1873 290 : RmgrCleanup();
1874 :
1875 290 : ereport(LOG,
1876 : (errmsg("redo done at %X/%X system usage: %s",
1877 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1878 : pg_rusage_show(&ru0))));
1879 290 : xtime = GetLatestXTime();
1880 290 : if (xtime)
1881 70 : ereport(LOG,
1882 : (errmsg("last completed transaction was at log time %s",
1883 : timestamptz_to_str(xtime))));
1884 :
1885 290 : InRedo = false;
1886 : }
1887 : else
1888 : {
1889 : /* there are no WAL records following the checkpoint */
1890 18 : ereport(LOG,
1891 : (errmsg("redo is not required")));
1892 : }
1893 :
1894 : /*
1895 : * This check is intentionally after the above log messages that indicate
1896 : * how far recovery went.
1897 : */
1898 308 : if (ArchiveRecoveryRequested &&
1899 100 : recoveryTarget != RECOVERY_TARGET_UNSET &&
1900 16 : !reachedRecoveryTarget)
1901 2 : ereport(FATAL,
1902 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
1903 : errmsg("recovery ended before configured recovery target was reached")));
1904 306 : }
1905 :
1906 : /*
1907 : * Subroutine of PerformWalRecovery, to apply one WAL record.
1908 : */
1909 : static void
1910 5361632 : ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
1911 : {
1912 : ErrorContextCallback errcallback;
1913 5361632 : bool switchedTLI = false;
1914 :
1915 : /* Setup error traceback support for ereport() */
1916 5361632 : errcallback.callback = rm_redo_error_callback;
1917 5361632 : errcallback.arg = xlogreader;
1918 5361632 : errcallback.previous = error_context_stack;
1919 5361632 : error_context_stack = &errcallback;
1920 :
1921 : /*
1922 : * TransamVariables->nextXid must be beyond record's xid.
1923 : */
1924 5361632 : AdvanceNextFullTransactionIdPastXid(record->xl_xid);
1925 :
1926 : /*
1927 : * Before replaying this record, check if this record causes the current
1928 : * timeline to change. The record is already considered to be part of the
1929 : * new timeline, so we update replayTLI before replaying it. That's
1930 : * important so that replayEndTLI, which is recorded as the minimum
1931 : * recovery point's TLI if recovery stops after this record, is set
1932 : * correctly.
1933 : */
1934 5361632 : if (record->xl_rmid == RM_XLOG_ID)
1935 : {
1936 80116 : TimeLineID newReplayTLI = *replayTLI;
1937 80116 : TimeLineID prevReplayTLI = *replayTLI;
1938 80116 : uint8 info = record->xl_info & ~XLR_INFO_MASK;
1939 :
1940 80116 : if (info == XLOG_CHECKPOINT_SHUTDOWN)
1941 : {
1942 : CheckPoint checkPoint;
1943 :
1944 68 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1945 68 : newReplayTLI = checkPoint.ThisTimeLineID;
1946 68 : prevReplayTLI = checkPoint.PrevTimeLineID;
1947 : }
1948 80048 : else if (info == XLOG_END_OF_RECOVERY)
1949 : {
1950 : xl_end_of_recovery xlrec;
1951 :
1952 20 : memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1953 20 : newReplayTLI = xlrec.ThisTimeLineID;
1954 20 : prevReplayTLI = xlrec.PrevTimeLineID;
1955 : }
1956 :
1957 80116 : if (newReplayTLI != *replayTLI)
1958 : {
1959 : /* Check that it's OK to switch to this TLI */
1960 22 : checkTimeLineSwitch(xlogreader->EndRecPtr,
1961 : newReplayTLI, prevReplayTLI, *replayTLI);
1962 :
1963 : /* Following WAL records should be run with new TLI */
1964 22 : *replayTLI = newReplayTLI;
1965 22 : switchedTLI = true;
1966 : }
1967 : }
1968 :
1969 : /*
1970 : * Update shared replayEndRecPtr before replaying this record, so that
1971 : * XLogFlush will update minRecoveryPoint correctly.
1972 : */
1973 5361632 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1974 5361632 : XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
1975 5361632 : XLogRecoveryCtl->replayEndTLI = *replayTLI;
1976 5361632 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1977 :
1978 : /*
1979 : * If we are attempting to enter Hot Standby mode, process XIDs we see
1980 : */
1981 5361632 : if (standbyState >= STANDBY_INITIALIZED &&
1982 4873944 : TransactionIdIsValid(record->xl_xid))
1983 4786806 : RecordKnownAssignedTransactionIds(record->xl_xid);
1984 :
1985 : /*
1986 : * Some XLOG record types that are related to recovery are processed
1987 : * directly here, rather than in xlog_redo()
1988 : */
1989 5361632 : if (record->xl_rmid == RM_XLOG_ID)
1990 80116 : xlogrecovery_redo(xlogreader, *replayTLI);
1991 :
1992 : /* Now apply the WAL record itself */
1993 5361632 : GetRmgr(record->xl_rmid).rm_redo(xlogreader);
1994 :
1995 : /*
1996 : * After redo, check whether the backup pages associated with the WAL
1997 : * record are consistent with the existing pages. This check is done only
1998 : * if consistency check is enabled for this record.
1999 : */
2000 5361628 : if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2001 4217628 : verifyBackupPageConsistency(xlogreader);
2002 :
2003 : /* Pop the error context stack */
2004 5361628 : error_context_stack = errcallback.previous;
2005 :
2006 : /*
2007 : * Update lastReplayedEndRecPtr after this record has been successfully
2008 : * replayed.
2009 : */
2010 5361628 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2011 5361628 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
2012 5361628 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
2013 5361628 : XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2014 5361628 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2015 :
2016 : /* ------
2017 : * Wakeup walsenders:
2018 : *
2019 : * On the standby, the WAL is flushed first (which will only wake up
2020 : * physical walsenders) and then applied, which will only wake up logical
2021 : * walsenders.
2022 : *
2023 : * Indeed, logical walsenders on standby can't decode and send data until
2024 : * it's been applied.
2025 : *
2026 : * Physical walsenders don't need to be woken up during replay unless
2027 : * cascading replication is allowed and time line change occurred (so that
2028 : * they can notice that they are on a new time line).
2029 : *
2030 : * That's why the wake up conditions are for:
2031 : *
2032 : * - physical walsenders in case of new time line and cascade
2033 : * replication is allowed
2034 : * - logical walsenders in case cascade replication is allowed (could not
2035 : * be created otherwise)
2036 : * ------
2037 : */
2038 5361628 : if (AllowCascadeReplication())
2039 4983302 : WalSndWakeup(switchedTLI, true);
2040 :
2041 : /*
2042 : * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2043 : * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2044 : * a reply to the primary.
2045 : */
2046 5361628 : if (doRequestWalReceiverReply)
2047 : {
2048 4 : doRequestWalReceiverReply = false;
2049 4 : WalRcvForceReply();
2050 : }
2051 :
2052 : /* Allow read-only connections if we're consistent now */
2053 5361628 : CheckRecoveryConsistency();
2054 :
2055 : /* Is this a timeline switch? */
2056 5361628 : if (switchedTLI)
2057 : {
2058 : /*
2059 : * Before we continue on the new timeline, clean up any (possibly
2060 : * bogus) future WAL segments on the old timeline.
2061 : */
2062 22 : RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
2063 :
2064 : /* Reset the prefetcher. */
2065 22 : XLogPrefetchReconfigure();
2066 : }
2067 5361628 : }
2068 :
2069 : /*
2070 : * Some XLOG RM record types that are directly related to WAL recovery are
2071 : * handled here rather than in the xlog_redo()
2072 : */
2073 : static void
2074 80116 : xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
2075 : {
2076 80116 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2077 80116 : XLogRecPtr lsn = record->EndRecPtr;
2078 :
2079 : Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2080 :
2081 80116 : if (info == XLOG_OVERWRITE_CONTRECORD)
2082 : {
2083 : /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2084 : xl_overwrite_contrecord xlrec;
2085 :
2086 2 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2087 2 : if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2088 0 : elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
2089 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2090 : LSN_FORMAT_ARGS(record->overwrittenRecPtr));
2091 :
2092 : /* We have safely skipped the aborted record */
2093 2 : abortedRecPtr = InvalidXLogRecPtr;
2094 2 : missingContrecPtr = InvalidXLogRecPtr;
2095 :
2096 2 : ereport(LOG,
2097 : (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
2098 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2099 : timestamptz_to_str(xlrec.overwrite_time))));
2100 :
2101 : /* Verifying the record should only happen once */
2102 2 : record->overwrittenRecPtr = InvalidXLogRecPtr;
2103 : }
2104 80114 : else if (info == XLOG_BACKUP_END)
2105 : {
2106 : XLogRecPtr startpoint;
2107 :
2108 164 : memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2109 :
2110 164 : if (backupStartPoint == startpoint)
2111 : {
2112 : /*
2113 : * We have reached the end of base backup, the point where
2114 : * pg_backup_stop() was done. The data on disk is now consistent
2115 : * (assuming we have also reached minRecoveryPoint). Set
2116 : * backupEndPoint to the current LSN, so that the next call to
2117 : * CheckRecoveryConsistency() will notice it and do the
2118 : * end-of-backup processing.
2119 : */
2120 134 : elog(DEBUG1, "end of backup record reached");
2121 :
2122 134 : backupEndPoint = lsn;
2123 : }
2124 : else
2125 30 : elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
2126 : LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
2127 : }
2128 80116 : }
2129 :
2130 : /*
2131 : * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2132 : * directories.
2133 : *
2134 : * Replay of database creation XLOG records for databases that were later
2135 : * dropped can create fake directories in pg_tblspc. By the time consistency
2136 : * is reached these directories should have been removed; here we verify
2137 : * that this did indeed happen. This is to be called at the point where
2138 : * consistent state is reached.
2139 : *
2140 : * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2141 : * useful for testing purposes, and also allows for an escape hatch in case
2142 : * things go south.
2143 : */
2144 : static void
2145 216 : CheckTablespaceDirectory(void)
2146 : {
2147 : DIR *dir;
2148 : struct dirent *de;
2149 :
2150 216 : dir = AllocateDir(PG_TBLSPC_DIR);
2151 662 : while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2152 : {
2153 : char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2154 :
2155 : /* Skip entries of non-oid names */
2156 446 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2157 432 : continue;
2158 :
2159 14 : snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2160 :
2161 14 : if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2162 8 : ereport(allow_in_place_tablespaces ? WARNING : PANIC,
2163 : (errcode(ERRCODE_DATA_CORRUPTED),
2164 : errmsg("unexpected directory entry \"%s\" found in %s",
2165 : de->d_name, PG_TBLSPC_DIR),
2166 : errdetail("All directory entries in %s/ should be symbolic links.",
2167 : PG_TBLSPC_DIR),
2168 : errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2169 : }
2170 216 : }
2171 :
2172 : /*
2173 : * Checks if recovery has reached a consistent state. When consistency is
2174 : * reached and we have a valid starting standby snapshot, tell postmaster
2175 : * that it can start accepting read-only connections.
2176 : */
2177 : static void
2178 5362044 : CheckRecoveryConsistency(void)
2179 : {
2180 : XLogRecPtr lastReplayedEndRecPtr;
2181 : TimeLineID lastReplayedTLI;
2182 :
2183 : /*
2184 : * During crash recovery, we don't reach a consistent state until we've
2185 : * replayed all the WAL.
2186 : */
2187 5362044 : if (XLogRecPtrIsInvalid(minRecoveryPoint))
2188 517360 : return;
2189 :
2190 : Assert(InArchiveRecovery);
2191 :
2192 : /*
2193 : * assume that we are called in the startup process, and hence don't need
2194 : * a lock to read lastReplayedEndRecPtr
2195 : */
2196 4844684 : lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2197 4844684 : lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2198 :
2199 : /*
2200 : * Have we reached the point where our base backup was completed?
2201 : */
2202 4844684 : if (!XLogRecPtrIsInvalid(backupEndPoint) &&
2203 198 : backupEndPoint <= lastReplayedEndRecPtr)
2204 : {
2205 138 : XLogRecPtr saveBackupStartPoint = backupStartPoint;
2206 138 : XLogRecPtr saveBackupEndPoint = backupEndPoint;
2207 :
2208 138 : elog(DEBUG1, "end of backup reached");
2209 :
2210 : /*
2211 : * We have reached the end of base backup, as indicated by pg_control.
2212 : * Update the control file accordingly.
2213 : */
2214 138 : ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2215 138 : backupStartPoint = InvalidXLogRecPtr;
2216 138 : backupEndPoint = InvalidXLogRecPtr;
2217 138 : backupEndRequired = false;
2218 :
2219 138 : ereport(LOG,
2220 : (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
2221 : LSN_FORMAT_ARGS(saveBackupStartPoint),
2222 : LSN_FORMAT_ARGS(saveBackupEndPoint))));
2223 : }
2224 :
2225 : /*
2226 : * Have we passed our safe starting point? Note that minRecoveryPoint is
2227 : * known to be incorrectly set if recovering from a backup, until the
2228 : * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2229 : * All we know prior to that is that we're not consistent yet.
2230 : */
2231 4844684 : if (!reachedConsistency && !backupEndRequired &&
2232 14306 : minRecoveryPoint <= lastReplayedEndRecPtr)
2233 : {
2234 : /*
2235 : * Check to see if the XLOG sequence contained any unresolved
2236 : * references to uninitialized pages.
2237 : */
2238 216 : XLogCheckInvalidPages();
2239 :
2240 : /*
2241 : * Check that pg_tblspc doesn't contain any real directories. Replay
2242 : * of Database/CREATE_* records may have created fictitious tablespace
2243 : * directories that should have been removed by the time consistency
2244 : * was reached.
2245 : */
2246 216 : CheckTablespaceDirectory();
2247 :
2248 216 : reachedConsistency = true;
2249 216 : ereport(LOG,
2250 : (errmsg("consistent recovery state reached at %X/%X",
2251 : LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
2252 : }
2253 :
2254 : /*
2255 : * Have we got a valid starting snapshot that will allow queries to be
2256 : * run? If so, we can tell postmaster that the database is consistent now,
2257 : * enabling connections.
2258 : */
2259 4844684 : if (standbyState == STANDBY_SNAPSHOT_READY &&
2260 4844254 : !LocalHotStandbyActive &&
2261 200 : reachedConsistency &&
2262 : IsUnderPostmaster)
2263 : {
2264 200 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2265 200 : XLogRecoveryCtl->SharedHotStandbyActive = true;
2266 200 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2267 :
2268 200 : LocalHotStandbyActive = true;
2269 :
2270 200 : SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
2271 : }
2272 : }
2273 :
2274 : /*
2275 : * Error context callback for errors occurring during rm_redo().
2276 : */
2277 : static void
2278 202 : rm_redo_error_callback(void *arg)
2279 : {
2280 202 : XLogReaderState *record = (XLogReaderState *) arg;
2281 : StringInfoData buf;
2282 :
2283 202 : initStringInfo(&buf);
2284 202 : xlog_outdesc(&buf, record);
2285 202 : xlog_block_info(&buf, record);
2286 :
2287 : /* translator: %s is a WAL record description */
2288 202 : errcontext("WAL redo at %X/%X for %s",
2289 202 : LSN_FORMAT_ARGS(record->ReadRecPtr),
2290 : buf.data);
2291 :
2292 202 : pfree(buf.data);
2293 202 : }
2294 :
2295 : /*
2296 : * Returns a string describing an XLogRecord, consisting of its identity
2297 : * optionally followed by a colon, a space, and a further description.
2298 : */
2299 : void
2300 202 : xlog_outdesc(StringInfo buf, XLogReaderState *record)
2301 : {
2302 202 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2303 202 : uint8 info = XLogRecGetInfo(record);
2304 : const char *id;
2305 :
2306 202 : appendStringInfoString(buf, rmgr.rm_name);
2307 202 : appendStringInfoChar(buf, '/');
2308 :
2309 202 : id = rmgr.rm_identify(info);
2310 202 : if (id == NULL)
2311 0 : appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2312 : else
2313 202 : appendStringInfo(buf, "%s: ", id);
2314 :
2315 202 : rmgr.rm_desc(buf, record);
2316 202 : }
2317 :
#ifdef WAL_DEBUG

/*
 * xlog_outrec
 *
 * Debugging aid, compiled only with WAL_DEBUG: append the record's prev
 * pointer, xid and data length to 'buf', followed by its block references.
 */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	XLogRecPtr	prevRecPtr = XLogRecGetPrev(record);

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 LSN_FORMAT_ARGS(prevRecPtr),
					 XLogRecGetXid(record));

	appendStringInfo(buf, "; len %u",
					 XLogRecGetDataLen(record));

	xlog_block_info(buf, record);
}
#endif							/* WAL_DEBUG */
2333 :
2334 : /*
2335 : * Returns a string giving information about all the blocks in an
2336 : * XLogRecord.
2337 : */
2338 : static void
2339 202 : xlog_block_info(StringInfo buf, XLogReaderState *record)
2340 : {
2341 : int block_id;
2342 :
2343 : /* decode block references */
2344 294 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2345 : {
2346 : RelFileLocator rlocator;
2347 : ForkNumber forknum;
2348 : BlockNumber blk;
2349 :
2350 92 : if (!XLogRecGetBlockTagExtended(record, block_id,
2351 : &rlocator, &forknum, &blk, NULL))
2352 0 : continue;
2353 :
2354 92 : if (forknum != MAIN_FORKNUM)
2355 4 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2356 : block_id,
2357 : rlocator.spcOid, rlocator.dbOid,
2358 : rlocator.relNumber,
2359 : forknum,
2360 : blk);
2361 : else
2362 88 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2363 : block_id,
2364 : rlocator.spcOid, rlocator.dbOid,
2365 : rlocator.relNumber,
2366 : blk);
2367 92 : if (XLogRecHasBlockImage(record, block_id))
2368 46 : appendStringInfoString(buf, " FPW");
2369 : }
2370 202 : }
2371 :
2372 :
2373 : /*
2374 : * Check that it's OK to switch to new timeline during recovery.
2375 : *
2376 : * 'lsn' is the address of the shutdown checkpoint record we're about to
2377 : * replay. (Currently, timeline can only change at a shutdown checkpoint).
2378 : */
2379 : static void
2380 22 : checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
2381 : TimeLineID replayTLI)
2382 : {
2383 : /* Check that the record agrees on what the current (old) timeline is */
2384 22 : if (prevTLI != replayTLI)
2385 0 : ereport(PANIC,
2386 : (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2387 : prevTLI, replayTLI)));
2388 :
2389 : /*
2390 : * The new timeline better be in the list of timelines we expect to see,
2391 : * according to the timeline history. It should also not decrease.
2392 : */
2393 22 : if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2394 0 : ereport(PANIC,
2395 : (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2396 : newTLI, replayTLI)));
2397 :
2398 : /*
2399 : * If we have not yet reached min recovery point, and we're about to
2400 : * switch to a timeline greater than the timeline of the min recovery
2401 : * point: trouble. After switching to the new timeline, we could not
2402 : * possibly visit the min recovery point on the correct timeline anymore.
2403 : * This can happen if there is a newer timeline in the archive that
2404 : * branched before the timeline the min recovery point is on, and you
2405 : * attempt to do PITR to the new timeline.
2406 : */
2407 22 : if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
2408 18 : lsn < minRecoveryPoint &&
2409 2 : newTLI > minRecoveryPointTLI)
2410 0 : ereport(PANIC,
2411 : (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2412 : newTLI,
2413 : LSN_FORMAT_ARGS(minRecoveryPoint),
2414 : minRecoveryPointTLI)));
2415 :
2416 : /* Looks good */
2417 22 : }
2418 :
2419 :
2420 : /*
2421 : * Extract timestamp from WAL record.
2422 : *
2423 : * If the record contains a timestamp, returns true, and saves the timestamp
2424 : * in *recordXtime. If the record type has no timestamp, returns false.
2425 : * Currently, only transaction commit/abort records and restore points contain
2426 : * timestamps.
2427 : */
2428 : static bool
2429 83096 : getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
2430 : {
2431 83096 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2432 83096 : uint8 xact_info = info & XLOG_XACT_OPMASK;
2433 83096 : uint8 rmid = XLogRecGetRmid(record);
2434 :
2435 83096 : if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2436 : {
2437 4 : *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2438 4 : return true;
2439 : }
2440 83092 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2441 : xact_info == XLOG_XACT_COMMIT_PREPARED))
2442 : {
2443 76172 : *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2444 76172 : return true;
2445 : }
2446 6920 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2447 : xact_info == XLOG_XACT_ABORT_PREPARED))
2448 : {
2449 6920 : *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2450 6920 : return true;
2451 : }
2452 0 : return false;
2453 : }
2454 :
2455 : /*
2456 : * Checks whether the current buffer page and backup page stored in the
2457 : * WAL record are consistent or not. Before comparing the two pages, a
2458 : * masking can be applied to the pages to ignore certain areas like hint bits,
2459 : * unused space between pd_lower and pd_upper among other things. This
2460 : * function should be called once WAL replay has been completed for a
2461 : * given record.
2462 : */
2463 : static void
2464 4217628 : verifyBackupPageConsistency(XLogReaderState *record)
2465 : {
2466 4217628 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2467 : RelFileLocator rlocator;
2468 : ForkNumber forknum;
2469 : BlockNumber blkno;
2470 : int block_id;
2471 :
2472 : /* Records with no backup blocks have no need for consistency checks. */
2473 4217628 : if (!XLogRecHasAnyBlockRefs(record))
2474 0 : return;
2475 :
2476 : Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2477 :
2478 8762696 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2479 : {
2480 : Buffer buf;
2481 : Page page;
2482 :
2483 4545068 : if (!XLogRecGetBlockTagExtended(record, block_id,
2484 : &rlocator, &forknum, &blkno, NULL))
2485 : {
2486 : /*
2487 : * WAL record doesn't contain a block reference with the given id.
2488 : * Do nothing.
2489 : */
2490 3998 : continue;
2491 : }
2492 :
2493 : Assert(XLogRecHasBlockImage(record, block_id));
2494 :
2495 4541070 : if (XLogRecBlockImageApply(record, block_id))
2496 : {
2497 : /*
2498 : * WAL record has already applied the page, so bypass the
2499 : * consistency check as that would result in comparing the full
2500 : * page stored in the record with itself.
2501 : */
2502 39976 : continue;
2503 : }
2504 :
2505 : /*
2506 : * Read the contents from the current buffer and store it in a
2507 : * temporary page.
2508 : */
2509 4501094 : buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2510 : RBM_NORMAL_NO_LOG,
2511 : InvalidBuffer);
2512 4501094 : if (!BufferIsValid(buf))
2513 0 : continue;
2514 :
2515 4501094 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2516 4501094 : page = BufferGetPage(buf);
2517 :
2518 : /*
2519 : * Take a copy of the local page where WAL has been applied to have a
2520 : * comparison base before masking it...
2521 : */
2522 4501094 : memcpy(replay_image_masked, page, BLCKSZ);
2523 :
2524 : /* No need for this page anymore now that a copy is in. */
2525 4501094 : UnlockReleaseBuffer(buf);
2526 :
2527 : /*
2528 : * If the block LSN is already ahead of this WAL record, we can't
2529 : * expect contents to match. This can happen if recovery is
2530 : * restarted.
2531 : */
2532 4501094 : if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2533 0 : continue;
2534 :
2535 : /*
2536 : * Read the contents from the backup copy, stored in WAL record and
2537 : * store it in a temporary page. There is no need to allocate a new
2538 : * page here, a local buffer is fine to hold its contents and a mask
2539 : * can be directly applied on it.
2540 : */
2541 4501094 : if (!RestoreBlockImage(record, block_id, primary_image_masked))
2542 0 : ereport(ERROR,
2543 : (errcode(ERRCODE_INTERNAL_ERROR),
2544 : errmsg_internal("%s", record->errormsg_buf)));
2545 :
2546 : /*
2547 : * If masking function is defined, mask both the primary and replay
2548 : * images
2549 : */
2550 4501094 : if (rmgr.rm_mask != NULL)
2551 : {
2552 4501094 : rmgr.rm_mask(replay_image_masked, blkno);
2553 4501094 : rmgr.rm_mask(primary_image_masked, blkno);
2554 : }
2555 :
2556 : /* Time to compare the primary and replay images. */
2557 4501094 : if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2558 : {
2559 0 : elog(FATAL,
2560 : "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2561 : rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2562 : forknum, blkno);
2563 : }
2564 : }
2565 : }
2566 :
2567 : /*
2568 : * For point-in-time recovery, this function decides whether we want to
2569 : * stop applying the XLOG before the current record.
2570 : *
2571 : * Returns true if we are stopping, false otherwise. If stopping, some
2572 : * information is saved in recoveryStopXid et al for use in annotating the
2573 : * new timeline's history file.
2574 : */
2575 : static bool
2576 5361634 : recoveryStopsBefore(XLogReaderState *record)
2577 : {
2578 5361634 : bool stopsHere = false;
2579 : uint8 xact_info;
2580 : bool isCommit;
2581 5361634 : TimestampTz recordXtime = 0;
2582 : TransactionId recordXid;
2583 :
2584 : /*
2585 : * Ignore recovery target settings when not in archive recovery (meaning
2586 : * we are in crash recovery).
2587 : */
2588 5361634 : if (!ArchiveRecoveryRequested)
2589 487660 : return false;
2590 :
2591 : /* Check if we should stop as soon as reaching consistency */
2592 4873974 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2593 : {
2594 0 : ereport(LOG,
2595 : (errmsg("recovery stopping after reaching consistency")));
2596 :
2597 0 : recoveryStopAfter = false;
2598 0 : recoveryStopXid = InvalidTransactionId;
2599 0 : recoveryStopLSN = InvalidXLogRecPtr;
2600 0 : recoveryStopTime = 0;
2601 0 : recoveryStopName[0] = '\0';
2602 0 : return true;
2603 : }
2604 :
2605 : /* Check if target LSN has been reached */
2606 4873974 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2607 14268 : !recoveryTargetInclusive &&
2608 702 : record->ReadRecPtr >= recoveryTargetLSN)
2609 : {
2610 2 : recoveryStopAfter = false;
2611 2 : recoveryStopXid = InvalidTransactionId;
2612 2 : recoveryStopLSN = record->ReadRecPtr;
2613 2 : recoveryStopTime = 0;
2614 2 : recoveryStopName[0] = '\0';
2615 2 : ereport(LOG,
2616 : (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2617 : LSN_FORMAT_ARGS(recoveryStopLSN))));
2618 2 : return true;
2619 : }
2620 :
2621 : /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2622 4873972 : if (XLogRecGetRmid(record) != RM_XACT_ID)
2623 4831994 : return false;
2624 :
2625 41978 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2626 :
2627 41978 : if (xact_info == XLOG_XACT_COMMIT)
2628 : {
2629 38040 : isCommit = true;
2630 38040 : recordXid = XLogRecGetXid(record);
2631 : }
2632 3938 : else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2633 : {
2634 48 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2635 : xl_xact_parsed_commit parsed;
2636 :
2637 48 : isCommit = true;
2638 48 : ParseCommitRecord(XLogRecGetInfo(record),
2639 : xlrec,
2640 : &parsed);
2641 48 : recordXid = parsed.twophase_xid;
2642 : }
2643 3890 : else if (xact_info == XLOG_XACT_ABORT)
2644 : {
2645 3438 : isCommit = false;
2646 3438 : recordXid = XLogRecGetXid(record);
2647 : }
2648 452 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2649 : {
2650 22 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2651 : xl_xact_parsed_abort parsed;
2652 :
2653 22 : isCommit = false;
2654 22 : ParseAbortRecord(XLogRecGetInfo(record),
2655 : xlrec,
2656 : &parsed);
2657 22 : recordXid = parsed.twophase_xid;
2658 : }
2659 : else
2660 430 : return false;
2661 :
2662 41548 : if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
2663 : {
2664 : /*
2665 : * There can be only one transaction end record with this exact
2666 : * transactionid
2667 : *
2668 : * when testing for an xid, we MUST test for equality only, since
2669 : * transactions are numbered in the order they start, not the order
2670 : * they complete. A higher numbered xid will complete before you about
2671 : * 50% of the time...
2672 : */
2673 0 : stopsHere = (recordXid == recoveryTargetXid);
2674 : }
2675 :
2676 : /*
2677 : * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2678 : * We don't expect getRecordTimestamp ever to fail, since we already know
2679 : * this is a commit or abort record; but test its result anyway.
2680 : */
2681 41548 : if (getRecordTimestamp(record, &recordXtime) &&
2682 41548 : recoveryTarget == RECOVERY_TARGET_TIME)
2683 : {
2684 : /*
2685 : * There can be many transactions that share the same commit time, so
2686 : * we stop after the last one, if we are inclusive, or stop at the
2687 : * first one if we are exclusive
2688 : */
2689 0 : if (recoveryTargetInclusive)
2690 0 : stopsHere = (recordXtime > recoveryTargetTime);
2691 : else
2692 0 : stopsHere = (recordXtime >= recoveryTargetTime);
2693 : }
2694 :
2695 41548 : if (stopsHere)
2696 : {
2697 0 : recoveryStopAfter = false;
2698 0 : recoveryStopXid = recordXid;
2699 0 : recoveryStopTime = recordXtime;
2700 0 : recoveryStopLSN = InvalidXLogRecPtr;
2701 0 : recoveryStopName[0] = '\0';
2702 :
2703 0 : if (isCommit)
2704 : {
2705 0 : ereport(LOG,
2706 : (errmsg("recovery stopping before commit of transaction %u, time %s",
2707 : recoveryStopXid,
2708 : timestamptz_to_str(recoveryStopTime))));
2709 : }
2710 : else
2711 : {
2712 0 : ereport(LOG,
2713 : (errmsg("recovery stopping before abort of transaction %u, time %s",
2714 : recoveryStopXid,
2715 : timestamptz_to_str(recoveryStopTime))));
2716 : }
2717 : }
2718 :
2719 41548 : return stopsHere;
2720 : }
2721 :
2722 : /*
2723 : * Same as recoveryStopsBefore, but called after applying the record.
2724 : *
2725 : * We also track the timestamp of the latest applied COMMIT/ABORT
2726 : * record in XLogRecoveryCtl->recoveryLastXTime.
2727 : */
2728 : static bool
2729 5361628 : recoveryStopsAfter(XLogReaderState *record)
2730 : {
2731 : uint8 info;
2732 : uint8 xact_info;
2733 : uint8 rmid;
2734 5361628 : TimestampTz recordXtime = 0;
2735 :
2736 : /*
2737 : * Ignore recovery target settings when not in archive recovery (meaning
2738 : * we are in crash recovery).
2739 : */
2740 5361628 : if (!ArchiveRecoveryRequested)
2741 487660 : return false;
2742 :
2743 4873968 : info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2744 4873968 : rmid = XLogRecGetRmid(record);
2745 :
2746 : /*
2747 : * There can be many restore points that share the same name; we stop at
2748 : * the first one.
2749 : */
2750 4873968 : if (recoveryTarget == RECOVERY_TARGET_NAME &&
2751 40 : rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2752 : {
2753 : xl_restore_point *recordRestorePointData;
2754 :
2755 6 : recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2756 :
2757 6 : if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2758 : {
2759 4 : recoveryStopAfter = true;
2760 4 : recoveryStopXid = InvalidTransactionId;
2761 4 : recoveryStopLSN = InvalidXLogRecPtr;
2762 4 : (void) getRecordTimestamp(record, &recoveryStopTime);
2763 4 : strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2764 :
2765 4 : ereport(LOG,
2766 : (errmsg("recovery stopping at restore point \"%s\", time %s",
2767 : recoveryStopName,
2768 : timestamptz_to_str(recoveryStopTime))));
2769 4 : return true;
2770 : }
2771 : }
2772 :
2773 : /* Check if the target LSN has been reached */
2774 4873964 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2775 13566 : recoveryTargetInclusive &&
2776 13566 : record->ReadRecPtr >= recoveryTargetLSN)
2777 : {
2778 8 : recoveryStopAfter = true;
2779 8 : recoveryStopXid = InvalidTransactionId;
2780 8 : recoveryStopLSN = record->ReadRecPtr;
2781 8 : recoveryStopTime = 0;
2782 8 : recoveryStopName[0] = '\0';
2783 8 : ereport(LOG,
2784 : (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2785 : LSN_FORMAT_ARGS(recoveryStopLSN))));
2786 8 : return true;
2787 : }
2788 :
2789 4873956 : if (rmid != RM_XACT_ID)
2790 4831982 : return false;
2791 :
2792 41974 : xact_info = info & XLOG_XACT_OPMASK;
2793 :
2794 41974 : if (xact_info == XLOG_XACT_COMMIT ||
2795 3890 : xact_info == XLOG_XACT_COMMIT_PREPARED ||
2796 452 : xact_info == XLOG_XACT_ABORT ||
2797 : xact_info == XLOG_XACT_ABORT_PREPARED)
2798 : {
2799 : TransactionId recordXid;
2800 :
2801 : /* Update the last applied transaction timestamp */
2802 41544 : if (getRecordTimestamp(record, &recordXtime))
2803 41544 : SetLatestXTime(recordXtime);
2804 :
2805 : /* Extract the XID of the committed/aborted transaction */
2806 41544 : if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2807 : {
2808 48 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2809 : xl_xact_parsed_commit parsed;
2810 :
2811 48 : ParseCommitRecord(XLogRecGetInfo(record),
2812 : xlrec,
2813 : &parsed);
2814 48 : recordXid = parsed.twophase_xid;
2815 : }
2816 41496 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2817 : {
2818 22 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2819 : xl_xact_parsed_abort parsed;
2820 :
2821 22 : ParseAbortRecord(XLogRecGetInfo(record),
2822 : xlrec,
2823 : &parsed);
2824 22 : recordXid = parsed.twophase_xid;
2825 : }
2826 : else
2827 41474 : recordXid = XLogRecGetXid(record);
2828 :
2829 : /*
2830 : * There can be only one transaction end record with this exact
2831 : * transactionid
2832 : *
2833 : * when testing for an xid, we MUST test for equality only, since
2834 : * transactions are numbered in the order they start, not the order
2835 : * they complete. A higher numbered xid will complete before you about
2836 : * 50% of the time...
2837 : */
2838 41544 : if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
2839 0 : recordXid == recoveryTargetXid)
2840 : {
2841 0 : recoveryStopAfter = true;
2842 0 : recoveryStopXid = recordXid;
2843 0 : recoveryStopTime = recordXtime;
2844 0 : recoveryStopLSN = InvalidXLogRecPtr;
2845 0 : recoveryStopName[0] = '\0';
2846 :
2847 0 : if (xact_info == XLOG_XACT_COMMIT ||
2848 : xact_info == XLOG_XACT_COMMIT_PREPARED)
2849 : {
2850 0 : ereport(LOG,
2851 : (errmsg("recovery stopping after commit of transaction %u, time %s",
2852 : recoveryStopXid,
2853 : timestamptz_to_str(recoveryStopTime))));
2854 : }
2855 0 : else if (xact_info == XLOG_XACT_ABORT ||
2856 : xact_info == XLOG_XACT_ABORT_PREPARED)
2857 : {
2858 0 : ereport(LOG,
2859 : (errmsg("recovery stopping after abort of transaction %u, time %s",
2860 : recoveryStopXid,
2861 : timestamptz_to_str(recoveryStopTime))));
2862 : }
2863 0 : return true;
2864 : }
2865 : }
2866 :
2867 : /* Check if we should stop as soon as reaching consistency */
2868 41974 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2869 : {
2870 0 : ereport(LOG,
2871 : (errmsg("recovery stopping after reaching consistency")));
2872 :
2873 0 : recoveryStopAfter = true;
2874 0 : recoveryStopXid = InvalidTransactionId;
2875 0 : recoveryStopTime = 0;
2876 0 : recoveryStopLSN = InvalidXLogRecPtr;
2877 0 : recoveryStopName[0] = '\0';
2878 0 : return true;
2879 : }
2880 :
2881 41974 : return false;
2882 : }
2883 :
2884 : /*
2885 : * Create a comment for the history file to explain why and where
2886 : * timeline changed.
2887 : */
2888 : static char *
2889 1562 : getRecoveryStopReason(void)
2890 : {
2891 : char reason[200];
2892 :
2893 1562 : if (recoveryTarget == RECOVERY_TARGET_XID)
2894 0 : snprintf(reason, sizeof(reason),
2895 : "%s transaction %u",
2896 0 : recoveryStopAfter ? "after" : "before",
2897 : recoveryStopXid);
2898 1562 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
2899 0 : snprintf(reason, sizeof(reason),
2900 : "%s %s\n",
2901 0 : recoveryStopAfter ? "after" : "before",
2902 : timestamptz_to_str(recoveryStopTime));
2903 1562 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
2904 14 : snprintf(reason, sizeof(reason),
2905 : "%s LSN %X/%X\n",
2906 14 : recoveryStopAfter ? "after" : "before",
2907 14 : LSN_FORMAT_ARGS(recoveryStopLSN));
2908 1548 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
2909 6 : snprintf(reason, sizeof(reason),
2910 : "at restore point \"%s\"",
2911 : recoveryStopName);
2912 1542 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
2913 0 : snprintf(reason, sizeof(reason), "reached consistency");
2914 : else
2915 1542 : snprintf(reason, sizeof(reason), "no recovery target specified");
2916 :
2917 1562 : return pstrdup(reason);
2918 : }
2919 :
2920 : /*
2921 : * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2922 : *
2923 : * endOfRecovery is true if the recovery target is reached and
2924 : * the paused state starts at the end of recovery because of
2925 : * recovery_target_action=pause, and false otherwise.
2926 : */
2927 : static void
2928 6 : recoveryPausesHere(bool endOfRecovery)
2929 : {
2930 : /* Don't pause unless users can connect! */
2931 6 : if (!LocalHotStandbyActive)
2932 0 : return;
2933 :
2934 : /* Don't pause after standby promotion has been triggered */
2935 6 : if (LocalPromoteIsTriggered)
2936 0 : return;
2937 :
2938 6 : if (endOfRecovery)
2939 2 : ereport(LOG,
2940 : (errmsg("pausing at the end of recovery"),
2941 : errhint("Execute pg_wal_replay_resume() to promote.")));
2942 : else
2943 4 : ereport(LOG,
2944 : (errmsg("recovery has paused"),
2945 : errhint("Execute pg_wal_replay_resume() to continue.")));
2946 :
2947 : /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2948 18 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
2949 : {
2950 16 : HandleStartupProcInterrupts();
2951 16 : if (CheckForStandbyTrigger())
2952 4 : return;
2953 :
2954 : /*
2955 : * If recovery pause is requested then set it paused. While we are in
2956 : * the loop, user might resume and pause again so set this every time.
2957 : */
2958 12 : ConfirmRecoveryPaused();
2959 :
2960 : /*
2961 : * We wait on a condition variable that will wake us as soon as the
2962 : * pause ends, but we use a timeout so we can check the above exit
2963 : * condition periodically too.
2964 : */
2965 12 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
2966 : WAIT_EVENT_RECOVERY_PAUSE);
2967 : }
2968 2 : ConditionVariableCancelSleep();
2969 : }
2970 :
2971 : /*
2972 : * When recovery_min_apply_delay is set, we wait long enough to make sure
2973 : * certain record types are applied at least that interval behind the primary.
2974 : *
2975 : * Returns true if we waited.
2976 : *
2977 : * Note that the delay is calculated between the WAL record log time and
2978 : * the current time on standby. We would prefer to keep track of when this
2979 : * standby received each WAL record, which would allow a more consistent
2980 : * approach and one not affected by time synchronisation issues, but that
2981 : * is significantly more effort and complexity for little actual gain in
2982 : * usability.
2983 : */
2984 : static bool
2985 5361632 : recoveryApplyDelay(XLogReaderState *record)
2986 : {
2987 : uint8 xact_info;
2988 : TimestampTz xtime;
2989 : TimestampTz delayUntil;
2990 : long msecs;
2991 :
2992 : /* nothing to do if no delay configured */
2993 5361632 : if (recovery_min_apply_delay <= 0)
2994 5361632 : return false;
2995 :
2996 : /* no delay is applied on a database not yet consistent */
2997 0 : if (!reachedConsistency)
2998 0 : return false;
2999 :
3000 : /* nothing to do if crash recovery is requested */
3001 0 : if (!ArchiveRecoveryRequested)
3002 0 : return false;
3003 :
3004 : /*
3005 : * Is it a COMMIT record?
3006 : *
3007 : * We deliberately choose not to delay aborts since they have no effect on
3008 : * MVCC. We already allow replay of records that don't have a timestamp,
3009 : * so there is already opportunity for issues caused by early conflicts on
3010 : * standbys.
3011 : */
3012 0 : if (XLogRecGetRmid(record) != RM_XACT_ID)
3013 0 : return false;
3014 :
3015 0 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3016 :
3017 0 : if (xact_info != XLOG_XACT_COMMIT &&
3018 : xact_info != XLOG_XACT_COMMIT_PREPARED)
3019 0 : return false;
3020 :
3021 0 : if (!getRecordTimestamp(record, &xtime))
3022 0 : return false;
3023 :
3024 0 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3025 :
3026 : /*
3027 : * Exit without arming the latch if it's already past time to apply this
3028 : * record
3029 : */
3030 0 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
3031 0 : if (msecs <= 0)
3032 0 : return false;
3033 :
3034 : while (true)
3035 : {
3036 0 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3037 :
3038 : /* This might change recovery_min_apply_delay. */
3039 0 : HandleStartupProcInterrupts();
3040 :
3041 0 : if (CheckForStandbyTrigger())
3042 0 : break;
3043 :
3044 : /*
3045 : * Recalculate delayUntil as recovery_min_apply_delay could have
3046 : * changed while waiting in this loop.
3047 : */
3048 0 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3049 :
3050 : /*
3051 : * Wait for difference between GetCurrentTimestamp() and delayUntil.
3052 : */
3053 0 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
3054 : delayUntil);
3055 :
3056 0 : if (msecs <= 0)
3057 0 : break;
3058 :
3059 0 : elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3060 :
3061 0 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3062 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
3063 : msecs,
3064 : WAIT_EVENT_RECOVERY_APPLY_DELAY);
3065 : }
3066 0 : return true;
3067 : }
3068 :
3069 : /*
3070 : * Get the current state of the recovery pause request.
3071 : */
3072 : RecoveryPauseState
3073 30 : GetRecoveryPauseState(void)
3074 : {
3075 : RecoveryPauseState state;
3076 :
3077 30 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3078 30 : state = XLogRecoveryCtl->recoveryPauseState;
3079 30 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3080 :
3081 30 : return state;
3082 : }
3083 :
3084 : /*
3085 : * Set the recovery pause state.
3086 : *
3087 : * If recovery pause is requested then sets the recovery pause state to
3088 : * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3089 : * to 'not paused' to resume the recovery. The recovery pause will be
3090 : * confirmed by the ConfirmRecoveryPaused.
3091 : */
3092 : void
3093 92 : SetRecoveryPause(bool recoveryPause)
3094 : {
3095 92 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3096 :
3097 92 : if (!recoveryPause)
3098 86 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
3099 6 : else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
3100 6 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
3101 :
3102 92 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3103 :
3104 92 : if (!recoveryPause)
3105 86 : ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
3106 92 : }
3107 :
3108 : /*
3109 : * Confirm the recovery pause by setting the recovery pause state to
3110 : * RECOVERY_PAUSED.
3111 : */
3112 : static void
3113 12 : ConfirmRecoveryPaused(void)
3114 : {
3115 : /* If recovery pause is requested then set it paused */
3116 12 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3117 12 : if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
3118 6 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
3119 12 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3120 12 : }
3121 :
3122 :
3123 : /*
3124 : * Attempt to read the next XLOG record.
3125 : *
3126 : * Before first call, the reader needs to be positioned to the first record
3127 : * by calling XLogPrefetcherBeginRead().
3128 : *
3129 : * If no valid record is available, returns NULL, or fails if emode is PANIC.
3130 : * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3131 : * record is available.
3132 : */
3133 : static XLogRecord *
3134 5365396 : ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
3135 : bool fetching_ckpt, TimeLineID replayTLI)
3136 : {
3137 : XLogRecord *record;
3138 5365396 : XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher);
3139 5365396 : XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3140 :
3141 : /* Pass through parameters to XLogPageRead */
3142 5365396 : private->fetching_ckpt = fetching_ckpt;
3143 5365396 : private->emode = emode;
3144 5365396 : private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3145 5365396 : private->replayTLI = replayTLI;
3146 :
3147 : /* This is the first attempt to read this page. */
3148 5365396 : lastSourceFailed = false;
3149 :
3150 : for (;;)
3151 192 : {
3152 : char *errormsg;
3153 :
3154 5365588 : record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3155 5365488 : if (record == NULL)
3156 : {
3157 : /*
3158 : * When we find that WAL ends in an incomplete record, keep track
3159 : * of that record. After recovery is done, we'll write a record
3160 : * to indicate to downstream WAL readers that that portion is to
3161 : * be ignored.
3162 : *
3163 : * However, when ArchiveRecoveryRequested = true, we're going to
3164 : * switch to a new timeline at the end of recovery. We will only
3165 : * copy WAL over to the new timeline up to the end of the last
3166 : * complete record, so if we did this, we would later create an
3167 : * overwrite contrecord in the wrong place, breaking everything.
3168 : */
3169 486 : if (!ArchiveRecoveryRequested &&
3170 208 : !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
3171 : {
3172 22 : abortedRecPtr = xlogreader->abortedRecPtr;
3173 22 : missingContrecPtr = xlogreader->missingContrecPtr;
3174 : }
3175 :
3176 486 : if (readFile >= 0)
3177 : {
3178 444 : close(readFile);
3179 444 : readFile = -1;
3180 : }
3181 :
3182 : /*
3183 : * We only end up here without a message when XLogPageRead()
3184 : * failed - in that case we already logged something. In
3185 : * StandbyMode that only happens if we have been triggered, so we
3186 : * shouldn't loop anymore in that case.
3187 : */
3188 486 : if (errormsg)
3189 444 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3190 : (errmsg_internal("%s", errormsg) /* already translated */ ));
3191 : }
3192 :
3193 : /*
3194 : * Check page TLI is one of the expected values.
3195 : */
3196 5365002 : else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3197 : {
3198 : char fname[MAXFNAMELEN];
3199 : XLogSegNo segno;
3200 : int32 offset;
3201 :
3202 0 : XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
3203 0 : offset = XLogSegmentOffset(xlogreader->latestPagePtr,
3204 : wal_segment_size);
3205 0 : XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3206 : wal_segment_size);
3207 0 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3208 : (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
3209 : xlogreader->latestPageTLI,
3210 : fname,
3211 : LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
3212 : offset)));
3213 0 : record = NULL;
3214 : }
3215 :
3216 5365488 : if (record)
3217 : {
3218 : /* Great, got a record */
3219 5365296 : return record;
3220 : }
3221 : else
3222 : {
3223 : /* No valid record available from this source */
3224 486 : lastSourceFailed = true;
3225 :
3226 : /*
3227 : * If archive recovery was requested, but we were still doing
3228 : * crash recovery, switch to archive recovery and retry using the
3229 : * offline archive. We have now replayed all the valid WAL in
3230 : * pg_wal, so we are presumably now consistent.
3231 : *
3232 : * We require that there's at least some valid WAL present in
3233 : * pg_wal, however (!fetching_ckpt). We could recover using the
3234 : * WAL from the archive, even if pg_wal is completely empty, but
3235 : * we'd have no idea how far we'd have to replay to reach
3236 : * consistency. So err on the safe side and give up.
3237 : */
3238 486 : if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3239 4 : !fetching_ckpt)
3240 : {
3241 4 : ereport(DEBUG1,
3242 : (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3243 4 : InArchiveRecovery = true;
3244 4 : if (StandbyModeRequested)
3245 4 : EnableStandbyMode();
3246 :
3247 4 : SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
3248 4 : minRecoveryPoint = xlogreader->EndRecPtr;
3249 4 : minRecoveryPointTLI = replayTLI;
3250 :
3251 4 : CheckRecoveryConsistency();
3252 :
3253 : /*
3254 : * Before we retry, reset lastSourceFailed and currentSource
3255 : * so that we will check the archive next.
3256 : */
3257 4 : lastSourceFailed = false;
3258 4 : currentSource = XLOG_FROM_ANY;
3259 :
3260 192 : continue;
3261 : }
3262 :
3263 : /* In standby mode, loop back to retry. Otherwise, give up. */
3264 482 : if (StandbyMode && !CheckForStandbyTrigger())
3265 188 : continue;
3266 : else
3267 294 : return NULL;
3268 : }
3269 : }
3270 : }
3271 :
3272 : /*
3273 : * Read the XLOG page containing targetPagePtr into readBuf (if not read
3274 : * already). Returns number of bytes read, if the page is read successfully,
3275 : * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3276 : * but only if they have not been previously reported.
3277 : *
3278 : * See XLogReaderRoutine.page_read for more details.
3279 : *
3280 : * While prefetching, xlogreader->nonblocking may be set. In that case,
3281 : * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3282 : *
3283 : * This is responsible for restoring files from archive as needed, as well
3284 : * as for waiting for the requested WAL record to arrive in standby mode.
3285 : *
3286 : * xlogreader->private_data->emode specifies the log level used for reporting
3287 : * "file not found" or "end of WAL" situations in archive recovery, or in
3288 : * standby mode when promotion is triggered. If set to WARNING or below,
3289 : * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3290 : * levels the ereport() won't return.
3291 : *
3292 : * In standby mode, if after a successful return of XLogPageRead() the
3293 : * caller finds the record it's interested in to be broken, it should
3294 : * ereport the error with the level determined by
3295 : * emode_for_corrupt_record(), and then set lastSourceFailed
3296 : * and call XLogPageRead() again with the same arguments. This lets
3297 : * XLogPageRead() try fetching the record from another source, or
3298 : * sleep and retry.
3299 : */
3300 : static int
3301 2750064 : XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
3302 : XLogRecPtr targetRecPtr, char *readBuf)
3303 : {
3304 2750064 : XLogPageReadPrivate *private =
3305 : (XLogPageReadPrivate *) xlogreader->private_data;
3306 2750064 : int emode = private->emode;
3307 : uint32 targetPageOff;
3308 : XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3309 : int r;
3310 : instr_time io_start;
3311 :
3312 2750064 : XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3313 2750064 : targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3314 :
3315 : /*
3316 : * See if we need to switch to a new segment because the requested record
3317 : * is not in the currently open one.
3318 : */
3319 2750064 : if (readFile >= 0 &&
3320 2746762 : !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3321 : {
3322 : /*
3323 : * Request a restartpoint if we've replayed too much xlog since the
3324 : * last one.
3325 : */
3326 2492 : if (ArchiveRecoveryRequested && IsUnderPostmaster)
3327 : {
3328 2462 : if (XLogCheckpointNeeded(readSegNo))
3329 : {
3330 2256 : (void) GetRedoRecPtr();
3331 2256 : if (XLogCheckpointNeeded(readSegNo))
3332 2242 : RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
3333 : }
3334 : }
3335 :
3336 2492 : close(readFile);
3337 2492 : readFile = -1;
3338 2492 : readSource = XLOG_FROM_ANY;
3339 : }
3340 :
3341 2750064 : XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3342 :
3343 2750068 : retry:
3344 : /* See if we need to retrieve more data */
3345 2750068 : if (readFile < 0 ||
3346 2744270 : (readSource == XLOG_FROM_STREAM &&
3347 2720682 : flushedUpto < targetPagePtr + reqLen))
3348 : {
3349 23082 : if (readFile >= 0 &&
3350 17284 : xlogreader->nonblocking &&
3351 8506 : readSource == XLOG_FROM_STREAM &&
3352 8506 : flushedUpto < targetPagePtr + reqLen)
3353 8506 : return XLREAD_WOULDBLOCK;
3354 :
3355 14476 : switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3356 14576 : private->randAccess,
3357 14576 : private->fetching_ckpt,
3358 : targetRecPtr,
3359 : private->replayTLI,
3360 : xlogreader->EndRecPtr,
3361 14576 : xlogreader->nonblocking))
3362 : {
3363 1112 : case XLREAD_WOULDBLOCK:
3364 1112 : return XLREAD_WOULDBLOCK;
3365 80 : case XLREAD_FAIL:
3366 80 : if (readFile >= 0)
3367 0 : close(readFile);
3368 80 : readFile = -1;
3369 80 : readLen = 0;
3370 80 : readSource = XLOG_FROM_ANY;
3371 80 : return XLREAD_FAIL;
3372 13284 : case XLREAD_SUCCESS:
3373 13284 : break;
3374 : }
3375 2726986 : }
3376 :
3377 : /*
3378 : * At this point, we have the right segment open and if we're streaming we
3379 : * know the requested record is in it.
3380 : */
3381 : Assert(readFile != -1);
3382 :
3383 : /*
3384 : * If the current segment is being streamed from the primary, calculate
3385 : * how much of the current page we have received already. We know the
3386 : * requested record has been received, but this is for the benefit of
3387 : * future calls, to allow quick exit at the top of this function.
3388 : */
3389 2740270 : if (readSource == XLOG_FROM_STREAM)
3390 : {
3391 2714478 : if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3392 2707272 : readLen = XLOG_BLCKSZ;
3393 : else
3394 7206 : readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
3395 : targetPageOff;
3396 : }
3397 : else
3398 25792 : readLen = XLOG_BLCKSZ;
3399 :
3400 : /* Read the requested page */
3401 2740270 : readOff = targetPageOff;
3402 :
3403 : /* Measure I/O timing when reading segment */
3404 2740270 : io_start = pgstat_prepare_io_time(track_io_timing);
3405 :
3406 2740270 : pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3407 2740270 : r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3408 2740270 : if (r != XLOG_BLCKSZ)
3409 : {
3410 : char fname[MAXFNAMELEN];
3411 0 : int save_errno = errno;
3412 :
3413 0 : pgstat_report_wait_end();
3414 :
3415 0 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3416 : io_start, 1, r);
3417 :
3418 0 : XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
3419 0 : if (r < 0)
3420 : {
3421 0 : errno = save_errno;
3422 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3423 : (errcode_for_file_access(),
3424 : errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
3425 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3426 : readOff)));
3427 : }
3428 : else
3429 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3430 : (errcode(ERRCODE_DATA_CORRUPTED),
3431 : errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
3432 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3433 : readOff, r, (Size) XLOG_BLCKSZ)));
3434 0 : goto next_record_is_invalid;
3435 : }
3436 2740270 : pgstat_report_wait_end();
3437 :
3438 2740270 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3439 : io_start, 1, r);
3440 :
3441 : Assert(targetSegNo == readSegNo);
3442 : Assert(targetPageOff == readOff);
3443 : Assert(reqLen <= readLen);
3444 :
3445 2740270 : xlogreader->seg.ws_tli = curFileTLI;
3446 :
3447 : /*
3448 : * Check the page header immediately, so that we can retry immediately if
3449 : * it's not valid. This may seem unnecessary, because ReadPageInternal()
3450 : * validates the page header anyway, and would propagate the failure up to
3451 : * ReadRecord(), which would retry. However, there's a corner case with
3452 : * continuation records, if a record is split across two pages such that
3453 : * we would need to read the two pages from different sources across two
3454 : * WAL segments.
3455 : *
3456 : * The first page is only available locally, in pg_wal, because it's
3457 : * already been recycled on the primary. The second page, however, is not
3458 : * present in pg_wal, and we should stream it from the primary. There is a
3459 : * recycled WAL segment present in pg_wal, with garbage contents, however.
3460 : * We would read the first page from the local WAL segment, but when
3461 : * reading the second page, we would read the bogus, recycled, WAL
3462 : * segment. If we didn't catch that case here, we would never recover,
3463 : * because ReadRecord() would retry reading the whole record from the
3464 : * beginning.
3465 : *
3466 : * Of course, this only catches errors in the page header, which is what
3467 : * happens in the case of a recycled WAL segment. Other kinds of errors or
3468 : * corruption still have the same problem. But this at least fixes the
3469 : * common case, which can happen as part of normal operation.
3470 : *
3471 : * Validating the page header is cheap enough that doing it twice
3472 : * shouldn't be a big deal from a performance point of view.
3473 : *
3474 : * When not in standby mode, an invalid page header should cause recovery
3475 : * to end, not retry reading the page, so we don't need to validate the
3476 : * page header here for the retry. Instead, ReadPageInternal() is
3477 : * responsible for the validation.
3478 : */
3479 2740270 : if (StandbyMode &&
3480 2720964 : (targetPagePtr % wal_segment_size) == 0 &&
3481 1926 : !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3482 : {
3483 : /*
3484 : * Emit this error right now then retry this page immediately. Use
3485 : * errmsg_internal() because the message was already translated.
3486 : */
3487 6 : if (xlogreader->errormsg_buf[0])
3488 6 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3489 : (errmsg_internal("%s", xlogreader->errormsg_buf)));
3490 :
3491 : /* reset any error XLogReaderValidatePageHeader() might have set */
3492 6 : XLogReaderResetError(xlogreader);
3493 6 : goto next_record_is_invalid;
3494 : }
3495 :
3496 2740264 : return readLen;
3497 :
3498 6 : next_record_is_invalid:
3499 :
3500 : /*
3501 : * If we're reading ahead, give up fast. Retries and error reporting will
3502 : * be handled by a later read when recovery catches up to this point.
3503 : */
3504 6 : if (xlogreader->nonblocking)
3505 2 : return XLREAD_WOULDBLOCK;
3506 :
3507 4 : lastSourceFailed = true;
3508 :
3509 4 : if (readFile >= 0)
3510 4 : close(readFile);
3511 4 : readFile = -1;
3512 4 : readLen = 0;
3513 4 : readSource = XLOG_FROM_ANY;
3514 :
3515 : /* In standby-mode, keep trying */
3516 4 : if (StandbyMode)
3517 4 : goto retry;
3518 : else
3519 0 : return XLREAD_FAIL;
3520 : }
3521 :
/*
 * Open the WAL segment containing WAL location 'RecPtr'.
 *
 * The segment can be fetched via restore_command, or via walreceiver having
 * streamed the record, or it can already be present in pg_wal.  Checking
 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
 * too, in case someone copies a new segment directly to pg_wal.  That is not
 * documented or recommended, though.
 *
 * If 'randAccess' is true, curFileTLI is reset to 0 before the segment is
 * looked up in the archive or pg_wal.
 *
 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
 * prepare to read WAL starting from RedoStartLSN after this.
 *
 * 'RecPtr' might not point to the beginning of the record we're interested
 * in, it might also point to the page or segment header.  In that case,
 * 'tliRecPtr' is the position of the WAL record we're interested in.  It is
 * used to decide which timeline to stream the requested WAL from.
 *
 * 'replayTLI' and 'replayLSN' describe the current replay position.  They
 * are passed on to rescanLatestTimeLine() so that, if we scan for new
 * timelines, we can reject a switch to a timeline that branched off before
 * this point.
 *
 * If the record is not immediately available, the function returns
 * XLREAD_FAIL if we're not in standby mode.  In standby mode, waits for it
 * to become available.
 *
 * When the requested record becomes available, the function opens the file
 * containing it (if not open already), and returns XLREAD_SUCCESS.  When end
 * of standby mode is triggered by the user, and there is no more WAL
 * available, returns XLREAD_FAIL.
 *
 * If nonblocking is true, then give up immediately if we can't satisfy the
 * request, returning XLREAD_WOULDBLOCK instead of waiting.
 */
static XLogPageReadResult
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
							bool fetching_ckpt, XLogRecPtr tliRecPtr,
							TimeLineID replayTLI, XLogRecPtr replayLSN,
							bool nonblocking)
{
	/*
	 * Timestamp recorded the last time streaming failed and we fell back to
	 * the archive.  Being static, it persists across calls.
	 */
	static TimestampTz last_fail_time = 0;
	TimestampTz now;

	/* Ensures WalRcvForceReply() is called at most once per invocation. */
	bool		streaming_reply_sent = false;

	/*-------
	 * Standby mode is implemented by a state machine:
	 *
	 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
	 *    pg_wal (XLOG_FROM_PG_WAL)
	 * 2. Check for promotion trigger request
	 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
	 * 4. Rescan timelines
	 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
	 *
	 * Failure to read from the current source advances the state machine to
	 * the next state.
	 *
	 * 'currentSource' indicates the current state.  There are no
	 * currentSource values for "check trigger", "rescan timelines", and
	 * "sleep" states, those actions are taken when reading from the previous
	 * source fails, as part of advancing to the next state.
	 *
	 * If standby mode is turned off while reading WAL from stream, we move
	 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
	 * the files (which would be required at end of recovery, e.g., timeline
	 * history file) from archive or pg_wal.  We don't need to kill WAL
	 * receiver here because it's already stopped when standby mode is turned
	 * off at the end of recovery.
	 *-------
	 */
	if (!InArchiveRecovery)
		currentSource = XLOG_FROM_PG_WAL;
	else if (currentSource == XLOG_FROM_ANY ||
			 (!StandbyMode && currentSource == XLOG_FROM_STREAM))
	{
		lastSourceFailed = false;
		currentSource = XLOG_FROM_ARCHIVE;
	}

	for (;;)
	{
		/* Remembered only to report a source switch in the DEBUG2 message. */
		XLogSource	oldSource = currentSource;
		bool		startWalReceiver = false;

		/*
		 * First check if we failed to read from the current source, and
		 * advance the state machine if so.  The failure to read might've
		 * happened outside this function, e.g. when a CRC check fails on a
		 * record, or within this loop.
		 */
		if (lastSourceFailed)
		{
			/*
			 * Don't allow any retry loops to occur during nonblocking
			 * readahead.  Let the caller process everything that has been
			 * decoded already first.
			 */
			if (nonblocking)
				return XLREAD_WOULDBLOCK;

			switch (currentSource)
			{
				case XLOG_FROM_ARCHIVE:
				case XLOG_FROM_PG_WAL:

					/*
					 * Check to see if promotion is requested.  Note that we
					 * do this only after failure, so when you promote, we
					 * still finish replaying as much as we can from archive
					 * and pg_wal before failover.
					 */
					if (StandbyMode && CheckForStandbyTrigger())
					{
						XLogShutdownWalRcv();
						return XLREAD_FAIL;
					}

					/*
					 * Not in standby mode, and we've now tried the archive
					 * and pg_wal.
					 */
					if (!StandbyMode)
						return XLREAD_FAIL;

					/*
					 * Move to XLOG_FROM_STREAM state, and set to start a
					 * walreceiver if necessary.
					 */
					currentSource = XLOG_FROM_STREAM;
					startWalReceiver = true;
					break;

				case XLOG_FROM_STREAM:

					/*
					 * Failure while streaming.  Most likely, we got here
					 * because streaming replication was terminated, or
					 * promotion was triggered.  But we also get here if we
					 * find an invalid record in the WAL streamed from the
					 * primary, in which case something is seriously wrong.
					 * There's little chance that the problem will just go
					 * away, but PANIC is not good for availability either,
					 * especially in hot standby mode.  So, we treat that the
					 * same as disconnection, and retry from archive/pg_wal
					 * again.  The WAL in the archive should be identical to
					 * what was streamed, so it's unlikely that it helps, but
					 * one can hope...
					 */

					/*
					 * We should be able to move to XLOG_FROM_STREAM only in
					 * standby mode.
					 */
					Assert(StandbyMode);

					/*
					 * Before we leave XLOG_FROM_STREAM state, make sure that
					 * walreceiver is not active, so that it won't overwrite
					 * WAL that we restore from archive.
					 */
					XLogShutdownWalRcv();

					/*
					 * Before we sleep, re-scan for possible new timelines if
					 * we were requested to recover to the latest timeline.
					 * If the target timeline changed, skip the sleep and go
					 * straight back to the archive.
					 */
					if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
					{
						if (rescanLatestTimeLine(replayTLI, replayLSN))
						{
							currentSource = XLOG_FROM_ARCHIVE;
							break;
						}
					}

					/*
					 * XLOG_FROM_STREAM is the last state in our state
					 * machine, so we've exhausted all the options for
					 * obtaining the requested WAL.  We're going to loop back
					 * and retry from the archive, but if it hasn't been long
					 * since last attempt, sleep wal_retrieve_retry_interval
					 * milliseconds to avoid busy-waiting.
					 */
					now = GetCurrentTimestamp();
					if (!TimestampDifferenceExceeds(last_fail_time, now,
													wal_retrieve_retry_interval))
					{
						long		wait_time;

						/* Sleep only for the remainder of the interval. */
						wait_time = wal_retrieve_retry_interval -
							TimestampDifferenceMilliseconds(last_fail_time, now);

						elog(LOG, "waiting for WAL to become available at %X/%X",
							 LSN_FORMAT_ARGS(RecPtr));

						/* Do background tasks that might benefit us later. */
						KnownAssignedTransactionIdsIdleMaintenance();

						(void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
										 WL_LATCH_SET | WL_TIMEOUT |
										 WL_EXIT_ON_PM_DEATH,
										 wait_time,
										 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
						ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
						now = GetCurrentTimestamp();

						/* Handle interrupt signals of startup process */
						HandleStartupProcInterrupts();
					}
					last_fail_time = now;
					currentSource = XLOG_FROM_ARCHIVE;
					break;

				default:
					elog(ERROR, "unexpected WAL source %d", currentSource);
			}
		}
		else if (currentSource == XLOG_FROM_PG_WAL)
		{
			/*
			 * We just successfully read a file in pg_wal.  We prefer files
			 * in the archive over ones in pg_wal, so try the next file again
			 * from the archive first.
			 */
			if (InArchiveRecovery)
				currentSource = XLOG_FROM_ARCHIVE;
		}

		if (currentSource != oldSource)
			elog(DEBUG2, "switched WAL source from %s to %s after %s",
				 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
				 lastSourceFailed ? "failure" : "success");

		/*
		 * We've now handled possible failure.  Try to read from the chosen
		 * source.
		 */
		lastSourceFailed = false;

		switch (currentSource)
		{
			case XLOG_FROM_ARCHIVE:
			case XLOG_FROM_PG_WAL:

				/*
				 * WAL receiver must not be running when reading WAL from
				 * archive or pg_wal.
				 */
				Assert(!WalRcvStreaming());

				/* Close any old file we might have open. */
				if (readFile >= 0)
				{
					close(readFile);
					readFile = -1;
				}
				/* Reset curFileTLI if random fetch. */
				if (randAccess)
					curFileTLI = 0;

				/*
				 * Try to restore the file from archive, or read an existing
				 * file from pg_wal.  In the XLOG_FROM_ARCHIVE state both
				 * locations are tried, hence XLOG_FROM_ANY is passed.
				 */
				readFile = XLogFileReadAnyTLI(readSegNo,
											  currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
											  currentSource);
				if (readFile >= 0)
					return XLREAD_SUCCESS;	/* success! */

				/*
				 * Nope, not found in archive or pg_wal.
				 */
				lastSourceFailed = true;
				break;

			case XLOG_FROM_STREAM:
				{
					bool		havedata;

					/*
					 * We should be able to move to XLOG_FROM_STREAM only in
					 * standby mode.
					 */
					Assert(StandbyMode);

					/*
					 * First, shutdown walreceiver if its restart has been
					 * requested -- but no point if we're already slated for
					 * starting it.
					 */
					if (pendingWalRcvRestart && !startWalReceiver)
					{
						XLogShutdownWalRcv();

						/*
						 * Re-scan for possible new timelines if we were
						 * requested to recover to the latest timeline.
						 */
						if (recoveryTargetTimeLineGoal ==
							RECOVERY_TARGET_TIMELINE_LATEST)
							rescanLatestTimeLine(replayTLI, replayLSN);

						startWalReceiver = true;
					}
					pendingWalRcvRestart = false;

					/*
					 * Launch walreceiver if needed.  Nothing is launched when
					 * PrimaryConnInfo is unset or empty.
					 *
					 * If fetching_ckpt is true, RecPtr points to the initial
					 * checkpoint location.  In that case, we use RedoStartLSN
					 * as the streaming start position instead of RecPtr, so
					 * that when we later jump backwards to start redo at
					 * RedoStartLSN, we will have the logs streamed already.
					 */
					if (startWalReceiver &&
						PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
					{
						XLogRecPtr	ptr;
						TimeLineID	tli;

						if (fetching_ckpt)
						{
							ptr = RedoStartLSN;
							tli = RedoStartTLI;
						}
						else
						{
							ptr = RecPtr;

							/*
							 * Use the record begin position to determine the
							 * TLI, rather than the position we're reading.
							 */
							tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);

							if (curFileTLI > 0 && tli < curFileTLI)
								elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
									 LSN_FORMAT_ARGS(tliRecPtr),
									 tli, curFileTLI);
						}
						curFileTLI = tli;
						SetInstallXLogFileSegmentActive();
						RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
											 PrimarySlotName,
											 wal_receiver_create_temp_slot);

						/*
						 * Forget the cached flush position, so that the check
						 * below re-reads it from walreceiver.
						 */
						flushedUpto = 0;
					}

					/*
					 * Check if WAL receiver is active or wait to start up.
					 */
					if (!WalRcvStreaming())
					{
						lastSourceFailed = true;
						break;
					}

					/*
					 * Walreceiver is active, so see if new data has arrived.
					 *
					 * We only advance XLogReceiptTime when we obtain fresh
					 * WAL from walreceiver and observe that we had already
					 * processed everything before the most recent "chunk"
					 * that it flushed to disk.  In steady state where we are
					 * keeping up with the incoming data, XLogReceiptTime will
					 * be updated on each cycle.  When we are behind,
					 * XLogReceiptTime will not advance, so the grace time
					 * allotted to conflicting queries will decrease.
					 */
					if (RecPtr < flushedUpto)
						havedata = true;
					else
					{
						XLogRecPtr	latestChunkStart;

						flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
						if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
						{
							havedata = true;
							if (latestChunkStart <= RecPtr)
							{
								XLogReceiptTime = GetCurrentTimestamp();
								SetCurrentChunkStartTime(XLogReceiptTime);
							}
						}
						else
							havedata = false;
					}
					if (havedata)
					{
						/*
						 * Great, streamed far enough.  Open the file if it's
						 * not open already.  Also read the timeline history
						 * file if we haven't initialized timeline history
						 * yet; it should be streamed over and present in
						 * pg_wal by now.  Use XLOG_FROM_STREAM so that source
						 * info is set correctly and XLogReceiptTime isn't
						 * changed.
						 *
						 * NB: We must set readTimeLineHistory based on
						 * recoveryTargetTLI, not receiveTLI.  Normally
						 * they'll be the same, but if recovery_target_timeline
						 * is 'latest' and archiving is configured, then it's
						 * possible that we managed to retrieve one or more
						 * new timeline history files from the archive,
						 * updating recoveryTargetTLI.
						 *
						 * Note that after opening the file we do not return
						 * here; we break out of the switch and go around the
						 * loop once more.
						 */
						if (readFile < 0)
						{
							if (!expectedTLEs)
								expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
							readFile = XLogFileRead(readSegNo, receiveTLI,
													XLOG_FROM_STREAM, false);
							Assert(readFile >= 0);
						}
						else
						{
							/* just make sure source info is correct... */
							readSource = XLOG_FROM_STREAM;
							XLogReceiptSource = XLOG_FROM_STREAM;
							return XLREAD_SUCCESS;
						}
						break;
					}

					/* In nonblocking mode, return rather than sleeping. */
					if (nonblocking)
						return XLREAD_WOULDBLOCK;

					/*
					 * Data not here yet.  Check for trigger, then wait for
					 * walreceiver to wake us up when new WAL arrives.
					 */
					if (CheckForStandbyTrigger())
					{
						/*
						 * Note that we don't return XLREAD_FAIL immediately
						 * here.  After being triggered, we still want to
						 * replay all the WAL that was already streamed.  It's
						 * in pg_wal now, so we just treat this as a failure,
						 * and the state machine will move on to replay the
						 * streamed WAL from pg_wal, and then recheck the
						 * trigger and exit replay.
						 */
						lastSourceFailed = true;
						break;
					}

					/*
					 * Since we have replayed everything we have received so
					 * far and are about to start waiting for more WAL, let's
					 * tell the upstream server our replay location now so
					 * that pg_stat_replication doesn't show stale
					 * information.
					 */
					if (!streaming_reply_sent)
					{
						WalRcvForceReply();
						streaming_reply_sent = true;
					}

					/* Do any background tasks that might benefit us later. */
					KnownAssignedTransactionIdsIdleMaintenance();

					/* Update pg_stat_recovery_prefetch before sleeping. */
					XLogPrefetcherComputeStats(xlogprefetcher);

					/*
					 * Wait for more WAL to arrive, when we will be woken
					 * immediately by the WAL receiver.  No timeout is used
					 * here (-1).
					 */
					(void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
									 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
									 -1L,
									 WAIT_EVENT_RECOVERY_WAL_STREAM);
					ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
					break;
				}

			default:
				elog(ERROR, "unexpected WAL source %d", currentSource);
		}

		/*
		 * Check for recovery pause here so that we can confirm more quickly
		 * that a requested pause has actually taken effect.
		 */
		if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
			RECOVERY_NOT_PAUSED)
			recoveryPausesHere(false);

		/*
		 * This possibly-long loop needs to handle interrupts of startup
		 * process.
		 */
		HandleStartupProcInterrupts();
	}

	return XLREAD_FAIL;			/* not reached */
}
4023 :
4024 :
4025 : /*
4026 : * Determine what log level should be used to report a corrupt WAL record
4027 : * in the current WAL page, previously read by XLogPageRead().
4028 : *
4029 : * 'emode' is the error mode that would be used to report a file-not-found
4030 : * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4031 : * we're retrying the exact same record that we've tried previously, only
4032 : * complain the first time to keep the noise down. However, we only do when
4033 : * reading from pg_wal, because we don't expect any invalid records in archive
4034 : * or in records streamed from the primary. Files in the archive should be complete,
4035 : * and we should never hit the end of WAL because we stop and wait for more WAL
4036 : * to arrive before replaying it.
4037 : *
4038 : * NOTE: This function remembers the RecPtr value it was last called with,
4039 : * to suppress repeated messages about the same record. Only call this when
4040 : * you are about to ereport(), or you might cause a later message to be
4041 : * erroneously suppressed.
4042 : */
4043 : static int
4044 450 : emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
4045 : {
4046 : static XLogRecPtr lastComplaint = 0;
4047 :
4048 450 : if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4049 : {
4050 444 : if (RecPtr == lastComplaint)
4051 80 : emode = DEBUG1;
4052 : else
4053 364 : lastComplaint = RecPtr;
4054 : }
4055 450 : return emode;
4056 : }
4057 :
4058 :
4059 : /*
4060 : * Subroutine to try to fetch and validate a prior checkpoint record.
4061 : */
4062 : static XLogRecord *
4063 1668 : ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
4064 : TimeLineID replayTLI)
4065 : {
4066 : XLogRecord *record;
4067 : uint8 info;
4068 :
4069 : Assert(xlogreader != NULL);
4070 :
4071 1668 : if (!XRecOffIsValid(RecPtr))
4072 : {
4073 0 : ereport(LOG,
4074 : (errmsg("invalid checkpoint location")));
4075 0 : return NULL;
4076 : }
4077 :
4078 1668 : XLogPrefetcherBeginRead(xlogprefetcher, RecPtr);
4079 1668 : record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4080 :
4081 1668 : if (record == NULL)
4082 : {
4083 0 : ereport(LOG,
4084 : (errmsg("invalid checkpoint record")));
4085 0 : return NULL;
4086 : }
4087 1668 : if (record->xl_rmid != RM_XLOG_ID)
4088 : {
4089 0 : ereport(LOG,
4090 : (errmsg("invalid resource manager ID in checkpoint record")));
4091 0 : return NULL;
4092 : }
4093 1668 : info = record->xl_info & ~XLR_INFO_MASK;
4094 1668 : if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4095 : info != XLOG_CHECKPOINT_ONLINE)
4096 : {
4097 0 : ereport(LOG,
4098 : (errmsg("invalid xl_info in checkpoint record")));
4099 0 : return NULL;
4100 : }
4101 1668 : if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
4102 : {
4103 0 : ereport(LOG,
4104 : (errmsg("invalid length of checkpoint record")));
4105 0 : return NULL;
4106 : }
4107 1668 : return record;
4108 : }
4109 :
4110 : /*
4111 : * Scan for new timelines that might have appeared in the archive since we
4112 : * started recovery.
4113 : *
4114 : * If there are any, the function changes recovery target TLI to the latest
4115 : * one and returns 'true'.
4116 : */
4117 : static bool
4118 262 : rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
4119 : {
4120 : List *newExpectedTLEs;
4121 : bool found;
4122 : ListCell *cell;
4123 : TimeLineID newtarget;
4124 262 : TimeLineID oldtarget = recoveryTargetTLI;
4125 262 : TimeLineHistoryEntry *currentTle = NULL;
4126 :
4127 262 : newtarget = findNewestTimeLine(recoveryTargetTLI);
4128 262 : if (newtarget == recoveryTargetTLI)
4129 : {
4130 : /* No new timelines found */
4131 250 : return false;
4132 : }
4133 :
4134 : /*
4135 : * Determine the list of expected TLIs for the new TLI
4136 : */
4137 :
4138 12 : newExpectedTLEs = readTimeLineHistory(newtarget);
4139 :
4140 : /*
4141 : * If the current timeline is not part of the history of the new timeline,
4142 : * we cannot proceed to it.
4143 : */
4144 12 : found = false;
4145 24 : foreach(cell, newExpectedTLEs)
4146 : {
4147 24 : currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4148 :
4149 24 : if (currentTle->tli == recoveryTargetTLI)
4150 : {
4151 12 : found = true;
4152 12 : break;
4153 : }
4154 : }
4155 12 : if (!found)
4156 : {
4157 0 : ereport(LOG,
4158 : (errmsg("new timeline %u is not a child of database system timeline %u",
4159 : newtarget,
4160 : replayTLI)));
4161 0 : return false;
4162 : }
4163 :
4164 : /*
4165 : * The current timeline was found in the history file, but check that the
4166 : * next timeline was forked off from it *after* the current recovery
4167 : * location.
4168 : */
4169 12 : if (currentTle->end < replayLSN)
4170 : {
4171 0 : ereport(LOG,
4172 : (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4173 : newtarget,
4174 : replayTLI,
4175 : LSN_FORMAT_ARGS(replayLSN))));
4176 0 : return false;
4177 : }
4178 :
4179 : /* The new timeline history seems valid. Switch target */
4180 12 : recoveryTargetTLI = newtarget;
4181 12 : list_free_deep(expectedTLEs);
4182 12 : expectedTLEs = newExpectedTLEs;
4183 :
4184 : /*
4185 : * As in StartupXLOG(), try to ensure we have all the history files
4186 : * between the old target and new target in pg_wal.
4187 : */
4188 12 : restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4189 :
4190 12 : ereport(LOG,
4191 : (errmsg("new target timeline is %u",
4192 : recoveryTargetTLI)));
4193 :
4194 12 : return true;
4195 : }
4196 :
4197 :
4198 : /*
4199 : * Open a logfile segment for reading (during recovery).
4200 : *
4201 : * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4202 : * Otherwise, it's assumed to be already available in pg_wal.
4203 : */
4204 : static int
4205 5824 : XLogFileRead(XLogSegNo segno, TimeLineID tli,
4206 : XLogSource source, bool notfoundOk)
4207 : {
4208 : char xlogfname[MAXFNAMELEN];
4209 : char activitymsg[MAXFNAMELEN + 16];
4210 : char path[MAXPGPATH];
4211 : int fd;
4212 :
4213 5824 : XLogFileName(xlogfname, tli, segno, wal_segment_size);
4214 :
4215 5824 : switch (source)
4216 : {
4217 870 : case XLOG_FROM_ARCHIVE:
4218 : /* Report recovery progress in PS display */
4219 870 : snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4220 : xlogfname);
4221 870 : set_ps_display(activitymsg);
4222 :
4223 870 : if (!RestoreArchivedFile(path, xlogfname,
4224 : "RECOVERYXLOG",
4225 : wal_segment_size,
4226 : InRedo))
4227 794 : return -1;
4228 76 : break;
4229 :
4230 4954 : case XLOG_FROM_PG_WAL:
4231 : case XLOG_FROM_STREAM:
4232 4954 : XLogFilePath(path, tli, segno, wal_segment_size);
4233 4954 : break;
4234 :
4235 0 : default:
4236 0 : elog(ERROR, "invalid XLogFileRead source %d", source);
4237 : }
4238 :
4239 : /*
4240 : * If the segment was fetched from archival storage, replace the existing
4241 : * xlog segment (if any) with the archival version.
4242 : */
4243 5030 : if (source == XLOG_FROM_ARCHIVE)
4244 : {
4245 : Assert(!IsInstallXLogFileSegmentActive());
4246 76 : KeepFileRestoredFromArchive(path, xlogfname);
4247 :
4248 : /*
4249 : * Set path to point at the new file in pg_wal.
4250 : */
4251 76 : snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4252 : }
4253 :
4254 5030 : fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4255 5030 : if (fd >= 0)
4256 : {
4257 : /* Success! */
4258 4686 : curFileTLI = tli;
4259 :
4260 : /* Report recovery progress in PS display */
4261 4686 : snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4262 : xlogfname);
4263 4686 : set_ps_display(activitymsg);
4264 :
4265 : /* Track source of data in assorted state variables */
4266 4686 : readSource = source;
4267 4686 : XLogReceiptSource = source;
4268 : /* In FROM_STREAM case, caller tracks receipt time, not me */
4269 4686 : if (source != XLOG_FROM_STREAM)
4270 2204 : XLogReceiptTime = GetCurrentTimestamp();
4271 :
4272 4686 : return fd;
4273 : }
4274 344 : if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4275 0 : ereport(PANIC,
4276 : (errcode_for_file_access(),
4277 : errmsg("could not open file \"%s\": %m", path)));
4278 344 : return -1;
4279 : }
4280 :
4281 : /*
4282 : * Open a logfile segment for reading (during recovery).
4283 : *
4284 : * This version searches for the segment with any TLI listed in expectedTLEs.
4285 : */
4286 : static int
4287 2528 : XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
4288 : {
4289 : char path[MAXPGPATH];
4290 : ListCell *cell;
4291 : int fd;
4292 : List *tles;
4293 :
4294 : /*
4295 : * Loop looking for a suitable timeline ID: we might need to read any of
4296 : * the timelines listed in expectedTLEs.
4297 : *
4298 : * We expect curFileTLI on entry to be the TLI of the preceding file in
4299 : * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4300 : * to go backwards; this prevents us from picking up the wrong file when a
4301 : * parent timeline extends to higher segment numbers than the child we
4302 : * want to read.
4303 : *
4304 : * If we haven't read the timeline history file yet, read it now, so that
4305 : * we know which TLIs to scan. We don't save the list in expectedTLEs,
4306 : * however, unless we actually find a valid segment. That way if there is
4307 : * neither a timeline history file nor a WAL segment in the archive, and
4308 : * streaming replication is set up, we'll read the timeline history file
4309 : * streamed from the primary when we start streaming, instead of
4310 : * recovering with a dummy history generated here.
4311 : */
4312 2528 : if (expectedTLEs)
4313 860 : tles = expectedTLEs;
4314 : else
4315 1668 : tles = readTimeLineHistory(recoveryTargetTLI);
4316 :
4317 2886 : foreach(cell, tles)
4318 : {
4319 2570 : TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
4320 2570 : TimeLineID tli = hent->tli;
4321 :
4322 2570 : if (tli < curFileTLI)
4323 8 : break; /* don't bother looking at too-old TLIs */
4324 :
4325 : /*
4326 : * Skip scanning the timeline ID that the logfile segment to read
4327 : * doesn't belong to
4328 : */
4329 2562 : if (hent->begin != InvalidXLogRecPtr)
4330 : {
4331 140 : XLogSegNo beginseg = 0;
4332 :
4333 140 : XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4334 :
4335 : /*
4336 : * The logfile segment that doesn't belong to the timeline is
4337 : * older or newer than the segment that the timeline started or
4338 : * ended at, respectively. It's sufficient to check only the
4339 : * starting segment of the timeline here. Since the timelines are
4340 : * scanned in descending order in this loop, any segments newer
4341 : * than the ending segment should belong to newer timeline and
4342 : * have already been read before. So it's not necessary to check
4343 : * the ending segment of the timeline here.
4344 : */
4345 140 : if (segno < beginseg)
4346 14 : continue;
4347 : }
4348 :
4349 2548 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
4350 : {
4351 870 : fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4352 870 : if (fd != -1)
4353 : {
4354 76 : elog(DEBUG1, "got WAL segment from archive");
4355 76 : if (!expectedTLEs)
4356 30 : expectedTLEs = tles;
4357 2204 : return fd;
4358 : }
4359 : }
4360 :
4361 2472 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
4362 : {
4363 2472 : fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4364 2472 : if (fd != -1)
4365 : {
4366 2128 : if (!expectedTLEs)
4367 1638 : expectedTLEs = tles;
4368 2128 : return fd;
4369 : }
4370 : }
4371 : }
4372 :
4373 : /* Couldn't find it. For simplicity, complain about front timeline */
4374 324 : XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
4375 324 : errno = ENOENT;
4376 324 : ereport(DEBUG2,
4377 : (errcode_for_file_access(),
4378 : errmsg("could not open file \"%s\": %m", path)));
4379 324 : return -1;
4380 : }
4381 :
4382 : /*
4383 : * Set flag to signal the walreceiver to restart. (The startup process calls
4384 : * this on noticing a relevant configuration change.)
4385 : */
4386 : void
4387 6 : StartupRequestWalReceiverRestart(void)
4388 : {
4389 6 : if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
4390 : {
4391 6 : ereport(LOG,
4392 : (errmsg("WAL receiver process shutdown requested")));
4393 :
4394 6 : pendingWalRcvRestart = true;
4395 : }
4396 6 : }
4397 :
4398 :
4399 : /*
4400 : * Has a standby promotion already been triggered?
4401 : *
4402 : * Unlike CheckForStandbyTrigger(), this works in any process
4403 : * that's connected to shared memory.
4404 : */
4405 : bool
4406 104 : PromoteIsTriggered(void)
4407 : {
4408 : /*
4409 : * We check shared state each time only until a standby promotion is
4410 : * triggered. We can't trigger a promotion again, so there's no need to
4411 : * keep checking after the shared variable has once been seen true.
4412 : */
4413 104 : if (LocalPromoteIsTriggered)
4414 84 : return true;
4415 :
4416 20 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4417 20 : LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
4418 20 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4419 :
4420 20 : return LocalPromoteIsTriggered;
4421 : }
4422 :
4423 : static void
4424 84 : SetPromoteIsTriggered(void)
4425 : {
4426 84 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4427 84 : XLogRecoveryCtl->SharedPromoteIsTriggered = true;
4428 84 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4429 :
4430 : /*
4431 : * Mark the recovery pause state as 'not paused' because the paused state
4432 : * ends and promotion continues if a promotion is triggered while recovery
4433 : * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4434 : * return 'paused' while a promotion is ongoing.
4435 : */
4436 84 : SetRecoveryPause(false);
4437 :
4438 84 : LocalPromoteIsTriggered = true;
4439 84 : }
4440 :
4441 : /*
4442 : * Check whether a promote request has arrived.
4443 : */
4444 : static bool
4445 10164 : CheckForStandbyTrigger(void)
4446 : {
4447 10164 : if (LocalPromoteIsTriggered)
4448 98 : return true;
4449 :
4450 10066 : if (IsPromoteSignaled() && CheckPromoteSignal())
4451 : {
4452 84 : ereport(LOG, (errmsg("received promote request")));
4453 84 : RemovePromoteSignalFiles();
4454 84 : ResetPromoteSignaled();
4455 84 : SetPromoteIsTriggered();
4456 84 : return true;
4457 : }
4458 :
4459 9982 : return false;
4460 : }
4461 :
4462 : /*
4463 : * Remove the files signaling a standby promotion request.
4464 : */
4465 : void
4466 1634 : RemovePromoteSignalFiles(void)
4467 : {
4468 1634 : unlink(PROMOTE_SIGNAL_FILE);
4469 1634 : }
4470 :
4471 : /*
4472 : * Check to see if a promote request has arrived.
4473 : */
4474 : bool
4475 1440 : CheckPromoteSignal(void)
4476 : {
4477 : struct stat stat_buf;
4478 :
4479 1440 : if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4480 168 : return true;
4481 :
4482 1272 : return false;
4483 : }
4484 :
4485 : /*
4486 : * Wake up startup process to replay newly arrived WAL, or to notice that
4487 : * failover has been requested.
4488 : */
4489 : void
4490 19510 : WakeupRecovery(void)
4491 : {
4492 19510 : SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4493 19510 : }
4494 :
4495 : /*
4496 : * Schedule a walreceiver wakeup in the main recovery loop.
4497 : */
4498 : void
4499 4 : XLogRequestWalReceiverReply(void)
4500 : {
4501 4 : doRequestWalReceiverReply = true;
4502 4 : }
4503 :
4504 : /*
4505 : * Is HotStandby active yet? This is only important in special backends
4506 : * since normal backends won't ever be able to connect until this returns
4507 : * true. Postmaster knows this by way of signal, not via shared memory.
4508 : *
4509 : * Unlike testing standbyState, this works in any process that's connected to
4510 : * shared memory. (And note that standbyState alone doesn't tell the truth
4511 : * anyway.)
4512 : */
4513 : bool
4514 308 : HotStandbyActive(void)
4515 : {
4516 : /*
4517 : * We check shared state each time only until Hot Standby is active. We
4518 : * can't de-activate Hot Standby, so there's no need to keep checking
4519 : * after the shared variable has once been seen true.
4520 : */
4521 308 : if (LocalHotStandbyActive)
4522 44 : return true;
4523 : else
4524 : {
4525 : /* spinlock is essential on machines with weak memory ordering! */
4526 264 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4527 264 : LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
4528 264 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4529 :
4530 264 : return LocalHotStandbyActive;
4531 : }
4532 : }
4533 :
4534 : /*
4535 : * Like HotStandbyActive(), but to be used only in WAL replay code,
4536 : * where we don't need to ask any other process what the state is.
4537 : */
4538 : static bool
4539 0 : HotStandbyActiveInReplay(void)
4540 : {
4541 : Assert(AmStartupProcess() || !IsPostmasterEnvironment);
4542 0 : return LocalHotStandbyActive;
4543 : }
4544 :
4545 : /*
4546 : * Get latest redo apply position.
4547 : *
4548 : * Exported to allow WALReceiver to read the pointer directly.
4549 : */
4550 : XLogRecPtr
4551 60302 : GetXLogReplayRecPtr(TimeLineID *replayTLI)
4552 : {
4553 : XLogRecPtr recptr;
4554 : TimeLineID tli;
4555 :
4556 60302 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4557 60302 : recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
4558 60302 : tli = XLogRecoveryCtl->lastReplayedTLI;
4559 60302 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4560 :
4561 60302 : if (replayTLI)
4562 4312 : *replayTLI = tli;
4563 60302 : return recptr;
4564 : }
4565 :
4566 :
4567 : /*
4568 : * Get position of last applied, or the record being applied.
4569 : *
4570 : * This is different from GetXLogReplayRecPtr() in that if a WAL
4571 : * record is currently being applied, this includes that record.
4572 : */
4573 : XLogRecPtr
4574 10370 : GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
4575 : {
4576 : XLogRecPtr recptr;
4577 : TimeLineID tli;
4578 :
4579 10370 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4580 10370 : recptr = XLogRecoveryCtl->replayEndRecPtr;
4581 10370 : tli = XLogRecoveryCtl->replayEndTLI;
4582 10370 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4583 :
4584 10370 : if (replayEndTLI)
4585 10370 : *replayEndTLI = tli;
4586 10370 : return recptr;
4587 : }
4588 :
4589 : /*
4590 : * Save timestamp of latest processed commit/abort record.
4591 : *
4592 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4593 : * seen by processes other than the startup process. Note in particular
4594 : * that CreateRestartPoint is executed in the checkpointer.
4595 : */
4596 : static void
4597 41544 : SetLatestXTime(TimestampTz xtime)
4598 : {
4599 41544 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4600 41544 : XLogRecoveryCtl->recoveryLastXTime = xtime;
4601 41544 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4602 41544 : }
4603 :
4604 : /*
4605 : * Fetch timestamp of latest processed commit/abort record.
4606 : */
4607 : TimestampTz
4608 662 : GetLatestXTime(void)
4609 : {
4610 : TimestampTz xtime;
4611 :
4612 662 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4613 662 : xtime = XLogRecoveryCtl->recoveryLastXTime;
4614 662 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4615 :
4616 662 : return xtime;
4617 : }
4618 :
4619 : /*
4620 : * Save timestamp of the next chunk of WAL records to apply.
4621 : *
4622 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4623 : * seen by all backends.
4624 : */
4625 : static void
4626 8534 : SetCurrentChunkStartTime(TimestampTz xtime)
4627 : {
4628 8534 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4629 8534 : XLogRecoveryCtl->currentChunkStartTime = xtime;
4630 8534 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4631 8534 : }
4632 :
4633 : /*
4634 : * Fetch timestamp of latest processed commit/abort record.
4635 : * Startup process maintains an accurate local copy in XLogReceiptTime
4636 : */
4637 : TimestampTz
4638 204 : GetCurrentChunkReplayStartTime(void)
4639 : {
4640 : TimestampTz xtime;
4641 :
4642 204 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4643 204 : xtime = XLogRecoveryCtl->currentChunkStartTime;
4644 204 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4645 :
4646 204 : return xtime;
4647 : }
4648 :
4649 : /*
4650 : * Returns time of receipt of current chunk of XLOG data, as well as
4651 : * whether it was received from streaming replication or from archives.
4652 : */
4653 : void
4654 58 : GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4655 : {
4656 : /*
4657 : * This must be executed in the startup process, since we don't export the
4658 : * relevant state to shared memory.
4659 : */
4660 : Assert(InRecovery);
4661 :
4662 58 : *rtime = XLogReceiptTime;
4663 58 : *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4664 58 : }
4665 :
4666 : /*
4667 : * Note that text field supplied is a parameter name and does not require
4668 : * translation
4669 : */
4670 : void
4671 1150 : RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4672 : {
4673 1150 : if (currValue < minValue)
4674 : {
4675 0 : if (HotStandbyActiveInReplay())
4676 : {
4677 0 : bool warned_for_promote = false;
4678 :
4679 0 : ereport(WARNING,
4680 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4681 : errmsg("hot standby is not possible because of insufficient parameter settings"),
4682 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4683 : param_name,
4684 : currValue,
4685 : minValue)));
4686 :
4687 0 : SetRecoveryPause(true);
4688 :
4689 0 : ereport(LOG,
4690 : (errmsg("recovery has paused"),
4691 : errdetail("If recovery is unpaused, the server will shut down."),
4692 : errhint("You can then restart the server after making the necessary configuration changes.")));
4693 :
4694 0 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
4695 : {
4696 0 : HandleStartupProcInterrupts();
4697 :
4698 0 : if (CheckForStandbyTrigger())
4699 : {
4700 0 : if (!warned_for_promote)
4701 0 : ereport(WARNING,
4702 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4703 : errmsg("promotion is not possible because of insufficient parameter settings"),
4704 :
4705 : /*
4706 : * Repeat the detail from above so it's easy to find
4707 : * in the log.
4708 : */
4709 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4710 : param_name,
4711 : currValue,
4712 : minValue),
4713 : errhint("Restart the server after making the necessary configuration changes.")));
4714 0 : warned_for_promote = true;
4715 : }
4716 :
4717 : /*
4718 : * If recovery pause is requested then set it paused. While
4719 : * we are in the loop, user might resume and pause again so
4720 : * set this every time.
4721 : */
4722 0 : ConfirmRecoveryPaused();
4723 :
4724 : /*
4725 : * We wait on a condition variable that will wake us as soon
4726 : * as the pause ends, but we use a timeout so we can check the
4727 : * above conditions periodically too.
4728 : */
4729 0 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
4730 : WAIT_EVENT_RECOVERY_PAUSE);
4731 : }
4732 0 : ConditionVariableCancelSleep();
4733 : }
4734 :
4735 0 : ereport(FATAL,
4736 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4737 : errmsg("recovery aborted because of insufficient parameter settings"),
4738 : /* Repeat the detail from above so it's easy to find in the log. */
4739 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4740 : param_name,
4741 : currValue,
4742 : minValue),
4743 : errhint("You can restart the server after making the necessary configuration changes.")));
4744 : }
4745 1150 : }
4746 :
4747 :
4748 : /*
4749 : * GUC check_hook for primary_slot_name
4750 : */
4751 : bool
4752 2290 : check_primary_slot_name(char **newval, void **extra, GucSource source)
4753 : {
4754 2290 : if (*newval && strcmp(*newval, "") != 0 &&
4755 282 : !ReplicationSlotValidateName(*newval, WARNING))
4756 0 : return false;
4757 :
4758 2290 : return true;
4759 : }
4760 :
4761 : /*
4762 : * Recovery target settings: Only one of the several recovery_target* settings
4763 : * may be set. Setting a second one results in an error. The global variable
4764 : * recoveryTarget tracks which kind of recovery target was chosen. Other
4765 : * variables store the actual target value (for example a string or a xid).
4766 : * The assign functions of the parameters check whether a competing parameter
4767 : * was already set. But we want to allow setting the same parameter multiple
4768 : * times. We also want to allow unsetting a parameter and setting a different
4769 : * one, so we unset recoveryTarget when the parameter is set to an empty
4770 : * string.
4771 : *
4772 : * XXX this code is broken by design. Throwing an error from a GUC assign
4773 : * hook breaks fundamental assumptions of guc.c. So long as all the variables
4774 : * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4775 : * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4776 : * that we have odd behaviors such as unexpected GUC ordering dependencies.
4777 : */
4778 :
4779 : static void
4780 : pg_attribute_noreturn()
4781 2 : error_multiple_recovery_targets(void)
4782 : {
4783 2 : ereport(ERROR,
4784 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4785 : errmsg("multiple recovery targets specified"),
4786 : errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4787 : }
4788 :
4789 : /*
4790 : * GUC check_hook for recovery_target
4791 : */
4792 : bool
4793 2010 : check_recovery_target(char **newval, void **extra, GucSource source)
4794 : {
4795 2010 : if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4796 : {
4797 0 : GUC_check_errdetail("The only allowed value is \"immediate\".");
4798 0 : return false;
4799 : }
4800 2010 : return true;
4801 : }
4802 :
4803 : /*
4804 : * GUC assign_hook for recovery_target
4805 : */
4806 : void
4807 2010 : assign_recovery_target(const char *newval, void *extra)
4808 : {
4809 2010 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4810 0 : recoveryTarget != RECOVERY_TARGET_IMMEDIATE)
4811 0 : error_multiple_recovery_targets();
4812 :
4813 2010 : if (newval && strcmp(newval, "") != 0)
4814 2 : recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
4815 : else
4816 2008 : recoveryTarget = RECOVERY_TARGET_UNSET;
4817 2010 : }
4818 :
4819 : /*
4820 : * GUC check_hook for recovery_target_lsn
4821 : */
4822 : bool
4823 2020 : check_recovery_target_lsn(char **newval, void **extra, GucSource source)
4824 : {
4825 2020 : if (strcmp(*newval, "") != 0)
4826 : {
4827 : XLogRecPtr lsn;
4828 : XLogRecPtr *myextra;
4829 16 : bool have_error = false;
4830 :
4831 16 : lsn = pg_lsn_in_internal(*newval, &have_error);
4832 16 : if (have_error)
4833 0 : return false;
4834 :
4835 16 : myextra = (XLogRecPtr *) guc_malloc(ERROR, sizeof(XLogRecPtr));
4836 16 : *myextra = lsn;
4837 16 : *extra = myextra;
4838 : }
4839 2020 : return true;
4840 : }
4841 :
4842 : /*
4843 : * GUC assign_hook for recovery_target_lsn
4844 : */
4845 : void
4846 2020 : assign_recovery_target_lsn(const char *newval, void *extra)
4847 : {
4848 2020 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4849 0 : recoveryTarget != RECOVERY_TARGET_LSN)
4850 0 : error_multiple_recovery_targets();
4851 :
4852 2020 : if (newval && strcmp(newval, "") != 0)
4853 : {
4854 16 : recoveryTarget = RECOVERY_TARGET_LSN;
4855 16 : recoveryTargetLSN = *((XLogRecPtr *) extra);
4856 : }
4857 : else
4858 2004 : recoveryTarget = RECOVERY_TARGET_UNSET;
4859 2020 : }
4860 :
4861 : /*
4862 : * GUC check_hook for recovery_target_name
4863 : */
4864 : bool
4865 2022 : check_recovery_target_name(char **newval, void **extra, GucSource source)
4866 : {
4867 : /* Use the value of newval directly */
4868 2022 : if (strlen(*newval) >= MAXFNAMELEN)
4869 : {
4870 0 : GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4871 : "recovery_target_name", MAXFNAMELEN - 1);
4872 0 : return false;
4873 : }
4874 2022 : return true;
4875 : }
4876 :
4877 : /*
4878 : * GUC assign_hook for recovery_target_name
4879 : */
4880 : void
4881 2022 : assign_recovery_target_name(const char *newval, void *extra)
4882 : {
4883 2022 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4884 0 : recoveryTarget != RECOVERY_TARGET_NAME)
4885 0 : error_multiple_recovery_targets();
4886 :
4887 2022 : if (newval && strcmp(newval, "") != 0)
4888 : {
4889 12 : recoveryTarget = RECOVERY_TARGET_NAME;
4890 12 : recoveryTargetName = newval;
4891 : }
4892 : else
4893 2010 : recoveryTarget = RECOVERY_TARGET_UNSET;
4894 2022 : }
4895 :
4896 : /*
4897 : * GUC check_hook for recovery_target_time
4898 : *
4899 : * The interpretation of the recovery_target_time string can depend on the
4900 : * time zone setting, so we need to wait until after all GUC processing is
4901 : * done before we can do the final parsing of the string. This check function
4902 : * only does a parsing pass to catch syntax errors, but we store the string
4903 : * and parse it again when we need to use it.
4904 : */
4905 : bool
4906 2014 : check_recovery_target_time(char **newval, void **extra, GucSource source)
4907 : {
4908 2014 : if (strcmp(*newval, "") != 0)
4909 : {
4910 : /* reject some special values */
4911 6 : if (strcmp(*newval, "now") == 0 ||
4912 6 : strcmp(*newval, "today") == 0 ||
4913 6 : strcmp(*newval, "tomorrow") == 0 ||
4914 6 : strcmp(*newval, "yesterday") == 0)
4915 : {
4916 0 : return false;
4917 : }
4918 :
4919 : /*
4920 : * parse timestamp value (see also timestamptz_in())
4921 : */
4922 : {
4923 6 : char *str = *newval;
4924 : fsec_t fsec;
4925 : struct pg_tm tt,
4926 6 : *tm = &tt;
4927 : int tz;
4928 : int dtype;
4929 : int nf;
4930 : int dterr;
4931 : char *field[MAXDATEFIELDS];
4932 : int ftype[MAXDATEFIELDS];
4933 : char workbuf[MAXDATELEN + MAXDATEFIELDS];
4934 : DateTimeErrorExtra dtextra;
4935 : TimestampTz timestamp;
4936 :
4937 6 : dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4938 : field, ftype, MAXDATEFIELDS, &nf);
4939 6 : if (dterr == 0)
4940 6 : dterr = DecodeDateTime(field, ftype, nf,
4941 : &dtype, tm, &fsec, &tz, &dtextra);
4942 6 : if (dterr != 0)
4943 0 : return false;
4944 6 : if (dtype != DTK_DATE)
4945 0 : return false;
4946 :
4947 6 : if (tm2timestamp(tm, fsec, &tz, ×tamp) != 0)
4948 : {
4949 0 : GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
4950 0 : return false;
4951 : }
4952 : }
4953 : }
4954 2014 : return true;
4955 : }
4956 :
4957 : /*
4958 : * GUC assign_hook for recovery_target_time
4959 : */
4960 : void
4961 2014 : assign_recovery_target_time(const char *newval, void *extra)
4962 : {
4963 2014 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4964 2 : recoveryTarget != RECOVERY_TARGET_TIME)
4965 2 : error_multiple_recovery_targets();
4966 :
4967 2012 : if (newval && strcmp(newval, "") != 0)
4968 4 : recoveryTarget = RECOVERY_TARGET_TIME;
4969 : else
4970 2008 : recoveryTarget = RECOVERY_TARGET_UNSET;
4971 2012 : }
4972 :
4973 : /*
4974 : * GUC check_hook for recovery_target_timeline
4975 : */
4976 : bool
4977 2010 : check_recovery_target_timeline(char **newval, void **extra, GucSource source)
4978 : {
4979 : RecoveryTargetTimeLineGoal rttg;
4980 : RecoveryTargetTimeLineGoal *myextra;
4981 :
4982 2010 : if (strcmp(*newval, "current") == 0)
4983 0 : rttg = RECOVERY_TARGET_TIMELINE_CONTROLFILE;
4984 2010 : else if (strcmp(*newval, "latest") == 0)
4985 2010 : rttg = RECOVERY_TARGET_TIMELINE_LATEST;
4986 : else
4987 : {
4988 0 : rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
4989 :
4990 0 : errno = 0;
4991 0 : strtoul(*newval, NULL, 0);
4992 0 : if (errno == EINVAL || errno == ERANGE)
4993 : {
4994 0 : GUC_check_errdetail("\"recovery_target_timeline\" is not a valid number.");
4995 0 : return false;
4996 : }
4997 : }
4998 :
4999 2010 : myextra = (RecoveryTargetTimeLineGoal *) guc_malloc(ERROR, sizeof(RecoveryTargetTimeLineGoal));
5000 2010 : *myextra = rttg;
5001 2010 : *extra = myextra;
5002 :
5003 2010 : return true;
5004 : }
5005 :
5006 : /*
5007 : * GUC assign_hook for recovery_target_timeline
5008 : */
5009 : void
5010 2010 : assign_recovery_target_timeline(const char *newval, void *extra)
5011 : {
5012 2010 : recoveryTargetTimeLineGoal = *((RecoveryTargetTimeLineGoal *) extra);
5013 2010 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
5014 0 : recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5015 : else
5016 2010 : recoveryTargetTLIRequested = 0;
5017 2010 : }
5018 :
5019 : /*
5020 : * GUC check_hook for recovery_target_xid
5021 : */
5022 : bool
5023 2010 : check_recovery_target_xid(char **newval, void **extra, GucSource source)
5024 : {
5025 2010 : if (strcmp(*newval, "") != 0)
5026 : {
5027 : TransactionId xid;
5028 : TransactionId *myextra;
5029 :
5030 2 : errno = 0;
5031 2 : xid = (TransactionId) strtou64(*newval, NULL, 0);
5032 2 : if (errno == EINVAL || errno == ERANGE)
5033 0 : return false;
5034 :
5035 2 : myextra = (TransactionId *) guc_malloc(ERROR, sizeof(TransactionId));
5036 2 : *myextra = xid;
5037 2 : *extra = myextra;
5038 : }
5039 2010 : return true;
5040 : }
5041 :
5042 : /*
5043 : * GUC assign_hook for recovery_target_xid
5044 : */
5045 : void
5046 2010 : assign_recovery_target_xid(const char *newval, void *extra)
5047 : {
5048 2010 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
5049 0 : recoveryTarget != RECOVERY_TARGET_XID)
5050 0 : error_multiple_recovery_targets();
5051 :
5052 2010 : if (newval && strcmp(newval, "") != 0)
5053 : {
5054 2 : recoveryTarget = RECOVERY_TARGET_XID;
5055 2 : recoveryTargetXid = *((TransactionId *) extra);
5056 : }
5057 : else
5058 2008 : recoveryTarget = RECOVERY_TARGET_UNSET;
5059 2010 : }
|