Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * xlogrecovery.c
4 : * Functions for WAL recovery, standby mode
5 : *
6 : * This source file contains functions controlling WAL recovery.
7 : * InitWalRecovery() initializes the system for crash or archive recovery,
8 : * or standby mode, depending on configuration options and the state of
9 : * the control file and possible backup label file. PerformWalRecovery()
10 : * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 : * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 : * and prepares information needed to initialize the WAL for writes. In
13 : * addition to these three main functions, there are a bunch of functions
14 : * for interrogating recovery state and controlling the recovery process.
15 : *
16 : *
17 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
18 : * Portions Copyright (c) 1994, Regents of the University of California
19 : *
20 : * src/backend/access/transam/xlogrecovery.c
21 : *
22 : *-------------------------------------------------------------------------
23 : */
24 :
25 : #include "postgres.h"
26 :
27 : #include <ctype.h>
28 : #include <math.h>
29 : #include <time.h>
30 : #include <sys/stat.h>
31 : #include <sys/time.h>
32 : #include <unistd.h>
33 :
34 : #include "access/timeline.h"
35 : #include "access/transam.h"
36 : #include "access/xact.h"
37 : #include "access/xlog_internal.h"
38 : #include "access/xlogarchive.h"
39 : #include "access/xlogprefetcher.h"
40 : #include "access/xlogreader.h"
41 : #include "access/xlogrecovery.h"
42 : #include "access/xlogutils.h"
43 : #include "backup/basebackup.h"
44 : #include "catalog/pg_control.h"
45 : #include "commands/tablespace.h"
46 : #include "common/file_utils.h"
47 : #include "miscadmin.h"
48 : #include "pgstat.h"
49 : #include "postmaster/bgwriter.h"
50 : #include "postmaster/startup.h"
51 : #include "replication/slot.h"
52 : #include "replication/slotsync.h"
53 : #include "replication/walreceiver.h"
54 : #include "storage/fd.h"
55 : #include "storage/ipc.h"
56 : #include "storage/latch.h"
57 : #include "storage/pmsignal.h"
58 : #include "storage/procarray.h"
59 : #include "storage/spin.h"
60 : #include "utils/datetime.h"
61 : #include "utils/fmgrprotos.h"
62 : #include "utils/guc_hooks.h"
63 : #include "utils/pg_lsn.h"
64 : #include "utils/ps_status.h"
65 : #include "utils/pg_rusage.h"
66 :
67 : /* Unsupported old recovery command file names (relative to $PGDATA) */
68 : #define RECOVERY_COMMAND_FILE "recovery.conf"
69 : #define RECOVERY_COMMAND_DONE "recovery.done"
70 :
71 : /*
72 : * GUC support
73 : */
74 : const struct config_enum_entry recovery_target_action_options[] = {
75 : {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
76 : {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
77 : {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
78 : {NULL, 0, false}
79 : };
80 :
81 : /* options formerly taken from recovery.conf for archive recovery */
82 : char *recoveryRestoreCommand = NULL;
83 : char *recoveryEndCommand = NULL;
84 : char *archiveCleanupCommand = NULL;
85 : RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
86 : bool recoveryTargetInclusive = true;
87 : int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
88 : TransactionId recoveryTargetXid;
89 : char *recovery_target_time_string;
90 : TimestampTz recoveryTargetTime;
91 : const char *recoveryTargetName;
92 : XLogRecPtr recoveryTargetLSN;
93 : int recovery_min_apply_delay = 0;
94 :
95 : /* options formerly taken from recovery.conf for XLOG streaming */
96 : char *PrimaryConnInfo = NULL;
97 : char *PrimarySlotName = NULL;
98 : bool wal_receiver_create_temp_slot = false;
99 :
100 : /*
101 : * recoveryTargetTimeLineGoal: what the user requested, if any
102 : *
103 : * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
104 : *
105 : * recoveryTargetTLI: the currently understood target timeline; this may change during recovery
106 : *
107 : * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
108 : * the timelines of its known parents, newest first (so recoveryTargetTLI is
109 : * always the first list member). Only these TLIs are expected to be seen in
110 : * the WAL segments we read, and indeed only these TLIs will be considered as
111 : * candidate WAL files to open at all.
112 : *
113 : * curFileTLI: the TLI appearing in the name of the current input WAL file.
114 : * (This is not necessarily the same as the timeline from which we are
115 : * replaying WAL, which StartupXLOG calls replayTLI, because we could be
116 : * scanning data that was copied from an ancestor timeline when the current
117 : * file was created.) During a sequential scan we do not allow this value
118 : * to decrease.
119 : */
120 : RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
121 : TimeLineID recoveryTargetTLIRequested = 0;
122 : TimeLineID recoveryTargetTLI = 0;
123 : static List *expectedTLEs;
124 : static TimeLineID curFileTLI;
125 :
126 : /*
127 : * When ArchiveRecoveryRequested is set, archive recovery was requested,
128 : * ie. signal files were present. When InArchiveRecovery is set, we are
129 : * currently recovering using offline XLOG archives. These variables are only
130 : * valid in the startup process.
131 : *
132 : * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
133 : * currently performing crash recovery using only XLOG files in pg_wal, but
134 : * will switch to using offline XLOG archives as soon as we reach the end of
135 : * WAL in pg_wal.
136 : */
137 : bool ArchiveRecoveryRequested = false;
138 : bool InArchiveRecovery = false;
139 :
140 : /*
141 : * When StandbyModeRequested is set, standby mode was requested, i.e.
142 : * standby.signal file was present. When StandbyMode is set, we are currently
143 : * in standby mode. These variables are only valid in the startup process.
144 : * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
145 : */
146 : static bool StandbyModeRequested = false;
147 : bool StandbyMode = false;
148 :
149 : /* was a signal file present at startup? */
150 : static bool standby_signal_file_found = false;
151 : static bool recovery_signal_file_found = false;
152 :
153 : /*
154 : * CheckPointLoc is the position of the checkpoint record that determines
155 : * where to start the replay. It comes from the backup label file or the
156 : * control file.
157 : *
158 : * RedoStartLSN is the checkpoint's REDO location, also from the backup label
159 : * file or the control file. In standby mode, XLOG streaming usually starts
160 : * from the position where an invalid record was found. But if we fail to
161 : * read even the initial checkpoint record, we use the REDO location instead
162 : * of the checkpoint location as the start position of XLOG streaming.
163 : * Otherwise we would have to jump backwards to the REDO location after
164 : * reading the checkpoint record, because the REDO record can precede the
165 : * checkpoint record.
166 : */
167 : static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
168 : static TimeLineID CheckPointTLI = 0;
169 : static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
170 : static TimeLineID RedoStartTLI = 0;
171 :
172 : /*
173 : * Local copy of SharedHotStandbyActive variable. False actually means "not
174 : * known, need to check the shared state".
175 : */
176 : static bool LocalHotStandbyActive = false;
177 :
178 : /*
179 : * Local copy of SharedPromoteIsTriggered variable. False actually means "not
180 : * known, need to check the shared state".
181 : */
182 : static bool LocalPromoteIsTriggered = false;
183 :
184 : /* Has the recovery code requested a walreceiver wakeup? */
185 : static bool doRequestWalReceiverReply;
186 :
187 : /* XLogReader object used to parse the WAL records */
188 : static XLogReaderState *xlogreader = NULL;
189 :
190 : /* XLogPrefetcher object used to consume WAL records with read-ahead */
191 : static XLogPrefetcher *xlogprefetcher = NULL;
192 :
193 : /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
194 : typedef struct XLogPageReadPrivate
195 : {
196 : int emode; /* ReadRecord's emode argument (a log level such as LOG) */
197 : bool fetching_ckpt; /* are we fetching a checkpoint record? */
198 : bool randAccess; /* NOTE(review): not a ReadRecord argument; WaitForWALToBecomeAvailable takes a same-named flag -- confirm where this is set */
199 : TimeLineID replayTLI; /* ReadRecord's replayTLI argument */
200 : } XLogPageReadPrivate;
201 :
202 : /* flag to tell XLogPageRead that we have started replaying */
203 : static bool InRedo = false;
204 :
205 : /*
206 : * Codes indicating where we got a WAL file from during recovery, or where
207 : * to attempt to get one.
208 : */
209 : typedef enum
210 : {
211 : XLOG_FROM_ANY = 0, /* request to read WAL from any source */
212 : XLOG_FROM_ARCHIVE, /* restored using restore_command */
213 : XLOG_FROM_PG_WAL, /* existing file in pg_wal */
214 : XLOG_FROM_STREAM, /* streamed from primary */
215 : } XLogSource;
216 :
217 : /* human-readable names for XLogSources, for debugging output; listed in XLogSource enum order */
218 : static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
219 :
220 : /*
221 : * readFile is -1 or a kernel FD for the log file segment that's currently
222 : * open for reading. readSegNo identifies the segment. readOff is the offset
223 : * of the page just read, readLen indicates how much of it has been read into
224 : * readBuf, and readSource indicates where we got the currently open file from.
225 : *
226 : * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
227 : * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
228 : * worthwhile, since the XLOG is not read by general-purpose sessions.
229 : */
230 : static int readFile = -1;
231 : static XLogSegNo readSegNo = 0;
232 : static uint32 readOff = 0;
233 : static uint32 readLen = 0;
234 : static XLogSource readSource = XLOG_FROM_ANY;
235 :
236 : /*
237 : * Keeps track of which source we're currently reading from. This is
238 : * different from readSource in that this is always set, even when we don't
239 : * currently have a WAL file open. If lastSourceFailed is set, our last
240 : * attempt to read from currentSource failed, and we should try another source
241 : * next.
242 : *
243 : * pendingWalRcvRestart is set when a config change occurs that requires a
244 : * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
245 : */
246 : static XLogSource currentSource = XLOG_FROM_ANY;
247 : static bool lastSourceFailed = false;
248 : static bool pendingWalRcvRestart = false;
249 :
250 : /*
251 : * These variables track when we last obtained some WAL data to process,
252 : * and where we got it from. (XLogReceiptSource is initially the same as
253 : * readSource, but readSource gets reset to zero when we don't have data
254 : * to process right now. It is also different from currentSource, which
255 : * also changes when we try to read from a source and fail, while
256 : * XLogReceiptSource tracks where we last successfully read some WAL.)
257 : */
258 : static TimestampTz XLogReceiptTime = 0;
259 : static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
260 :
261 : /* Local copy of WalRcv->flushedUpto */
262 : static XLogRecPtr flushedUpto = 0;
263 : static TimeLineID receiveTLI = 0;
264 :
265 : /*
266 : * Copy of minRecoveryPoint and backupEndPoint from the control file.
267 : *
268 : * In order to reach consistency, we must replay the WAL up to
269 : * minRecoveryPoint. If backupEndRequired is true, we must also reach
270 : * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
271 : * to backupStartPoint.
272 : *
273 : * Note: In archive recovery, after consistency has been reached, the
274 : * functions in xlog.c will start updating minRecoveryPoint in the control
275 : * file. But this copy of minRecoveryPoint variable reflects the value at the
276 : * beginning of recovery, and is *not* updated after consistency is reached.
277 : */
278 : static XLogRecPtr minRecoveryPoint;
279 : static TimeLineID minRecoveryPointTLI;
280 :
281 : static XLogRecPtr backupStartPoint;
282 : static XLogRecPtr backupEndPoint;
283 : static bool backupEndRequired = false;
284 :
285 : /*
286 : * Have we reached a consistent database state? In crash recovery, we have
287 : * to replay all the WAL, so reachedConsistency is never set. During archive
288 : * recovery, the database is consistent once minRecoveryPoint is reached.
289 : *
290 : * Consistent state means that the system is internally consistent, all
291 : * the WAL has been replayed up to a certain point, and importantly, there
292 : * is no trace of later actions on disk.
293 : */
294 : bool reachedConsistency = false;
295 :
296 : /* Buffers dedicated to consistency checks of size BLCKSZ */
297 : static char *replay_image_masked = NULL;
298 : static char *primary_image_masked = NULL;
299 :
300 :
301 : /*
302 : * Shared-memory state for WAL recovery.
303 : */
304 : typedef struct XLogRecoveryCtlData
305 : {
306 : /*
307 : * SharedHotStandbyActive indicates if we allow hot standby queries to be
308 : * run. Protected by info_lck.
309 : */
310 : bool SharedHotStandbyActive;
311 :
312 : /*
313 : * SharedPromoteIsTriggered indicates if a standby promotion has been
314 : * triggered. Protected by info_lck.
315 : */
316 : bool SharedPromoteIsTriggered;
317 :
318 : /*
319 : * recoveryWakeupLatch is used to wake up the startup process to continue
320 : * WAL replay, if it is waiting for WAL to arrive or promotion to be
321 : * requested.
322 : *
323 : * Note that the startup process also uses another latch, its procLatch,
324 : * to wait for recovery conflict. We could get rid of recoveryWakeupLatch for
325 : * signaling the startup process in favor of using its procLatch, which
326 : * would comport better with possible generic signal handlers using that latch.
327 : * But we should not do that because the startup process doesn't assume
328 : * that it's woken up by walreceiver process or SIGHUP signal handler
329 : * while it's waiting for recovery conflict. The separate latches,
330 : * recoveryWakeupLatch and procLatch, should be used for inter-process
331 : * communication for WAL replay and recovery conflict, respectively.
332 : */
333 : Latch recoveryWakeupLatch;
334 :
335 : /*
336 : * Last record successfully replayed.
337 : */
338 : XLogRecPtr lastReplayedReadRecPtr; /* start position */
339 : XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
340 : TimeLineID lastReplayedTLI; /* timeline */
341 :
342 : /*
343 : * When we're currently replaying a record, i.e. in a redo function,
344 : * replayEndRecPtr points to the end+1 of the record being replayed,
345 : * otherwise it's equal to lastReplayedEndRecPtr.
346 : */
347 : XLogRecPtr replayEndRecPtr;
348 : TimeLineID replayEndTLI;
349 : /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
350 : TimestampTz recoveryLastXTime;
351 :
352 : /*
353 : * timestamp of when we started replaying the current chunk of WAL data,
354 : * only relevant for replication or archive recovery
355 : */
356 : TimestampTz currentChunkStartTime;
357 : /* Recovery pause state */
358 : RecoveryPauseState recoveryPauseState;
359 : ConditionVariable recoveryNotPausedCV;
360 :
361 : slock_t info_lck; /* locks shared variables shown above */
362 : } XLogRecoveryCtlData;
363 :
364 : static XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
365 :
366 : /*
367 : * abortedRecPtr is the start pointer of a broken record at end of WAL when
368 : * recovery completes; missingContrecPtr is the location of the first
369 : * contrecord that went missing. See CreateOverwriteContrecordRecord for
370 : * details.
371 : */
372 : static XLogRecPtr abortedRecPtr;
373 : static XLogRecPtr missingContrecPtr;
374 :
375 : /*
376 : * if recoveryStopsBefore/After returns true, it saves information of the stop
377 : * point here
378 : */
379 : static TransactionId recoveryStopXid;
380 : static TimestampTz recoveryStopTime;
381 : static XLogRecPtr recoveryStopLSN;
382 : static char recoveryStopName[MAXFNAMELEN];
383 : static bool recoveryStopAfter;
384 :
385 : /* prototypes for local functions */
386 : static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
387 :
388 : static void EnableStandbyMode(void);
389 : static void readRecoverySignalFile(void);
390 : static void validateRecoveryParameters(void);
391 : static bool read_backup_label(XLogRecPtr *checkPointLoc,
392 : TimeLineID *backupLabelTLI,
393 : bool *backupEndRequired, bool *backupFromStandby);
394 : static bool read_tablespace_map(List **tablespaces);
395 :
396 : static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
397 : static void CheckRecoveryConsistency(void);
398 : static void rm_redo_error_callback(void *arg);
399 : #ifdef WAL_DEBUG
400 : static void xlog_outrec(StringInfo buf, XLogReaderState *record);
401 : #endif
402 : static void xlog_block_info(StringInfo buf, XLogReaderState *record);
403 : static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
404 : TimeLineID prevTLI, TimeLineID replayTLI);
405 : static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
406 : static void verifyBackupPageConsistency(XLogReaderState *record);
407 :
408 : static bool recoveryStopsBefore(XLogReaderState *record);
409 : static bool recoveryStopsAfter(XLogReaderState *record);
410 : static char *getRecoveryStopReason(void);
411 : static void recoveryPausesHere(bool endOfRecovery);
412 : static bool recoveryApplyDelay(XLogReaderState *record);
413 : static void ConfirmRecoveryPaused(void);
414 :
415 : static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher,
416 : int emode, bool fetching_ckpt,
417 : TimeLineID replayTLI);
418 :
419 : static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
420 : int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
421 : static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
422 : bool randAccess,
423 : bool fetching_ckpt,
424 : XLogRecPtr tliRecPtr,
425 : TimeLineID replayTLI,
426 : XLogRecPtr replayLSN,
427 : bool nonblocking);
428 : static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
429 : static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
430 : XLogRecPtr RecPtr, TimeLineID replayTLI);
431 : static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
432 : static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
433 : XLogSource source, bool notfoundOk);
434 : static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source);
435 :
436 : static bool CheckForStandbyTrigger(void);
437 : static void SetPromoteIsTriggered(void);
438 : static bool HotStandbyActiveInReplay(void);
439 :
440 : static void SetCurrentChunkStartTime(TimestampTz xtime);
441 : static void SetLatestXTime(TimestampTz xtime);
442 :
443 : /*
444 : * Initialization of shared memory for WAL recovery
445 : */
446 : Size
447 5484 : XLogRecoveryShmemSize(void) /* returns the number of bytes of shared memory this module needs */
448 : {
449 : Size size;
450 :
451 : /* XLogRecoveryCtl is the only structure counted here */
452 5484 : size = sizeof(XLogRecoveryCtlData);
453 :
454 5484 : return size;
455 : }
456 :
457 : void
458 1918 : XLogRecoveryShmemInit(void) /* obtain XLogRecoveryCtl from ShmemInitStruct() and initialize it unless "found" is set */
459 : {
460 : bool found; /* output flag filled in by ShmemInitStruct() */
461 :
462 1918 : XLogRecoveryCtl = (XLogRecoveryCtlData *)
463 1918 : ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
464 1918 : if (found)
465 0 : return; /* presumably the struct already existed; its contents are left untouched */
466 1918 : memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData)); /* start from all-zero contents */
467 :
468 1918 : SpinLockInit(&XLogRecoveryCtl->info_lck); /* members below need explicit initialization beyond zeroing */
469 1918 : InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
470 1918 : ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
471 : }
472 :
473 : /*
474 : * Enter standby mode: set the StandbyMode flag and switch off the startup
475 : * progress timeout. Takes no arguments and returns nothing.
476 : */
477 : static void
478 192 : EnableStandbyMode(void)
479 : {
480 192 : StandbyMode = true;
481 :
482 : /*
483 : * To avoid server log bloat, we don't report recovery progress in a
484 : * standby as it will always be in recovery unless promoted. We disable
485 : * startup progress timeout in standby mode to avoid calling
486 : * startup_progress_timeout_handler() unnecessarily.
487 : */
488 192 : disable_startup_progress_timeout();
489 192 : }
490 :
491 : /*
492 : * Prepare the system for WAL recovery, if needed.
493 : *
494 : * This is called by StartupXLOG() which coordinates the server startup
495 : * sequence. This function analyzes the control file and the backup label
496 : * file, if any, and figures out whether we need to perform crash recovery or
497 : * archive recovery, and how far we need to replay the WAL to reach a
498 : * consistent state.
499 : *
500 : * This doesn't yet change the on-disk state, except for creating the symlinks
501 : * from table space map file if any, and for fetching WAL files needed to find
502 : * the checkpoint record. On entry, the caller has already read the control
503 : * file into memory, and passes it as argument. This function updates it to
504 : * reflect the recovery state, and the caller is expected to write it back to
505 : * disk after initializing other subsystems, but before calling
506 : * PerformWalRecovery().
507 : *
508 : * This initializes some global variables, such as ArchiveRecoveryRequested,
509 : * StandbyModeRequested and InRecovery.
510 : */
511 : void
512 1650 : InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
513 : bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
514 : {
515 : XLogPageReadPrivate *private;
516 : struct stat st;
517 : bool wasShutdown;
518 : XLogRecord *record;
519 : DBState dbstate_at_startup;
520 1650 : bool haveTblspcMap = false;
521 1650 : bool haveBackupLabel = false;
522 : CheckPoint checkPoint;
523 1650 : bool backupFromStandby = false;
524 :
525 1650 : dbstate_at_startup = ControlFile->state;
526 :
527 : /*
528 : * Initialize on the assumption we want to recover to the latest timeline
529 : * that's active according to pg_control.
530 : */
531 1650 : if (ControlFile->minRecoveryPointTLI >
532 1650 : ControlFile->checkPointCopy.ThisTimeLineID)
533 4 : recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
534 : else
535 1646 : recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
536 :
537 : /*
538 : * Check for signal files, and if so set up state for offline recovery
539 : */
540 1650 : readRecoverySignalFile();
541 1650 : validateRecoveryParameters();
542 :
543 : /*
544 : * Take ownership of the wakeup latch if we're going to sleep during
545 : * recovery, if required.
546 : */
547 1650 : if (ArchiveRecoveryRequested)
548 202 : OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
549 :
550 : /*
551 : * Set the WAL reading processor now, as it will be needed when reading
552 : * the checkpoint record required (backup_label or not).
553 : */
554 1650 : private = palloc0(sizeof(XLogPageReadPrivate));
555 1650 : xlogreader =
556 1650 : XLogReaderAllocate(wal_segment_size, NULL,
557 1650 : XL_ROUTINE(.page_read = &XLogPageRead,
558 : .segment_open = NULL,
559 : .segment_close = wal_segment_close),
560 : private);
561 1650 : if (!xlogreader)
562 0 : ereport(ERROR,
563 : (errcode(ERRCODE_OUT_OF_MEMORY),
564 : errmsg("out of memory"),
565 : errdetail("Failed while allocating a WAL reading processor.")));
566 1650 : xlogreader->system_identifier = ControlFile->system_identifier;
567 :
568 : /*
569 : * Set the WAL decode buffer size. This limits how far ahead we can read
570 : * in the WAL.
571 : */
572 1650 : XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size);
573 :
574 : /* Create a WAL prefetcher. */
575 1650 : xlogprefetcher = XLogPrefetcherAllocate(xlogreader);
576 :
577 : /*
578 : * Allocate two page buffers dedicated to WAL consistency checks. We do
579 : * it this way, rather than just making static arrays, for two reasons:
580 : * (1) no need to waste the storage in most instantiations of the backend;
581 : * (2) a static char array isn't guaranteed to have any particular
582 : * alignment, whereas palloc() will provide MAXALIGN'd storage.
583 : */
584 1650 : replay_image_masked = (char *) palloc(BLCKSZ);
585 1650 : primary_image_masked = (char *) palloc(BLCKSZ);
586 :
587 : /*
588 : * Read the backup_label file. We want to run this part of the recovery
589 : * process after checking for signal files and after performing validation
590 : * of the recovery parameters.
591 : */
592 1650 : if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
593 : &backupFromStandby))
594 : {
595 136 : List *tablespaces = NIL;
596 :
597 : /*
598 : * Archive recovery was requested, and thanks to the backup label
599 : * file, we know how far we need to replay to reach consistency. Enter
600 : * archive recovery directly.
601 : */
602 136 : InArchiveRecovery = true;
603 136 : if (StandbyModeRequested)
604 114 : EnableStandbyMode();
605 :
606 : /*
607 : * Omitting backup_label when creating a new replica, PITR node etc.
608 : * unfortunately is a common cause of corruption. Logging that
609 : * backup_label was used makes it a bit easier to exclude that as the
610 : * cause of observed corruption.
611 : *
612 : * Do so before we try to read the checkpoint record (which can fail),
613 : * as otherwise it can be hard to understand why a checkpoint other
614 : * than ControlFile->checkPoint is used.
615 : */
616 136 : ereport(LOG,
617 : (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
618 : LSN_FORMAT_ARGS(RedoStartLSN),
619 : LSN_FORMAT_ARGS(CheckPointLoc),
620 : CheckPointTLI)));
621 :
622 : /*
623 : * When a backup_label file is present, we want to roll forward from
624 : * the checkpoint it identifies, rather than using pg_control.
625 : */
626 136 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
627 : CheckPointTLI);
628 136 : if (record != NULL)
629 : {
630 136 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
631 136 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
632 136 : ereport(DEBUG1,
633 : (errmsg_internal("checkpoint record is at %X/%X",
634 : LSN_FORMAT_ARGS(CheckPointLoc))));
635 136 : InRecovery = true; /* force recovery even if SHUTDOWNED */
636 :
637 : /*
638 : * Make sure that REDO location exists. This may not be the case
639 : * if there was a crash during an online backup, which left a
640 : * backup_label around that references a WAL segment that's
641 : * already been archived.
642 : */
643 136 : if (checkPoint.redo < CheckPointLoc)
644 : {
645 136 : XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
646 136 : if (!ReadRecord(xlogprefetcher, LOG, false,
647 : checkPoint.ThisTimeLineID))
648 0 : ereport(FATAL,
649 : (errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X",
650 : LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
651 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
652 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
653 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
654 : DataDir, DataDir, DataDir, DataDir)));
655 : }
656 : }
657 : else
658 : {
659 0 : ereport(FATAL,
660 : (errmsg("could not locate required checkpoint record at %X/%X",
661 : LSN_FORMAT_ARGS(CheckPointLoc)),
662 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
663 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
664 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
665 : DataDir, DataDir, DataDir, DataDir)));
666 : wasShutdown = false; /* keep compiler quiet */
667 : }
668 :
669 : /* Read the tablespace_map file if present and create symlinks. */
670 136 : if (read_tablespace_map(&tablespaces))
671 : {
672 : ListCell *lc;
673 :
674 8 : foreach(lc, tablespaces)
675 : {
676 4 : tablespaceinfo *ti = lfirst(lc);
677 : char *linkloc;
678 :
679 4 : linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
680 :
681 : /*
682 : * Remove the existing symlink if any and Create the symlink
683 : * under PGDATA.
684 : */
685 4 : remove_tablespace_symlink(linkloc);
686 :
687 4 : if (symlink(ti->path, linkloc) < 0)
688 0 : ereport(ERROR,
689 : (errcode_for_file_access(),
690 : errmsg("could not create symbolic link \"%s\": %m",
691 : linkloc)));
692 :
693 4 : pfree(ti->path);
694 4 : pfree(ti);
695 : }
696 :
697 : /* tell the caller to delete it later */
698 4 : haveTblspcMap = true;
699 : }
700 :
701 : /* tell the caller to delete it later */
702 136 : haveBackupLabel = true;
703 : }
704 : else
705 : {
706 : /* No backup_label file has been found if we are here. */
707 :
708 : /*
709 : * If tablespace_map file is present without backup_label file, there
710 : * is no use of such file. There is no harm in retaining it, but it
711 : * is better to get rid of the map file so that we don't have any
712 : * redundant file in data directory and it will avoid any sort of
713 : * confusion. It seems prudent though to just rename the file out of
714 : * the way rather than delete it completely, also we ignore any error
715 : * that occurs in rename operation as even if map file is present
716 : * without backup_label file, it is harmless.
717 : */
718 1514 : if (stat(TABLESPACE_MAP, &st) == 0)
719 : {
720 2 : unlink(TABLESPACE_MAP_OLD);
721 2 : if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
722 2 : ereport(LOG,
723 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
724 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
725 : errdetail("File \"%s\" was renamed to \"%s\".",
726 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
727 : else
728 0 : ereport(LOG,
729 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
730 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
731 : errdetail("Could not rename file \"%s\" to \"%s\": %m.",
732 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
733 : }
734 :
735 : /*
736 : * It's possible that archive recovery was requested, but we don't
737 : * know how far we need to replay the WAL before we reach consistency.
738 : * This can happen for example if a base backup is taken from a
739 : * running server using an atomic filesystem snapshot, without calling
740 : * pg_backup_start/stop. Or if you just kill a running primary server
741 : * and put it into archive recovery by creating a recovery signal
742 : * file.
743 : *
744 : * Our strategy in that case is to perform crash recovery first,
745 : * replaying all the WAL present in pg_wal, and only enter archive
746 : * recovery after that.
747 : *
748 : * But usually we already know how far we need to replay the WAL (up
749 : * to minRecoveryPoint, up to backupEndPoint, or until we see an
750 : * end-of-backup record), and we can enter archive recovery directly.
751 : */
752 1514 : if (ArchiveRecoveryRequested &&
753 78 : (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
754 18 : ControlFile->backupEndRequired ||
755 18 : ControlFile->backupEndPoint != InvalidXLogRecPtr ||
756 18 : ControlFile->state == DB_SHUTDOWNED))
757 : {
758 74 : InArchiveRecovery = true;
759 74 : if (StandbyModeRequested)
760 74 : EnableStandbyMode();
761 : }
762 :
763 : /*
764 : * For the same reason as when starting up with backup_label present,
765 : * emit a log message when we continue initializing from a base
766 : * backup.
767 : */
768 1514 : if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
769 0 : ereport(LOG,
770 : (errmsg("restarting backup recovery with redo LSN %X/%X",
771 : LSN_FORMAT_ARGS(ControlFile->backupStartPoint))));
772 :
773 : /* Get the last valid checkpoint record. */
774 1514 : CheckPointLoc = ControlFile->checkPoint;
775 1514 : CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
776 1514 : RedoStartLSN = ControlFile->checkPointCopy.redo;
777 1514 : RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
778 1514 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
779 : CheckPointTLI);
780 1514 : if (record != NULL)
781 : {
782 1514 : ereport(DEBUG1,
783 : (errmsg_internal("checkpoint record is at %X/%X",
784 : LSN_FORMAT_ARGS(CheckPointLoc))));
785 : }
786 : else
787 : {
788 : /*
789 : * We used to attempt to go back to a secondary checkpoint record
790 : * here, but only when not in standby mode. We now just fail if we
791 : * can't read the last checkpoint because this allows us to
792 : * simplify processing around checkpoints.
793 : */
794 0 : ereport(PANIC,
795 : (errmsg("could not locate a valid checkpoint record at %X/%X",
796 : LSN_FORMAT_ARGS(CheckPointLoc))));
797 : }
798 1514 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
799 1514 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
800 : }
801 :
802 1650 : if (ArchiveRecoveryRequested)
803 : {
804 202 : if (StandbyModeRequested)
805 192 : ereport(LOG,
806 : (errmsg("entering standby mode")));
807 10 : else if (recoveryTarget == RECOVERY_TARGET_XID)
808 0 : ereport(LOG,
809 : (errmsg("starting point-in-time recovery to XID %u",
810 : recoveryTargetXid)));
811 10 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
812 0 : ereport(LOG,
813 : (errmsg("starting point-in-time recovery to %s",
814 : timestamptz_to_str(recoveryTargetTime))));
815 10 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
816 6 : ereport(LOG,
817 : (errmsg("starting point-in-time recovery to \"%s\"",
818 : recoveryTargetName)));
819 4 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
820 0 : ereport(LOG,
821 : (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
822 : LSN_FORMAT_ARGS(recoveryTargetLSN))));
823 4 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
824 0 : ereport(LOG,
825 : (errmsg("starting point-in-time recovery to earliest consistent point")));
826 : else
827 4 : ereport(LOG,
828 : (errmsg("starting archive recovery")));
829 : }
830 :
831 : /*
832 : * If the location of the checkpoint record is not on the expected
833 : * timeline in the history of the requested timeline, we cannot proceed:
834 : * the backup is not part of the history of the requested timeline.
835 : */
836 : Assert(expectedTLEs); /* was initialized by reading checkpoint
837 : * record */
838 1650 : if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
839 : CheckPointTLI)
840 : {
841 : XLogRecPtr switchpoint;
842 :
843 : /*
844 : * tliSwitchPoint will throw an error if the checkpoint's timeline is
845 : * not in expectedTLEs at all.
846 : */
847 0 : switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
848 0 : ereport(FATAL,
849 : (errmsg("requested timeline %u is not a child of this server's history",
850 : recoveryTargetTLI),
851 : errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
852 : LSN_FORMAT_ARGS(ControlFile->checkPoint),
853 : ControlFile->checkPointCopy.ThisTimeLineID,
854 : LSN_FORMAT_ARGS(switchpoint))));
855 : }
856 :
857 : /*
858 : * The min recovery point should be part of the requested timeline's
859 : * history, too.
860 : */
861 1650 : if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
862 72 : tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
863 72 : ControlFile->minRecoveryPointTLI)
864 0 : ereport(FATAL,
865 : (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
866 : recoveryTargetTLI,
867 : LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
868 : ControlFile->minRecoveryPointTLI)));
869 :
870 1650 : ereport(DEBUG1,
871 : (errmsg_internal("redo record is at %X/%X; shutdown %s",
872 : LSN_FORMAT_ARGS(checkPoint.redo),
873 : wasShutdown ? "true" : "false")));
874 1650 : ereport(DEBUG1,
875 : (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
876 : U64FromFullTransactionId(checkPoint.nextXid),
877 : checkPoint.nextOid)));
878 1650 : ereport(DEBUG1,
879 : (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
880 : checkPoint.nextMulti, checkPoint.nextMultiOffset)));
881 1650 : ereport(DEBUG1,
882 : (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
883 : checkPoint.oldestXid, checkPoint.oldestXidDB)));
884 1650 : ereport(DEBUG1,
885 : (errmsg_internal("oldest MultiXactId: %u, in database %u",
886 : checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
887 1650 : ereport(DEBUG1,
888 : (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
889 : checkPoint.oldestCommitTsXid,
890 : checkPoint.newestCommitTsXid)));
891 1650 : if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
892 0 : ereport(PANIC,
893 : (errmsg("invalid next transaction ID")));
894 :
895 : /* sanity check */
896 1650 : if (checkPoint.redo > CheckPointLoc)
897 0 : ereport(PANIC,
898 : (errmsg("invalid redo in checkpoint record")));
899 :
900 : /*
901 : * Check whether we need to force recovery from WAL. If it appears to
902 : * have been a clean shutdown and we did not have a recovery signal file,
903 : * then assume no recovery needed.
904 : */
905 1650 : if (checkPoint.redo < CheckPointLoc)
906 : {
907 212 : if (wasShutdown)
908 0 : ereport(PANIC,
909 : (errmsg("invalid redo record in shutdown checkpoint")));
910 212 : InRecovery = true;
911 : }
912 1438 : else if (ControlFile->state != DB_SHUTDOWNED)
913 184 : InRecovery = true;
914 1254 : else if (ArchiveRecoveryRequested)
915 : {
916 : /* force recovery due to presence of recovery signal file */
917 14 : InRecovery = true;
918 : }
919 :
920 : /*
921 : * If recovery is needed, update our in-memory copy of pg_control to show
922 : * that we are recovering and to show the selected checkpoint as the place
923 : * we are starting from. We also mark pg_control with any minimum recovery
924 : * stop point obtained from a backup history file.
925 : *
926 : * We don't write the changes to disk yet, though. Only do that after
927 : * initializing various subsystems.
928 : */
929 1650 : if (InRecovery)
930 : {
931 410 : if (InArchiveRecovery)
932 : {
933 210 : ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
934 : }
935 : else
936 : {
937 200 : ereport(LOG,
938 : (errmsg("database system was not properly shut down; "
939 : "automatic recovery in progress")));
940 200 : if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
941 4 : ereport(LOG,
942 : (errmsg("crash recovery starts in timeline %u "
943 : "and has target timeline %u",
944 : ControlFile->checkPointCopy.ThisTimeLineID,
945 : recoveryTargetTLI)));
946 200 : ControlFile->state = DB_IN_CRASH_RECOVERY;
947 : }
948 410 : ControlFile->checkPoint = CheckPointLoc;
949 410 : ControlFile->checkPointCopy = checkPoint;
950 410 : if (InArchiveRecovery)
951 : {
952 : /* initialize minRecoveryPoint if not set yet */
953 210 : if (ControlFile->minRecoveryPoint < checkPoint.redo)
954 : {
955 142 : ControlFile->minRecoveryPoint = checkPoint.redo;
956 142 : ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
957 : }
958 : }
959 :
960 : /*
961 : * Set backupStartPoint if we're starting recovery from a base backup.
962 : *
963 : * Also set backupEndPoint and use minRecoveryPoint as the backup end
964 : * location if we're starting recovery from a base backup which was
965 : * taken from a standby. In this case, the database system status in
966 : * pg_control must indicate that the database was already in recovery.
967 : * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
968 : * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
969 : * before reaching this point; e.g. because restore_command or
970 : * primary_conninfo were faulty.
971 : *
972 : * Any other state indicates that the backup somehow became corrupted
973 : * and we can't sensibly continue with recovery.
974 : */
975 410 : if (haveBackupLabel)
976 : {
977 136 : ControlFile->backupStartPoint = checkPoint.redo;
978 136 : ControlFile->backupEndRequired = backupEndRequired;
979 :
980 136 : if (backupFromStandby)
981 : {
982 8 : if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
983 : dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
984 0 : ereport(FATAL,
985 : (errmsg("backup_label contains data inconsistent with control file"),
986 : errhint("This means that the backup is corrupted and you will "
987 : "have to use another backup for recovery.")));
988 8 : ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
989 : }
990 : }
991 : }
992 :
993 : /* remember these, so that we know when we have reached consistency */
994 1650 : backupStartPoint = ControlFile->backupStartPoint;
995 1650 : backupEndRequired = ControlFile->backupEndRequired;
996 1650 : backupEndPoint = ControlFile->backupEndPoint;
997 1650 : if (InArchiveRecovery)
998 : {
999 210 : minRecoveryPoint = ControlFile->minRecoveryPoint;
1000 210 : minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
1001 : }
1002 : else
1003 : {
1004 1440 : minRecoveryPoint = InvalidXLogRecPtr;
1005 1440 : minRecoveryPointTLI = 0;
1006 : }
1007 :
1008 : /*
1009 : * Start recovery assuming that the final record isn't lost.
1010 : */
1011 1650 : abortedRecPtr = InvalidXLogRecPtr;
1012 1650 : missingContrecPtr = InvalidXLogRecPtr;
1013 :
1014 1650 : *wasShutdown_ptr = wasShutdown;
1015 1650 : *haveBackupLabel_ptr = haveBackupLabel;
1016 1650 : *haveTblspcMap_ptr = haveTblspcMap;
1017 1650 : }
1018 :
1019 : /*
1020 : * readRecoverySignalFile: check for recovery signal files and set
1021 : * StandbyModeRequested and ArchiveRecoveryRequested to match what was found.
1022 : *
1023 : * Returns immediately in bootstrap mode. If an old-style recovery command
1024 : * file (recovery.conf) exists, report FATAL: as of PG12 it is not supported.
1025 : */
1026 : static void
1027 1650 : readRecoverySignalFile(void)
1028 : {
1029 : struct stat stat_buf;
1030 :
1031 1650 : if (IsBootstrapProcessingMode())
1032 1448 : return;
1033 :
1034 : /*
1035 : * Check for old recovery API file: recovery.conf
1036 : */
1037 1560 : if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1038 0 : ereport(FATAL,
1039 : (errcode_for_file_access(),
1040 : errmsg("using recovery command file \"%s\" is not supported",
1041 : RECOVERY_COMMAND_FILE)));
1042 :
1043 : /*
1044 : * Remove unused .done file, if present. Ignore if absent.
1045 : */
1046 1560 : unlink(RECOVERY_COMMAND_DONE);
1047 :
1048 : /*
1049 : * Check for recovery signal files and if found, fsync them since they
1050 : * represent server state information. We don't sweat too much about the
1051 : * possibility of fsync failure, however.
1052 : *
1053 : * If present, standby signal file takes precedence. If neither is present
1054 : * then we won't enter archive recovery.
1055 : */
1056 1560 : if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1057 : {
1058 : int fd;
1059 :
1060 192 : fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1061 : S_IRUSR | S_IWUSR);
1062 192 : if (fd >= 0)
1063 : {
1064 192 : (void) pg_fsync(fd);
1065 192 : close(fd);
1066 : }
1067 192 : standby_signal_file_found = true;
1068 : }
1069 1368 : else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1070 : {
1071 : int fd;
1072 :
1073 10 : fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1074 : S_IRUSR | S_IWUSR);
1075 10 : if (fd >= 0)
1076 : {
1077 10 : (void) pg_fsync(fd);
1078 10 : close(fd);
1079 : }
1080 10 : recovery_signal_file_found = true;
1081 : }
1082 : /* Translate the signal files found into the requested recovery mode; with neither, there is nothing more to do. */
1083 1560 : StandbyModeRequested = false;
1084 1560 : ArchiveRecoveryRequested = false;
1085 1560 : if (standby_signal_file_found)
1086 : {
1087 192 : StandbyModeRequested = true;
1088 192 : ArchiveRecoveryRequested = true;
1089 : }
1090 1368 : else if (recovery_signal_file_found)
1091 : {
1092 10 : StandbyModeRequested = false;
1093 10 : ArchiveRecoveryRequested = true;
1094 : }
1095 : else
1096 1358 : return;
1097 :
1098 : /*
1099 : * We don't support standby mode in standalone backends; that requires
1100 : * other processes such as the WAL receiver to be alive.
1101 : */
1102 202 : if (StandbyModeRequested && !IsUnderPostmaster)
1103 0 : ereport(FATAL,
1104 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1105 : errmsg("standby mode is not supported by single-user servers")));
1106 : }
1107 : /* validateRecoveryParameters: check and finalize recovery settings; returns at once unless archive recovery was requested. */
1108 : static void
1109 1650 : validateRecoveryParameters(void)
1110 : {
1111 1650 : if (!ArchiveRecoveryRequested)
1112 1448 : return;
1113 :
1114 : /*
1115 : * Check for compulsory parameters. Standby mode merely warns when neither primary_conninfo nor restore_command is set; otherwise restore_command is required.
1116 : */
1117 202 : if (StandbyModeRequested)
1118 : {
1119 192 : if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1120 18 : (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1121 4 : ereport(WARNING,
1122 : (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1123 : errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1124 : }
1125 : else
1126 : {
1127 10 : if (recoveryRestoreCommand == NULL ||
1128 10 : strcmp(recoveryRestoreCommand, "") == 0)
1129 0 : ereport(FATAL,
1130 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1131 : errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1132 : }
1133 :
1134 : /*
1135 : * Override any inconsistent requests: pausing at the recovery target is
1136 : * changed to shutting down when hot_standby = off. This is a change of behaviour
1137 : * in 9.5; prior to this we simply ignored the pause request, which was surprising.
1138 : */
1139 202 : if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
1140 188 : !EnableHotStandby)
1141 4 : recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
1142 :
1143 : /*
1144 : * Final parsing of recovery_target_time string; see also
1145 : * check_recovery_target_time().
1146 : */
1147 202 : if (recoveryTarget == RECOVERY_TARGET_TIME)
1148 : {
1149 0 : recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
1150 : CStringGetDatum(recovery_target_time_string),
1151 : ObjectIdGetDatum(InvalidOid),
1152 : Int32GetDatum(-1)));
1153 : }
1154 :
1155 : /*
1156 : * If user specified recovery_target_timeline, validate it or compute the
1157 : * "latest" value. We can't do this until after we've gotten the restore
1158 : * command and set InArchiveRecovery, because we need to fetch timeline
1159 : * history files from the archive.
1160 : */
1161 202 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
1162 : {
1163 0 : TimeLineID rtli = recoveryTargetTLIRequested;
1164 :
1165 : /* Timeline 1 does not have a history file, all else should */
1166 0 : if (rtli != 1 && !existsTimeLineHistory(rtli))
1167 0 : ereport(FATAL,
1168 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1169 : errmsg("recovery target timeline %u does not exist",
1170 : rtli)));
1171 0 : recoveryTargetTLI = rtli;
1172 : }
1173 202 : else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
1174 : {
1175 : /* We start the "latest" search from pg_control's timeline */
1176 202 : recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
1177 : }
1178 : else
1179 : {
1180 : /*
1181 : * else we just use the recoveryTargetTLI as already read from
1182 : * ControlFile
1183 : */
1184 : Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
1185 : }
1186 : }
1187 :
1188 : /*
1189 : * read_backup_label: check to see if a backup_label file is present
1190 : *
1191 : * If we see a backup_label during recovery, we assume that we are recovering
1192 : * from a backup dump file, and we therefore roll forward from the checkpoint
1193 : * identified by the label file, NOT what pg_control says. This avoids the
1194 : * problem that pg_control might have been archived one or more checkpoints
1195 : * later than the start of the dump, and so if we rely on it as the start
1196 : * point, we will fail to restore a consistent database state.
1197 : *
1198 : * Returns true if a backup_label was found (and fills the checkpoint
1199 : * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1200 : * returns false if not. If this backup_label came from a streamed backup,
1201 : * *backupEndRequired is set to true. If this backup_label was created during
1202 : * recovery, *backupFromStandby is set to true.
1203 : *
1204 : * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN and
1205 : * TLI read from the backup file. Invalid mandatory lines, a START TIMELINE mismatch, or an INCREMENTAL FROM LSN line raise FATAL.
1206 : */
1207 : static bool
1208 1650 : read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1209 : bool *backupEndRequired, bool *backupFromStandby)
1210 : {
1211 : char startxlogfilename[MAXFNAMELEN];
1212 : TimeLineID tli_from_walseg,
1213 : tli_from_file;
1214 : FILE *lfp;
1215 : char ch;
1216 : char backuptype[20];
1217 : char backupfrom[20];
1218 : char backuplabel[MAXPGPATH];
1219 : char backuptime[128];
1220 : uint32 hi,
1221 : lo;
1222 :
1223 : /* suppress possible uninitialized-variable warnings */
1224 1650 : *checkPointLoc = InvalidXLogRecPtr;
1225 1650 : *backupLabelTLI = 0;
1226 1650 : *backupEndRequired = false;
1227 1650 : *backupFromStandby = false;
1228 :
1229 : /*
1230 : * See if label file is present
1231 : */
1232 1650 : lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1233 1650 : if (!lfp)
1234 : {
1235 1514 : if (errno != ENOENT)
1236 0 : ereport(FATAL,
1237 : (errcode_for_file_access(),
1238 : errmsg("could not read file \"%s\": %m",
1239 : BACKUP_LABEL_FILE)));
1240 1514 : return false; /* it's not there, all is fine */
1241 : }
1242 :
1243 : /*
1244 : * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1245 : * is pretty crude, but we are not expecting any variability in the file
1246 : * format).
1247 : */
1248 136 : if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1249 136 : &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1250 0 : ereport(FATAL,
1251 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1252 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1253 136 : RedoStartLSN = ((uint64) hi) << 32 | lo;
1254 136 : RedoStartTLI = tli_from_walseg; /* the first 8 hex digits of the WAL file name */
1255 136 : if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
1256 136 : &hi, &lo, &ch) != 3 || ch != '\n')
1257 0 : ereport(FATAL,
1258 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1259 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1260 136 : *checkPointLoc = ((uint64) hi) << 32 | lo;
1261 136 : *backupLabelTLI = tli_from_walseg;
1262 :
1263 : /*
1264 : * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1265 : * which could mean either pg_basebackup or the pg_backup_start/stop
1266 : * method was used) or if this label came from somewhere else (the only
1267 : * other option today being from pg_rewind). If this was a streamed
1268 : * backup then we know that we need to play through until we get to the
1269 : * end of the WAL which was generated during the backup (at which point we
1270 : * will have reached consistency and backupEndRequired will be reset to be
1271 : * false).
1272 : */
1273 136 : if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1274 : {
1275 136 : if (strcmp(backuptype, "streamed") == 0)
1276 134 : *backupEndRequired = true;
1277 : }
1278 :
1279 : /*
1280 : * BACKUP FROM lets us know if this was from a primary or a standby. If
1281 : * it was from a standby, we'll double-check that the control file state
1282 : * matches that of a standby.
1283 : */
1284 136 : if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1285 : {
1286 136 : if (strcmp(backupfrom, "standby") == 0)
1287 8 : *backupFromStandby = true;
1288 : }
1289 :
1290 : /*
1291 : * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1292 : * but checking for their presence is useful for debugging and the next
1293 : * sanity checks. Bear in mind that the result buffers have a fixed
1294 : * size, so if the backup_label file has been generated with strings
1295 : * longer than the maximum assumed here, they are parsed incorrectly.
1296 : * That's fine as only minor consistency checks are done
1297 : * afterwards.
1298 : */
1299 136 : if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1300 136 : ereport(DEBUG1,
1301 : (errmsg_internal("backup time %s in file \"%s\"",
1302 : backuptime, BACKUP_LABEL_FILE)));
1303 :
1304 136 : if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1305 134 : ereport(DEBUG1,
1306 : (errmsg_internal("backup label %s in file \"%s\"",
1307 : backuplabel, BACKUP_LABEL_FILE)));
1308 :
1309 : /*
1310 : * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1311 : * it as a sanity check if present.
1312 : */
1313 136 : if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1314 : {
1315 134 : if (tli_from_walseg != tli_from_file)
1316 0 : ereport(FATAL,
1317 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1318 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1319 : errdetail("Timeline ID parsed is %u, but expected %u.",
1320 : tli_from_file, tli_from_walseg)));
1321 :
1322 134 : ereport(DEBUG1,
1323 : (errmsg_internal("backup timeline %u in file \"%s\"",
1324 : tli_from_file, BACKUP_LABEL_FILE)));
1325 : }
1326 : /* An INCREMENTAL FROM LSN line marks an incremental backup, which cannot be used as a data directory directly. */
1327 136 : if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0)
1328 0 : ereport(FATAL,
1329 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1330 : errmsg("this is an incremental backup, not a data directory"),
1331 : errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1332 :
1333 136 : if (ferror(lfp) || FreeFile(lfp))
1334 0 : ereport(FATAL,
1335 : (errcode_for_file_access(),
1336 : errmsg("could not read file \"%s\": %m",
1337 : BACKUP_LABEL_FILE)));
1338 :
1339 136 : return true;
1340 : }
1341 :
1342 : /*
1343 : * read_tablespace_map: check to see if a tablespace_map file is present
1344 : *
1345 : * If we see a tablespace_map file during recovery, we assume that we are
1346 : * recovering from a backup dump file, and we therefore need to create symlinks
1347 : * as per the information present in tablespace_map file.
1348 : *
1349 : * Returns true if a tablespace_map file was found (and appends to *tablespaces
1350 : * a tablespaceinfo struct for each tablespace listed in the file);
1351 : * returns false if not. Malformed file contents are reported as FATAL.
1352 : */
1353 : static bool
1354 136 : read_tablespace_map(List **tablespaces)
1355 : {
1356 : tablespaceinfo *ti;
1357 : FILE *lfp;
1358 : char str[MAXPGPATH];
1359 : int ch,
1360 : i,
1361 : n;
1362 : bool was_backslash;
1363 :
1364 : /*
1365 : * See if tablespace_map file is present
1366 : */
1367 136 : lfp = AllocateFile(TABLESPACE_MAP, "r");
1368 136 : if (!lfp)
1369 : {
1370 132 : if (errno != ENOENT)
1371 0 : ereport(FATAL,
1372 : (errcode_for_file_access(),
1373 : errmsg("could not read file \"%s\": %m",
1374 : TABLESPACE_MAP)));
1375 132 : return false; /* it's not there, all is fine */
1376 : }
1377 :
1378 : /*
1379 : * Read and parse the link name and path lines from tablespace_map file
1380 : * (this code is pretty crude, but we are not expecting any variability in
1381 : * the file format). De-escape any backslashes that were inserted.
1382 : */
1383 4 : i = 0;
1384 4 : was_backslash = false;
1385 154 : while ((ch = fgetc(lfp)) != EOF)
1386 : {
1387 150 : if (!was_backslash && (ch == '\n' || ch == '\r'))
1388 : {
1389 : char *endp;
1390 :
1391 4 : if (i == 0)
1392 0 : continue; /* \r immediately followed by \n */
1393 :
1394 : /*
1395 : * The de-escaped line should contain an OID followed by exactly
1396 : * one space followed by a path. The path might start with
1397 : * spaces, so don't be too liberal about parsing.
1398 : */
1399 4 : str[i] = '\0';
1400 4 : n = 0;
1401 24 : while (str[n] && str[n] != ' ')
1402 20 : n++;
1403 4 : if (n < 1 || n >= i - 1)
1404 0 : ereport(FATAL,
1405 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1406 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1407 4 : str[n++] = '\0';
1408 : /* Build the list entry: the OID is the text before the space, the path is everything after it. */
1409 4 : ti = palloc0(sizeof(tablespaceinfo));
1410 4 : errno = 0;
1411 4 : ti->oid = strtoul(str, &endp, 10);
1412 4 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1413 0 : ereport(FATAL,
1414 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1415 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1416 4 : ti->path = pstrdup(str + n);
1417 4 : *tablespaces = lappend(*tablespaces, ti);
1418 :
1419 4 : i = 0;
1420 4 : continue;
1421 : }
1422 146 : else if (!was_backslash && ch == '\\')
1423 0 : was_backslash = true;
1424 : else
1425 : {
1426 146 : if (i < sizeof(str) - 1) /* characters beyond the buffer size are silently dropped */
1427 146 : str[i++] = ch;
1428 146 : was_backslash = false;
1429 : }
1430 : }
1431 :
1432 4 : if (i != 0 || was_backslash) /* last line not terminated? */
1433 0 : ereport(FATAL,
1434 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1435 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1436 :
1437 4 : if (ferror(lfp) || FreeFile(lfp))
1438 0 : ereport(FATAL,
1439 : (errcode_for_file_access(),
1440 : errmsg("could not read file \"%s\": %m",
1441 : TABLESPACE_MAP)));
1442 :
1443 4 : return true;
1444 : }
1445 :
1446 : /*
1447 : * Finish WAL recovery.
1448 : *
1449 : * This does not close the 'xlogreader' yet, because in some cases the caller
1450 : * still wants to re-read the last checkpoint record by calling
1451 : * ReadCheckpointRecord().
1452 : *
1453 : * Returns the position of the last valid or applied record, after which new
1454 : * WAL should be appended, information about why recovery was ended, and some
1455 : * other things. See the EndOfWalRecoveryInfo struct for details. The result is palloc'd.
1456 : */
1457 : EndOfWalRecoveryInfo *
1458 1544 : FinishWalRecovery(void)
1459 : {
1460 1544 : EndOfWalRecoveryInfo *result = palloc(sizeof(EndOfWalRecoveryInfo));
1461 : XLogRecPtr lastRec;
1462 : TimeLineID lastRecTLI;
1463 : XLogRecPtr endOfLog;
1464 :
1465 : /*
1466 : * Kill WAL receiver, if it's still running, before we continue to write
1467 : * the startup checkpoint and aborted-contrecord records. It will trample
1468 : * over these records and subsequent ones if it's still alive when we
1469 : * start writing WAL.
1470 : */
1471 1544 : XLogShutdownWalRcv();
1472 :
1473 : /*
1474 : * Shutdown the slot sync worker to drop any temporary slots acquired by
1475 : * it and to prevent it from continuing to try to fetch the failover slots.
1476 : *
1477 : * We do not update the 'synced' column in 'pg_replication_slots' system
1478 : * view from true to false here, as any failed update could leave 'synced'
1479 : * column false for some slots. This could cause issues during slot sync
1480 : * after restarting the server as a standby. While updating the 'synced'
1481 : * column after switching to the new timeline is an option, it does not
1482 : * simplify the handling for the 'synced' column. Therefore, we retain the
1483 : * 'synced' column as true after promotion as it may provide useful
1484 : * information about the slot origin.
1485 : */
1486 1544 : ShutDownSlotSync();
1487 :
1488 : /*
1489 : * We are now done reading the xlog from stream. Turn off streaming
1490 : * recovery to force fetching the files (which would be required at end of
1491 : * recovery, e.g., timeline history file) from archive or pg_wal.
1492 : *
1493 : * Note that standby mode must be turned off after killing WAL receiver,
1494 : * i.e., calling XLogShutdownWalRcv().
1495 : */
1496 : Assert(!WalRcvStreaming());
1497 1544 : StandbyMode = false;
1498 :
1499 : /*
1500 : * Determine where to start writing WAL next.
1501 : *
1502 : * Re-fetch the last valid or last applied record, so we can identify the
1503 : * exact endpoint of what we consider the valid portion of WAL. There may
1504 : * be an incomplete continuation record after that, in which case
1505 : * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1506 : * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1507 : * it is intentionally missing. See CreateOverwriteContrecordRecord().
1508 : *
1509 : * An important side-effect of this is to load the last page into
1510 : * xlogreader. The caller uses it to initialize the WAL for writing.
1511 : */
1512 1544 : if (!InRecovery)
1513 : {
1514 1240 : lastRec = CheckPointLoc;
1515 1240 : lastRecTLI = CheckPointTLI;
1516 : }
1517 : else
1518 : {
1519 304 : lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
1520 304 : lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1521 : }
1522 1544 : XLogPrefetcherBeginRead(xlogprefetcher, lastRec);
1523 1544 : (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1524 1544 : endOfLog = xlogreader->EndRecPtr;
1525 :
1526 : /*
1527 : * Remember the TLI in the filename of the XLOG segment containing the
1528 : * end-of-log. It could be different from the timeline that endOfLog
1529 : * nominally belongs to, if there was a timeline switch in that segment,
1530 : * and we were reading the old WAL from a segment belonging to a higher
1531 : * timeline.
1532 : */
1533 1544 : result->endOfLogTLI = xlogreader->seg.ws_tli;
1534 :
1535 1544 : if (ArchiveRecoveryRequested)
1536 : {
1537 : /*
1538 : * We are no longer in archive recovery state.
1539 : *
1540 : * We are now done reading the old WAL. Turn off archive fetching if
1541 : * it was active.
1542 : */
1543 : Assert(InArchiveRecovery);
1544 96 : InArchiveRecovery = false;
1545 :
1546 : /*
1547 : * If the ending log segment is still open, close it (to avoid
1548 : * problems on Windows with trying to rename or delete an open file).
1549 : */
1550 96 : if (readFile >= 0)
1551 : {
1552 96 : close(readFile);
1553 96 : readFile = -1;
1554 : }
1555 : }
1556 :
1557 : /*
1558 : * Copy the last partial block to the caller, for initializing the WAL
1559 : * buffer for appending new WAL.
1560 : */
1561 1544 : if (endOfLog % XLOG_BLCKSZ != 0)
1562 : {
1563 : char *page;
1564 : int len;
1565 : XLogRecPtr pageBeginPtr;
1566 :
1567 1510 : pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1568 : Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1569 :
1570 : /* Copy the valid part of the last block */
1571 1510 : len = endOfLog % XLOG_BLCKSZ;
1572 1510 : page = palloc(len);
1573 1510 : memcpy(page, xlogreader->readBuf, len);
1574 :
1575 1510 : result->lastPageBeginPtr = pageBeginPtr;
1576 1510 : result->lastPage = page;
1577 : }
1578 : else
1579 : {
1580 : /* There is no partial block to copy. */
1581 34 : result->lastPageBeginPtr = endOfLog;
1582 34 : result->lastPage = NULL;
1583 : }
1584 :
1585 : /*
1586 : * Create a comment for the history file to explain why and where timeline
1587 : * changed.
1588 : */
1589 1544 : result->recoveryStopReason = getRecoveryStopReason();
1590 :
1591 1544 : result->lastRec = lastRec;
1592 1544 : result->lastRecTLI = lastRecTLI;
1593 1544 : result->endOfLog = endOfLog;
1594 :
1595 1544 : result->abortedRecPtr = abortedRecPtr;
1596 1544 : result->missingContrecPtr = missingContrecPtr;
1597 :
1598 1544 : result->standby_signal_file_found = standby_signal_file_found;
1599 1544 : result->recovery_signal_file_found = recovery_signal_file_found;
1600 :
1601 1544 : return result;
1602 : }
1603 :
1604 : /*
1605 : * Clean up the WAL reader and prefetcher, and leftovers from restoring WAL from archive. The leftover files and the recovery latch are only dealt with if archive recovery was requested.
1606 : */
1607 : void
1608 1544 : ShutdownWalRecovery(void)
1609 : {
1610 : char recoveryPath[MAXPGPATH];
1611 :
1612 : /* Final update of pg_stat_recovery_prefetch. */
1613 1544 : XLogPrefetcherComputeStats(xlogprefetcher);
1614 :
1615 : /* Shut down xlogreader */
1616 1544 : if (readFile >= 0)
1617 : {
1618 1448 : close(readFile);
1619 1448 : readFile = -1;
1620 : }
1621 1544 : XLogReaderFree(xlogreader);
1622 1544 : XLogPrefetcherFree(xlogprefetcher);
1623 :
1624 1544 : if (ArchiveRecoveryRequested)
1625 : {
1626 : /*
1627 : * Since there might be a partial WAL segment named RECOVERYXLOG, get
1628 : * rid of it.
1629 : */
1630 96 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1631 96 : unlink(recoveryPath); /* ignore any error */
1632 :
1633 : /* Get rid of any remaining recovered timeline-history file, too */
1634 96 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1635 96 : unlink(recoveryPath); /* ignore any error */
1636 : }
1637 :
1638 : /*
1639 : * We don't need the latch anymore. It's not strictly necessary to disown
1640 : * it, but let's do it for the sake of tidiness.
1641 : */
1642 1544 : if (ArchiveRecoveryRequested)
1643 96 : DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
1644 1544 : }
1645 :
/*
 * Perform WAL recovery.
 *
 * If the system was shut down cleanly, this is never called.
 *
 * Overview of what the function does, in order:
 *
 * 1. Seeds the shared replay-progress fields in XLogRecoveryCtl (under
 *    info_lck) as if the record just before the REDO point had been
 *    replayed.
 * 2. Signals the postmaster that redo has started and runs an initial
 *    consistency check.
 * 3. Reads the first record to replay.  If there is one, runs the main redo
 *    loop: each iteration handles interrupts and pause requests, checks the
 *    "stop before" recovery target, honours any apply delay, applies the
 *    record through ApplyWalRecord(), checks the "stop after" recovery
 *    target, and fetches the next record.
 * 4. If a recovery target was reached, acts on recoveryTargetAction.
 * 5. Logs how far redo went, and finally raises FATAL if archive recovery
 *    was requested with a recovery target that was never reached.
 */
void
PerformWalRecovery(void)
{
	XLogRecord *record;
	bool		reachedRecoveryTarget = false;
	TimeLineID	replayTLI;

	/*
	 * Initialize shared variables for tracking progress of WAL replay, as if
	 * we had just replayed the record before the REDO location (or the
	 * checkpoint record itself, if it's a shutdown checkpoint).
	 */
	SpinLockAcquire(&XLogRecoveryCtl->info_lck);
	if (RedoStartLSN < CheckPointLoc)
	{
		/* The REDO point lies before the checkpoint record. */
		XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
		XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
		XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
	}
	else
	{
		/* Use the reader's current position and the checkpoint's TLI. */
		XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
		XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
		XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
	}
	/* The "replay end" position starts out equal to "last replayed". */
	XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
	XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
	XLogRecoveryCtl->recoveryLastXTime = 0;
	XLogRecoveryCtl->currentChunkStartTime = 0;
	XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
	SpinLockRelease(&XLogRecoveryCtl->info_lck);

	/* Also ensure XLogReceiptTime has a sane value */
	XLogReceiptTime = GetCurrentTimestamp();

	/*
	 * Let postmaster know we've started redo now, so that it can launch the
	 * archiver if necessary.
	 */
	if (IsUnderPostmaster)
		SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);

	/*
	 * Allow read-only connections immediately if we're consistent already.
	 */
	CheckRecoveryConsistency();

	/*
	 * Find the first record that logically follows the checkpoint --- it
	 * might physically precede it, though.
	 */
	if (RedoStartLSN < CheckPointLoc)
	{
		/* back up to find the record */
		replayTLI = RedoStartTLI;
		XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);

		/*
		 * NOTE(review): the result is dereferenced below without a NULL
		 * check; presumably ReadRecord() cannot return NULL when called with
		 * PANIC as its error level -- confirm.
		 */
		record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);

		/*
		 * If a checkpoint record's redo pointer points back to an earlier
		 * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
		 * record.
		 */
		if (record->xl_rmid != RM_XLOG_ID ||
			(record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
			ereport(FATAL,
					(errmsg("unexpected record type found at redo point %X/%X",
							LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
	}
	else
	{
		/* just have to read next record after CheckPoint */
		Assert(xlogreader->ReadRecPtr == CheckPointLoc);
		replayTLI = CheckPointTLI;
		record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
	}

	if (record != NULL)
	{
		TimestampTz xtime;
		PGRUsage	ru0;

		/* Captured here and reported in the "redo done" message below. */
		pg_rusage_init(&ru0);

		/* InRedo stays set until the "redo done" messages have been logged. */
		InRedo = true;

		RmgrStartup();

		ereport(LOG,
				(errmsg("redo starts at %X/%X",
						LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));

		/* Prepare to report progress of the redo phase. */
		if (!StandbyMode)
			begin_startup_progress_phase();

		/*
		 * main redo apply loop
		 */
		do
		{
			/* Progress reporting is only done outside standby mode. */
			if (!StandbyMode)
				ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
										 LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));

#ifdef WAL_DEBUG
			/* Optionally dump a description of the record about to be replayed. */
			if (XLOG_DEBUG)
			{
				StringInfoData buf;

				initStringInfo(&buf);
				appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
								 LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
								 LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
				xlog_outrec(&buf, xlogreader);
				appendStringInfoString(&buf, " - ");
				xlog_outdesc(&buf, xlogreader);
				elog(LOG, "%s", buf.data);
				pfree(buf.data);
			}
#endif

			/* Handle interrupt signals of startup process */
			HandleStartupProcInterrupts();

			/*
			 * Pause WAL replay, if requested by a hot-standby session via
			 * SetRecoveryPause().
			 *
			 * Note that we intentionally don't take the info_lck spinlock
			 * here.  We might therefore read a slightly stale value of the
			 * recoveryPause flag, but it can't be very stale (no worse than
			 * the last spinlock we did acquire).  Since a pause request is a
			 * pretty asynchronous thing anyway, possibly responding to it one
			 * WAL record later than we otherwise would is a minor issue, so
			 * it doesn't seem worth adding another spinlock cycle to prevent
			 * that.
			 */
			if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
				RECOVERY_NOT_PAUSED)
				recoveryPausesHere(false);

			/*
			 * Have we reached our recovery target?  If so, the current record
			 * is not applied.
			 */
			if (recoveryStopsBefore(xlogreader))
			{
				reachedRecoveryTarget = true;
				break;
			}

			/*
			 * If we've been asked to lag the primary, wait on latch until
			 * enough time has passed.
			 */
			if (recoveryApplyDelay(xlogreader))
			{
				/*
				 * We test for paused recovery again here. If user sets
				 * delayed apply, it may be because they expect to pause
				 * recovery in case of problems, so we must test again here
				 * otherwise pausing during the delay-wait wouldn't work.
				 */
				if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
					RECOVERY_NOT_PAUSED)
					recoveryPausesHere(false);
			}

			/*
			 * Apply the record.  replayTLI is passed by reference because
			 * ApplyWalRecord() updates it when the record switches timelines.
			 */
			ApplyWalRecord(xlogreader, record, &replayTLI);

			/* Exit loop if we reached inclusive recovery target */
			if (recoveryStopsAfter(xlogreader))
			{
				reachedRecoveryTarget = true;
				break;
			}

			/* Else, try to fetch the next WAL record */
			record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
		} while (record != NULL);

		/*
		 * end of main redo apply loop
		 *
		 * We get here either because ReadRecord() returned NULL or because
		 * one of the recovery-target checks broke out of the loop.
		 */

		if (reachedRecoveryTarget)
		{
			if (!reachedConsistency)
				ereport(FATAL,
						(errmsg("requested recovery stop point is before consistent recovery point")));

			/*
			 * This is the last point where we can restart recovery with a new
			 * recovery target, if we shutdown and begin again. After this,
			 * Resource Managers may choose to do permanent corrective actions
			 * at end of recovery.
			 */
			switch (recoveryTargetAction)
			{
				case RECOVERY_TARGET_ACTION_SHUTDOWN:

					/*
					 * exit with special return code to request shutdown of
					 * postmaster.  Log messages issued from postmaster.
					 *
					 * NOTE(review): there is no break after this call;
					 * presumably proc_exit() never returns -- confirm.
					 */
					proc_exit(3);

				case RECOVERY_TARGET_ACTION_PAUSE:
					SetRecoveryPause(true);
					recoveryPausesHere(true);

					/* drop into promote */

				case RECOVERY_TARGET_ACTION_PROMOTE:
					break;
			}
		}

		RmgrCleanup();

		ereport(LOG,
				(errmsg("redo done at %X/%X system usage: %s",
						LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
						pg_rusage_show(&ru0))));
		/* Only log the last transaction time if one is available (nonzero). */
		xtime = GetLatestXTime();
		if (xtime)
			ereport(LOG,
					(errmsg("last completed transaction was at log time %s",
							timestamptz_to_str(xtime))));

		InRedo = false;
	}
	else
	{
		/* there are no WAL records following the checkpoint */
		ereport(LOG,
				(errmsg("redo is not required")));
	}

	/*
	 * This check is intentionally after the above log messages that indicate
	 * how far recovery went.
	 */
	if (ArchiveRecoveryRequested &&
		recoveryTarget != RECOVERY_TARGET_UNSET &&
		!reachedRecoveryTarget)
		ereport(FATAL,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("recovery ended before configured recovery target was reached")));
}
1904 :
1905 : /*
1906 : * Subroutine of PerformWalRecovery, to apply one WAL record.
1907 : */
1908 : static void
1909 5319502 : ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
1910 : {
1911 : ErrorContextCallback errcallback;
1912 5319502 : bool switchedTLI = false;
1913 :
1914 : /* Setup error traceback support for ereport() */
1915 5319502 : errcallback.callback = rm_redo_error_callback;
1916 5319502 : errcallback.arg = xlogreader;
1917 5319502 : errcallback.previous = error_context_stack;
1918 5319502 : error_context_stack = &errcallback;
1919 :
1920 : /*
1921 : * TransamVariables->nextXid must be beyond record's xid.
1922 : */
1923 5319502 : AdvanceNextFullTransactionIdPastXid(record->xl_xid);
1924 :
1925 : /*
1926 : * Before replaying this record, check if this record causes the current
1927 : * timeline to change. The record is already considered to be part of the
1928 : * new timeline, so we update replayTLI before replaying it. That's
1929 : * important so that replayEndTLI, which is recorded as the minimum
1930 : * recovery point's TLI if recovery stops after this record, is set
1931 : * correctly.
1932 : */
1933 5319502 : if (record->xl_rmid == RM_XLOG_ID)
1934 : {
1935 79184 : TimeLineID newReplayTLI = *replayTLI;
1936 79184 : TimeLineID prevReplayTLI = *replayTLI;
1937 79184 : uint8 info = record->xl_info & ~XLR_INFO_MASK;
1938 :
1939 79184 : if (info == XLOG_CHECKPOINT_SHUTDOWN)
1940 : {
1941 : CheckPoint checkPoint;
1942 :
1943 68 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1944 68 : newReplayTLI = checkPoint.ThisTimeLineID;
1945 68 : prevReplayTLI = checkPoint.PrevTimeLineID;
1946 : }
1947 79116 : else if (info == XLOG_END_OF_RECOVERY)
1948 : {
1949 : xl_end_of_recovery xlrec;
1950 :
1951 20 : memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1952 20 : newReplayTLI = xlrec.ThisTimeLineID;
1953 20 : prevReplayTLI = xlrec.PrevTimeLineID;
1954 : }
1955 :
1956 79184 : if (newReplayTLI != *replayTLI)
1957 : {
1958 : /* Check that it's OK to switch to this TLI */
1959 22 : checkTimeLineSwitch(xlogreader->EndRecPtr,
1960 : newReplayTLI, prevReplayTLI, *replayTLI);
1961 :
1962 : /* Following WAL records should be run with new TLI */
1963 22 : *replayTLI = newReplayTLI;
1964 22 : switchedTLI = true;
1965 : }
1966 : }
1967 :
1968 : /*
1969 : * Update shared replayEndRecPtr before replaying this record, so that
1970 : * XLogFlush will update minRecoveryPoint correctly.
1971 : */
1972 5319502 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1973 5319502 : XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
1974 5319502 : XLogRecoveryCtl->replayEndTLI = *replayTLI;
1975 5319502 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1976 :
1977 : /*
1978 : * If we are attempting to enter Hot Standby mode, process XIDs we see
1979 : */
1980 5319502 : if (standbyState >= STANDBY_INITIALIZED &&
1981 4831590 : TransactionIdIsValid(record->xl_xid))
1982 4745236 : RecordKnownAssignedTransactionIds(record->xl_xid);
1983 :
1984 : /*
1985 : * Some XLOG record types that are related to recovery are processed
1986 : * directly here, rather than in xlog_redo()
1987 : */
1988 5319502 : if (record->xl_rmid == RM_XLOG_ID)
1989 79184 : xlogrecovery_redo(xlogreader, *replayTLI);
1990 :
1991 : /* Now apply the WAL record itself */
1992 5319502 : GetRmgr(record->xl_rmid).rm_redo(xlogreader);
1993 :
1994 : /*
1995 : * After redo, check whether the backup pages associated with the WAL
1996 : * record are consistent with the existing pages. This check is done only
1997 : * if consistency check is enabled for this record.
1998 : */
1999 5319498 : if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2000 4178584 : verifyBackupPageConsistency(xlogreader);
2001 :
2002 : /* Pop the error context stack */
2003 5319498 : error_context_stack = errcallback.previous;
2004 :
2005 : /*
2006 : * Update lastReplayedEndRecPtr after this record has been successfully
2007 : * replayed.
2008 : */
2009 5319498 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2010 5319498 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
2011 5319498 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
2012 5319498 : XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2013 5319498 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2014 :
2015 : /* ------
2016 : * Wakeup walsenders:
2017 : *
2018 : * On the standby, the WAL is flushed first (which will only wake up
2019 : * physical walsenders) and then applied, which will only wake up logical
2020 : * walsenders.
2021 : *
2022 : * Indeed, logical walsenders on standby can't decode and send data until
2023 : * it's been applied.
2024 : *
2025 : * Physical walsenders don't need to be woken up during replay unless
2026 : * cascading replication is allowed and time line change occurred (so that
2027 : * they can notice that they are on a new time line).
2028 : *
2029 : * That's why the wake up conditions are for:
2030 : *
2031 : * - physical walsenders in case of new time line and cascade
2032 : * replication is allowed
2033 : * - logical walsenders in case cascade replication is allowed (could not
2034 : * be created otherwise)
2035 : * ------
2036 : */
2037 5319498 : if (AllowCascadeReplication())
2038 4941144 : WalSndWakeup(switchedTLI, true);
2039 :
2040 : /*
2041 : * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2042 : * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2043 : * a reply to the primary.
2044 : */
2045 5319498 : if (doRequestWalReceiverReply)
2046 : {
2047 4 : doRequestWalReceiverReply = false;
2048 4 : WalRcvForceReply();
2049 : }
2050 :
2051 : /* Allow read-only connections if we're consistent now */
2052 5319498 : CheckRecoveryConsistency();
2053 :
2054 : /* Is this a timeline switch? */
2055 5319498 : if (switchedTLI)
2056 : {
2057 : /*
2058 : * Before we continue on the new timeline, clean up any (possibly
2059 : * bogus) future WAL segments on the old timeline.
2060 : */
2061 22 : RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
2062 :
2063 : /* Reset the prefetcher. */
2064 22 : XLogPrefetchReconfigure();
2065 : }
2066 5319498 : }
2067 :
2068 : /*
2069 : * Some XLOG RM record types that are directly related to WAL recovery are
2070 : * handled here rather than in the xlog_redo()
2071 : */
2072 : static void
2073 79184 : xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
2074 : {
2075 79184 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2076 79184 : XLogRecPtr lsn = record->EndRecPtr;
2077 :
2078 : Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2079 :
2080 79184 : if (info == XLOG_OVERWRITE_CONTRECORD)
2081 : {
2082 : /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2083 : xl_overwrite_contrecord xlrec;
2084 :
2085 2 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2086 2 : if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2087 0 : elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
2088 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2089 : LSN_FORMAT_ARGS(record->overwrittenRecPtr));
2090 :
2091 : /* We have safely skipped the aborted record */
2092 2 : abortedRecPtr = InvalidXLogRecPtr;
2093 2 : missingContrecPtr = InvalidXLogRecPtr;
2094 :
2095 2 : ereport(LOG,
2096 : (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
2097 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2098 : timestamptz_to_str(xlrec.overwrite_time))));
2099 :
2100 : /* Verifying the record should only happen once */
2101 2 : record->overwrittenRecPtr = InvalidXLogRecPtr;
2102 : }
2103 79182 : else if (info == XLOG_BACKUP_END)
2104 : {
2105 : XLogRecPtr startpoint;
2106 :
2107 162 : memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2108 :
2109 162 : if (backupStartPoint == startpoint)
2110 : {
2111 : /*
2112 : * We have reached the end of base backup, the point where
2113 : * pg_backup_stop() was done. The data on disk is now consistent
2114 : * (assuming we have also reached minRecoveryPoint). Set
2115 : * backupEndPoint to the current LSN, so that the next call to
2116 : * CheckRecoveryConsistency() will notice it and do the
2117 : * end-of-backup processing.
2118 : */
2119 132 : elog(DEBUG1, "end of backup record reached");
2120 :
2121 132 : backupEndPoint = lsn;
2122 : }
2123 : else
2124 30 : elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
2125 : LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
2126 : }
2127 79184 : }
2128 :
2129 : /*
2130 : * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2131 : * directories.
2132 : *
2133 : * Replay of database creation XLOG records for databases that were later
2134 : * dropped can create fake directories in pg_tblspc. By the time consistency
2135 : * is reached these directories should have been removed; here we verify
2136 : * that this did indeed happen. This is to be called at the point where
2137 : * consistent state is reached.
2138 : *
2139 : * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2140 : * useful for testing purposes, and also allows for an escape hatch in case
2141 : * things go south.
2142 : */
2143 : static void
2144 214 : CheckTablespaceDirectory(void)
2145 : {
2146 : DIR *dir;
2147 : struct dirent *de;
2148 :
2149 214 : dir = AllocateDir(PG_TBLSPC_DIR);
2150 656 : while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2151 : {
2152 : char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2153 :
2154 : /* Skip entries of non-oid names */
2155 442 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2156 428 : continue;
2157 :
2158 14 : snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2159 :
2160 14 : if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2161 8 : ereport(allow_in_place_tablespaces ? WARNING : PANIC,
2162 : (errcode(ERRCODE_DATA_CORRUPTED),
2163 : errmsg("unexpected directory entry \"%s\" found in %s",
2164 : de->d_name, PG_TBLSPC_DIR),
2165 : errdetail("All directory entries in %s/ should be symbolic links.",
2166 : PG_TBLSPC_DIR),
2167 : errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2168 : }
2169 214 : }
2170 :
2171 : /*
2172 : * Checks if recovery has reached a consistent state. When consistency is
2173 : * reached and we have a valid starting standby snapshot, tell postmaster
2174 : * that it can start accepting read-only connections.
2175 : */
2176 : static void
2177 5319912 : CheckRecoveryConsistency(void)
2178 : {
2179 : XLogRecPtr lastReplayedEndRecPtr;
2180 : TimeLineID lastReplayedTLI;
2181 :
2182 : /*
2183 : * During crash recovery, we don't reach a consistent state until we've
2184 : * replayed all the WAL.
2185 : */
2186 5319912 : if (XLogRecPtrIsInvalid(minRecoveryPoint))
2187 517806 : return;
2188 :
2189 : Assert(InArchiveRecovery);
2190 :
2191 : /*
2192 : * assume that we are called in the startup process, and hence don't need
2193 : * a lock to read lastReplayedEndRecPtr
2194 : */
2195 4802106 : lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2196 4802106 : lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2197 :
2198 : /*
2199 : * Have we reached the point where our base backup was completed?
2200 : */
2201 4802106 : if (!XLogRecPtrIsInvalid(backupEndPoint) &&
2202 198 : backupEndPoint <= lastReplayedEndRecPtr)
2203 : {
2204 136 : XLogRecPtr saveBackupStartPoint = backupStartPoint;
2205 136 : XLogRecPtr saveBackupEndPoint = backupEndPoint;
2206 :
2207 136 : elog(DEBUG1, "end of backup reached");
2208 :
2209 : /*
2210 : * We have reached the end of base backup, as indicated by pg_control.
2211 : * Update the control file accordingly.
2212 : */
2213 136 : ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2214 136 : backupStartPoint = InvalidXLogRecPtr;
2215 136 : backupEndPoint = InvalidXLogRecPtr;
2216 136 : backupEndRequired = false;
2217 :
2218 136 : ereport(LOG,
2219 : (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
2220 : LSN_FORMAT_ARGS(saveBackupStartPoint),
2221 : LSN_FORMAT_ARGS(saveBackupEndPoint))));
2222 : }
2223 :
2224 : /*
2225 : * Have we passed our safe starting point? Note that minRecoveryPoint is
2226 : * known to be incorrectly set if recovering from a backup, until the
2227 : * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2228 : * All we know prior to that is that we're not consistent yet.
2229 : */
2230 4802106 : if (!reachedConsistency && !backupEndRequired &&
2231 14294 : minRecoveryPoint <= lastReplayedEndRecPtr)
2232 : {
2233 : /*
2234 : * Check to see if the XLOG sequence contained any unresolved
2235 : * references to uninitialized pages.
2236 : */
2237 214 : XLogCheckInvalidPages();
2238 :
2239 : /*
2240 : * Check that pg_tblspc doesn't contain any real directories. Replay
2241 : * of Database/CREATE_* records may have created fictitious tablespace
2242 : * directories that should have been removed by the time consistency
2243 : * was reached.
2244 : */
2245 214 : CheckTablespaceDirectory();
2246 :
2247 214 : reachedConsistency = true;
2248 214 : ereport(LOG,
2249 : (errmsg("consistent recovery state reached at %X/%X",
2250 : LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
2251 : }
2252 :
2253 : /*
2254 : * Have we got a valid starting snapshot that will allow queries to be
2255 : * run? If so, we can tell postmaster that the database is consistent now,
2256 : * enabling connections.
2257 : */
2258 4802106 : if (standbyState == STANDBY_SNAPSHOT_READY &&
2259 4801678 : !LocalHotStandbyActive &&
2260 198 : reachedConsistency &&
2261 : IsUnderPostmaster)
2262 : {
2263 198 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2264 198 : XLogRecoveryCtl->SharedHotStandbyActive = true;
2265 198 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2266 :
2267 198 : LocalHotStandbyActive = true;
2268 :
2269 198 : SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
2270 : }
2271 : }
2272 :
2273 : /*
2274 : * Error context callback for errors occurring during rm_redo().
2275 : */
2276 : static void
2277 202 : rm_redo_error_callback(void *arg)
2278 : {
2279 202 : XLogReaderState *record = (XLogReaderState *) arg;
2280 : StringInfoData buf;
2281 :
2282 202 : initStringInfo(&buf);
2283 202 : xlog_outdesc(&buf, record);
2284 202 : xlog_block_info(&buf, record);
2285 :
2286 : /* translator: %s is a WAL record description */
2287 202 : errcontext("WAL redo at %X/%X for %s",
2288 202 : LSN_FORMAT_ARGS(record->ReadRecPtr),
2289 : buf.data);
2290 :
2291 202 : pfree(buf.data);
2292 202 : }
2293 :
2294 : /*
2295 : * Returns a string describing an XLogRecord, consisting of its identity
2296 : * optionally followed by a colon, a space, and a further description.
2297 : */
2298 : void
2299 202 : xlog_outdesc(StringInfo buf, XLogReaderState *record)
2300 : {
2301 202 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2302 202 : uint8 info = XLogRecGetInfo(record);
2303 : const char *id;
2304 :
2305 202 : appendStringInfoString(buf, rmgr.rm_name);
2306 202 : appendStringInfoChar(buf, '/');
2307 :
2308 202 : id = rmgr.rm_identify(info);
2309 202 : if (id == NULL)
2310 0 : appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2311 : else
2312 202 : appendStringInfo(buf, "%s: ", id);
2313 :
2314 202 : rmgr.rm_desc(buf, record);
2315 202 : }
2316 :
#ifdef WAL_DEBUG

/*
 * Append the record's previous-record pointer, xid, data length and block
 * references to 'buf'.  Only compiled in WAL_DEBUG builds.
 */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	/* link to the previous record, and the xid of the record */
	appendStringInfo(buf, "prev %X/%X; xid %u",
					 LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
					 XLogRecGetXid(record));

	/* length of the record's data */
	appendStringInfo(buf, "; len %u", XLogRecGetDataLen(record));

	/* one entry per referenced block */
	xlog_block_info(buf, record);
}
#endif							/* WAL_DEBUG */
2332 :
2333 : /*
2334 : * Returns a string giving information about all the blocks in an
2335 : * XLogRecord.
2336 : */
2337 : static void
2338 202 : xlog_block_info(StringInfo buf, XLogReaderState *record)
2339 : {
2340 : int block_id;
2341 :
2342 : /* decode block references */
2343 316 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2344 : {
2345 : RelFileLocator rlocator;
2346 : ForkNumber forknum;
2347 : BlockNumber blk;
2348 :
2349 114 : if (!XLogRecGetBlockTagExtended(record, block_id,
2350 : &rlocator, &forknum, &blk, NULL))
2351 0 : continue;
2352 :
2353 114 : if (forknum != MAIN_FORKNUM)
2354 6 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2355 : block_id,
2356 : rlocator.spcOid, rlocator.dbOid,
2357 : rlocator.relNumber,
2358 : forknum,
2359 : blk);
2360 : else
2361 108 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2362 : block_id,
2363 : rlocator.spcOid, rlocator.dbOid,
2364 : rlocator.relNumber,
2365 : blk);
2366 114 : if (XLogRecHasBlockImage(record, block_id))
2367 72 : appendStringInfoString(buf, " FPW");
2368 : }
2369 202 : }
2370 :
2371 :
2372 : /*
2373 : * Check that it's OK to switch to new timeline during recovery.
2374 : *
2375 : * 'lsn' is the address of the shutdown checkpoint record we're about to
2376 : * replay. (Currently, timeline can only change at a shutdown checkpoint).
2377 : */
2378 : static void
2379 22 : checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
2380 : TimeLineID replayTLI)
2381 : {
2382 : /* Check that the record agrees on what the current (old) timeline is */
2383 22 : if (prevTLI != replayTLI)
2384 0 : ereport(PANIC,
2385 : (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2386 : prevTLI, replayTLI)));
2387 :
2388 : /*
2389 : * The new timeline better be in the list of timelines we expect to see,
2390 : * according to the timeline history. It should also not decrease.
2391 : */
2392 22 : if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2393 0 : ereport(PANIC,
2394 : (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2395 : newTLI, replayTLI)));
2396 :
2397 : /*
2398 : * If we have not yet reached min recovery point, and we're about to
2399 : * switch to a timeline greater than the timeline of the min recovery
2400 : * point: trouble. After switching to the new timeline, we could not
2401 : * possibly visit the min recovery point on the correct timeline anymore.
2402 : * This can happen if there is a newer timeline in the archive that
2403 : * branched before the timeline the min recovery point is on, and you
2404 : * attempt to do PITR to the new timeline.
2405 : */
2406 22 : if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
2407 18 : lsn < minRecoveryPoint &&
2408 2 : newTLI > minRecoveryPointTLI)
2409 0 : ereport(PANIC,
2410 : (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2411 : newTLI,
2412 : LSN_FORMAT_ARGS(minRecoveryPoint),
2413 : minRecoveryPointTLI)));
2414 :
2415 : /* Looks good */
2416 22 : }
2417 :
2418 :
2419 : /*
2420 : * Extract timestamp from WAL record.
2421 : *
2422 : * If the record contains a timestamp, returns true, and saves the timestamp
2423 : * in *recordXtime. If the record type has no timestamp, returns false.
2424 : * Currently, only transaction commit/abort records and restore points contain
2425 : * timestamps.
2426 : */
2427 : static bool
2428 81120 : getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
2429 : {
2430 81120 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2431 81120 : uint8 xact_info = info & XLOG_XACT_OPMASK;
2432 81120 : uint8 rmid = XLogRecGetRmid(record);
2433 :
2434 81120 : if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2435 : {
2436 4 : *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2437 4 : return true;
2438 : }
2439 81116 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2440 : xact_info == XLOG_XACT_COMMIT_PREPARED))
2441 : {
2442 74476 : *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2443 74476 : return true;
2444 : }
2445 6640 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2446 : xact_info == XLOG_XACT_ABORT_PREPARED))
2447 : {
2448 6640 : *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2449 6640 : return true;
2450 : }
2451 0 : return false;
2452 : }
2453 :
2454 : /*
2455 : * Checks whether the current buffer page and backup page stored in the
2456 : * WAL record are consistent or not. Before comparing the two pages, a
2457 : * masking can be applied to the pages to ignore certain areas like hint bits,
2458 : * unused space between pd_lower and pd_upper among other things. This
2459 : * function should be called once WAL replay has been completed for a
2460 : * given record.
2461 : */
2462 : static void
2463 4178584 : verifyBackupPageConsistency(XLogReaderState *record)
2464 : {
2465 4178584 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2466 : RelFileLocator rlocator;
2467 : ForkNumber forknum;
2468 : BlockNumber blkno;
2469 : int block_id;
2470 :
2471 : /* Records with no backup blocks have no need for consistency checks. */
2472 4178584 : if (!XLogRecHasAnyBlockRefs(record))
2473 0 : return;
2474 :
2475 : Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2476 :
2477 8684232 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2478 : {
2479 : Buffer buf;
2480 : Page page;
2481 :
2482 4505648 : if (!XLogRecGetBlockTagExtended(record, block_id,
2483 : &rlocator, &forknum, &blkno, NULL))
2484 : {
2485 : /*
2486 : * WAL record doesn't contain a block reference with the given id.
2487 : * Do nothing.
2488 : */
2489 3944 : continue;
2490 : }
2491 :
2492 : Assert(XLogRecHasBlockImage(record, block_id));
2493 :
2494 4501704 : if (XLogRecBlockImageApply(record, block_id))
2495 : {
2496 : /*
2497 : * WAL record has already applied the page, so bypass the
2498 : * consistency check as that would result in comparing the full
2499 : * page stored in the record with itself.
2500 : */
2501 38578 : continue;
2502 : }
2503 :
2504 : /*
2505 : * Read the contents from the current buffer and store it in a
2506 : * temporary page.
2507 : */
2508 4463126 : buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2509 : RBM_NORMAL_NO_LOG,
2510 : InvalidBuffer);
2511 4463126 : if (!BufferIsValid(buf))
2512 0 : continue;
2513 :
2514 4463126 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2515 4463126 : page = BufferGetPage(buf);
2516 :
2517 : /*
2518 : * Take a copy of the local page where WAL has been applied to have a
2519 : * comparison base before masking it...
2520 : */
2521 4463126 : memcpy(replay_image_masked, page, BLCKSZ);
2522 :
2523 : /* No need for this page anymore now that a copy is in. */
2524 4463126 : UnlockReleaseBuffer(buf);
2525 :
2526 : /*
2527 : * If the block LSN is already ahead of this WAL record, we can't
2528 : * expect contents to match. This can happen if recovery is
2529 : * restarted.
2530 : */
2531 4463126 : if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2532 0 : continue;
2533 :
2534 : /*
2535 : * Read the contents from the backup copy, stored in WAL record and
2536 : * store it in a temporary page. There is no need to allocate a new
2537 : * page here, a local buffer is fine to hold its contents and a mask
2538 : * can be directly applied on it.
2539 : */
2540 4463126 : if (!RestoreBlockImage(record, block_id, primary_image_masked))
2541 0 : ereport(ERROR,
2542 : (errcode(ERRCODE_INTERNAL_ERROR),
2543 : errmsg_internal("%s", record->errormsg_buf)));
2544 :
2545 : /*
2546 : * If masking function is defined, mask both the primary and replay
2547 : * images
2548 : */
2549 4463126 : if (rmgr.rm_mask != NULL)
2550 : {
2551 4463126 : rmgr.rm_mask(replay_image_masked, blkno);
2552 4463126 : rmgr.rm_mask(primary_image_masked, blkno);
2553 : }
2554 :
2555 : /* Time to compare the primary and replay images. */
2556 4463126 : if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2557 : {
2558 0 : elog(FATAL,
2559 : "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2560 : rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2561 : forknum, blkno);
2562 : }
2563 : }
2564 : }
2565 :
2566 : /*
2567 : * For point-in-time recovery, this function decides whether we want to
2568 : * stop applying the XLOG before the current record.
2569 : *
2570 : * Returns true if we are stopping, false otherwise. If stopping, some
2571 : * information is saved in recoveryStopXid et al for use in annotating the
2572 : * new timeline's history file.
2573 : */
2574 : static bool
2575 5319504 : recoveryStopsBefore(XLogReaderState *record)
2576 : {
2577 5319504 : bool stopsHere = false;
2578 : uint8 xact_info;
2579 : bool isCommit;
2580 5319504 : TimestampTz recordXtime = 0;
2581 : TransactionId recordXid;
2582 :
2583 : /*
2584 : * Ignore recovery target settings when not in archive recovery (meaning
2585 : * we are in crash recovery).
2586 : */
2587 5319504 : if (!ArchiveRecoveryRequested)
2588 487884 : return false;
2589 :
2590 : /* Check if we should stop as soon as reaching consistency */
2591 4831620 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2592 : {
2593 0 : ereport(LOG,
2594 : (errmsg("recovery stopping after reaching consistency")));
2595 :
2596 0 : recoveryStopAfter = false;
2597 0 : recoveryStopXid = InvalidTransactionId;
2598 0 : recoveryStopLSN = InvalidXLogRecPtr;
2599 0 : recoveryStopTime = 0;
2600 0 : recoveryStopName[0] = '\0';
2601 0 : return true;
2602 : }
2603 :
2604 : /* Check if target LSN has been reached */
2605 4831620 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2606 14260 : !recoveryTargetInclusive &&
2607 698 : record->ReadRecPtr >= recoveryTargetLSN)
2608 : {
2609 2 : recoveryStopAfter = false;
2610 2 : recoveryStopXid = InvalidTransactionId;
2611 2 : recoveryStopLSN = record->ReadRecPtr;
2612 2 : recoveryStopTime = 0;
2613 2 : recoveryStopName[0] = '\0';
2614 2 : ereport(LOG,
2615 : (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2616 : LSN_FORMAT_ARGS(recoveryStopLSN))));
2617 2 : return true;
2618 : }
2619 :
2620 : /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2621 4831618 : if (XLogRecGetRmid(record) != RM_XACT_ID)
2622 4790628 : return false;
2623 :
2624 40990 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2625 :
2626 40990 : if (xact_info == XLOG_XACT_COMMIT)
2627 : {
2628 37192 : isCommit = true;
2629 37192 : recordXid = XLogRecGetXid(record);
2630 : }
2631 3798 : else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2632 : {
2633 48 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2634 : xl_xact_parsed_commit parsed;
2635 :
2636 48 : isCommit = true;
2637 48 : ParseCommitRecord(XLogRecGetInfo(record),
2638 : xlrec,
2639 : &parsed);
2640 48 : recordXid = parsed.twophase_xid;
2641 : }
2642 3750 : else if (xact_info == XLOG_XACT_ABORT)
2643 : {
2644 3298 : isCommit = false;
2645 3298 : recordXid = XLogRecGetXid(record);
2646 : }
2647 452 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2648 : {
2649 22 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2650 : xl_xact_parsed_abort parsed;
2651 :
2652 22 : isCommit = false;
2653 22 : ParseAbortRecord(XLogRecGetInfo(record),
2654 : xlrec,
2655 : &parsed);
2656 22 : recordXid = parsed.twophase_xid;
2657 : }
2658 : else
2659 430 : return false;
2660 :
2661 40560 : if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
2662 : {
2663 : /*
2664 : * There can be only one transaction end record with this exact
2665 : * transactionid
2666 : *
2667 : * when testing for an xid, we MUST test for equality only, since
2668 : * transactions are numbered in the order they start, not the order
2669 : * they complete. A higher numbered xid will complete before you about
2670 : * 50% of the time...
2671 : */
2672 0 : stopsHere = (recordXid == recoveryTargetXid);
2673 : }
2674 :
2675 : /*
2676 : * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2677 : * We don't expect getRecordTimestamp ever to fail, since we already know
2678 : * this is a commit or abort record; but test its result anyway.
2679 : */
2680 40560 : if (getRecordTimestamp(record, &recordXtime) &&
2681 40560 : recoveryTarget == RECOVERY_TARGET_TIME)
2682 : {
2683 : /*
2684 : * There can be many transactions that share the same commit time, so
2685 : * we stop after the last one, if we are inclusive, or stop at the
2686 : * first one if we are exclusive
2687 : */
2688 0 : if (recoveryTargetInclusive)
2689 0 : stopsHere = (recordXtime > recoveryTargetTime);
2690 : else
2691 0 : stopsHere = (recordXtime >= recoveryTargetTime);
2692 : }
2693 :
2694 40560 : if (stopsHere)
2695 : {
2696 0 : recoveryStopAfter = false;
2697 0 : recoveryStopXid = recordXid;
2698 0 : recoveryStopTime = recordXtime;
2699 0 : recoveryStopLSN = InvalidXLogRecPtr;
2700 0 : recoveryStopName[0] = '\0';
2701 :
2702 0 : if (isCommit)
2703 : {
2704 0 : ereport(LOG,
2705 : (errmsg("recovery stopping before commit of transaction %u, time %s",
2706 : recoveryStopXid,
2707 : timestamptz_to_str(recoveryStopTime))));
2708 : }
2709 : else
2710 : {
2711 0 : ereport(LOG,
2712 : (errmsg("recovery stopping before abort of transaction %u, time %s",
2713 : recoveryStopXid,
2714 : timestamptz_to_str(recoveryStopTime))));
2715 : }
2716 : }
2717 :
2718 40560 : return stopsHere;
2719 : }
2720 :
2721 : /*
2722 : * Same as recoveryStopsBefore, but called after applying the record.
2723 : *
2724 : * We also track the timestamp of the latest applied COMMIT/ABORT
2725 : * record in XLogRecoveryCtl->recoveryLastXTime.
2726 : */
2727 : static bool
2728 5319498 : recoveryStopsAfter(XLogReaderState *record)
2729 : {
2730 : uint8 info;
2731 : uint8 xact_info;
2732 : uint8 rmid;
2733 5319498 : TimestampTz recordXtime = 0;
2734 :
2735 : /*
2736 : * Ignore recovery target settings when not in archive recovery (meaning
2737 : * we are in crash recovery).
2738 : */
2739 5319498 : if (!ArchiveRecoveryRequested)
2740 487884 : return false;
2741 :
2742 4831614 : info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2743 4831614 : rmid = XLogRecGetRmid(record);
2744 :
2745 : /*
2746 : * There can be many restore points that share the same name; we stop at
2747 : * the first one.
2748 : */
2749 4831614 : if (recoveryTarget == RECOVERY_TARGET_NAME &&
2750 44 : rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2751 : {
2752 : xl_restore_point *recordRestorePointData;
2753 :
2754 6 : recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2755 :
2756 6 : if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2757 : {
2758 4 : recoveryStopAfter = true;
2759 4 : recoveryStopXid = InvalidTransactionId;
2760 4 : recoveryStopLSN = InvalidXLogRecPtr;
2761 4 : (void) getRecordTimestamp(record, &recoveryStopTime);
2762 4 : strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2763 :
2764 4 : ereport(LOG,
2765 : (errmsg("recovery stopping at restore point \"%s\", time %s",
2766 : recoveryStopName,
2767 : timestamptz_to_str(recoveryStopTime))));
2768 4 : return true;
2769 : }
2770 : }
2771 :
2772 : /* Check if the target LSN has been reached */
2773 4831610 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2774 13562 : recoveryTargetInclusive &&
2775 13562 : record->ReadRecPtr >= recoveryTargetLSN)
2776 : {
2777 8 : recoveryStopAfter = true;
2778 8 : recoveryStopXid = InvalidTransactionId;
2779 8 : recoveryStopLSN = record->ReadRecPtr;
2780 8 : recoveryStopTime = 0;
2781 8 : recoveryStopName[0] = '\0';
2782 8 : ereport(LOG,
2783 : (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2784 : LSN_FORMAT_ARGS(recoveryStopLSN))));
2785 8 : return true;
2786 : }
2787 :
2788 4831602 : if (rmid != RM_XACT_ID)
2789 4790616 : return false;
2790 :
2791 40986 : xact_info = info & XLOG_XACT_OPMASK;
2792 :
2793 40986 : if (xact_info == XLOG_XACT_COMMIT ||
2794 3750 : xact_info == XLOG_XACT_COMMIT_PREPARED ||
2795 452 : xact_info == XLOG_XACT_ABORT ||
2796 : xact_info == XLOG_XACT_ABORT_PREPARED)
2797 : {
2798 : TransactionId recordXid;
2799 :
2800 : /* Update the last applied transaction timestamp */
2801 40556 : if (getRecordTimestamp(record, &recordXtime))
2802 40556 : SetLatestXTime(recordXtime);
2803 :
2804 : /* Extract the XID of the committed/aborted transaction */
2805 40556 : if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2806 : {
2807 48 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2808 : xl_xact_parsed_commit parsed;
2809 :
2810 48 : ParseCommitRecord(XLogRecGetInfo(record),
2811 : xlrec,
2812 : &parsed);
2813 48 : recordXid = parsed.twophase_xid;
2814 : }
2815 40508 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2816 : {
2817 22 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2818 : xl_xact_parsed_abort parsed;
2819 :
2820 22 : ParseAbortRecord(XLogRecGetInfo(record),
2821 : xlrec,
2822 : &parsed);
2823 22 : recordXid = parsed.twophase_xid;
2824 : }
2825 : else
2826 40486 : recordXid = XLogRecGetXid(record);
2827 :
2828 : /*
2829 : * There can be only one transaction end record with this exact
2830 : * transactionid
2831 : *
2832 : * when testing for an xid, we MUST test for equality only, since
2833 : * transactions are numbered in the order they start, not the order
2834 : * they complete. A higher numbered xid will complete before you about
2835 : * 50% of the time...
2836 : */
2837 40556 : if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
2838 0 : recordXid == recoveryTargetXid)
2839 : {
2840 0 : recoveryStopAfter = true;
2841 0 : recoveryStopXid = recordXid;
2842 0 : recoveryStopTime = recordXtime;
2843 0 : recoveryStopLSN = InvalidXLogRecPtr;
2844 0 : recoveryStopName[0] = '\0';
2845 :
2846 0 : if (xact_info == XLOG_XACT_COMMIT ||
2847 : xact_info == XLOG_XACT_COMMIT_PREPARED)
2848 : {
2849 0 : ereport(LOG,
2850 : (errmsg("recovery stopping after commit of transaction %u, time %s",
2851 : recoveryStopXid,
2852 : timestamptz_to_str(recoveryStopTime))));
2853 : }
2854 0 : else if (xact_info == XLOG_XACT_ABORT ||
2855 : xact_info == XLOG_XACT_ABORT_PREPARED)
2856 : {
2857 0 : ereport(LOG,
2858 : (errmsg("recovery stopping after abort of transaction %u, time %s",
2859 : recoveryStopXid,
2860 : timestamptz_to_str(recoveryStopTime))));
2861 : }
2862 0 : return true;
2863 : }
2864 : }
2865 :
2866 : /* Check if we should stop as soon as reaching consistency */
2867 40986 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2868 : {
2869 0 : ereport(LOG,
2870 : (errmsg("recovery stopping after reaching consistency")));
2871 :
2872 0 : recoveryStopAfter = true;
2873 0 : recoveryStopXid = InvalidTransactionId;
2874 0 : recoveryStopTime = 0;
2875 0 : recoveryStopLSN = InvalidXLogRecPtr;
2876 0 : recoveryStopName[0] = '\0';
2877 0 : return true;
2878 : }
2879 :
2880 40986 : return false;
2881 : }
2882 :
2883 : /*
2884 : * Create a comment for the history file to explain why and where
2885 : * timeline changed.
2886 : */
2887 : static char *
2888 1544 : getRecoveryStopReason(void)
2889 : {
2890 : char reason[200];
2891 :
2892 1544 : if (recoveryTarget == RECOVERY_TARGET_XID)
2893 0 : snprintf(reason, sizeof(reason),
2894 : "%s transaction %u",
2895 0 : recoveryStopAfter ? "after" : "before",
2896 : recoveryStopXid);
2897 1544 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
2898 0 : snprintf(reason, sizeof(reason),
2899 : "%s %s\n",
2900 0 : recoveryStopAfter ? "after" : "before",
2901 : timestamptz_to_str(recoveryStopTime));
2902 1544 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
2903 14 : snprintf(reason, sizeof(reason),
2904 : "%s LSN %X/%X\n",
2905 14 : recoveryStopAfter ? "after" : "before",
2906 14 : LSN_FORMAT_ARGS(recoveryStopLSN));
2907 1530 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
2908 6 : snprintf(reason, sizeof(reason),
2909 : "at restore point \"%s\"",
2910 : recoveryStopName);
2911 1524 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
2912 0 : snprintf(reason, sizeof(reason), "reached consistency");
2913 : else
2914 1524 : snprintf(reason, sizeof(reason), "no recovery target specified");
2915 :
2916 1544 : return pstrdup(reason);
2917 : }
2918 :
2919 : /*
2920 : * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2921 : *
2922 : * endOfRecovery is true if the recovery target is reached and
2923 : * the paused state starts at the end of recovery because of
2924 : * recovery_target_action=pause, and false otherwise.
2925 : */
2926 : static void
2927 6 : recoveryPausesHere(bool endOfRecovery)
2928 : {
2929 : /* Don't pause unless users can connect! */
2930 6 : if (!LocalHotStandbyActive)
2931 0 : return;
2932 :
2933 : /* Don't pause after standby promotion has been triggered */
2934 6 : if (LocalPromoteIsTriggered)
2935 0 : return;
2936 :
2937 6 : if (endOfRecovery)
2938 2 : ereport(LOG,
2939 : (errmsg("pausing at the end of recovery"),
2940 : errhint("Execute pg_wal_replay_resume() to promote.")));
2941 : else
2942 4 : ereport(LOG,
2943 : (errmsg("recovery has paused"),
2944 : errhint("Execute pg_wal_replay_resume() to continue.")));
2945 :
2946 : /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2947 18 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
2948 : {
2949 16 : HandleStartupProcInterrupts();
2950 16 : if (CheckForStandbyTrigger())
2951 4 : return;
2952 :
2953 : /*
2954 : * If recovery pause is requested then set it paused. While we are in
2955 : * the loop, user might resume and pause again so set this every time.
2956 : */
2957 12 : ConfirmRecoveryPaused();
2958 :
2959 : /*
2960 : * We wait on a condition variable that will wake us as soon as the
2961 : * pause ends, but we use a timeout so we can check the above exit
2962 : * condition periodically too.
2963 : */
2964 12 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
2965 : WAIT_EVENT_RECOVERY_PAUSE);
2966 : }
2967 2 : ConditionVariableCancelSleep();
2968 : }
2969 :
2970 : /*
2971 : * When recovery_min_apply_delay is set, we wait long enough to make sure
2972 : * certain record types are applied at least that interval behind the primary.
2973 : *
2974 : * Returns true if we waited.
2975 : *
2976 : * Note that the delay is calculated between the WAL record log time and
2977 : * the current time on standby. We would prefer to keep track of when this
2978 : * standby received each WAL record, which would allow a more consistent
2979 : * approach and one not affected by time synchronisation issues, but that
2980 : * is significantly more effort and complexity for little actual gain in
2981 : * usability.
2982 : */
2983 : static bool
2984 5319502 : recoveryApplyDelay(XLogReaderState *record)
2985 : {
2986 : uint8 xact_info;
2987 : TimestampTz xtime;
2988 : TimestampTz delayUntil;
2989 : long msecs;
2990 :
2991 : /* nothing to do if no delay configured */
2992 5319502 : if (recovery_min_apply_delay <= 0)
2993 5319502 : return false;
2994 :
2995 : /* no delay is applied on a database not yet consistent */
2996 0 : if (!reachedConsistency)
2997 0 : return false;
2998 :
2999 : /* nothing to do if crash recovery is requested */
3000 0 : if (!ArchiveRecoveryRequested)
3001 0 : return false;
3002 :
3003 : /*
3004 : * Is it a COMMIT record?
3005 : *
3006 : * We deliberately choose not to delay aborts since they have no effect on
3007 : * MVCC. We already allow replay of records that don't have a timestamp,
3008 : * so there is already opportunity for issues caused by early conflicts on
3009 : * standbys.
3010 : */
3011 0 : if (XLogRecGetRmid(record) != RM_XACT_ID)
3012 0 : return false;
3013 :
3014 0 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3015 :
3016 0 : if (xact_info != XLOG_XACT_COMMIT &&
3017 : xact_info != XLOG_XACT_COMMIT_PREPARED)
3018 0 : return false;
3019 :
3020 0 : if (!getRecordTimestamp(record, &xtime))
3021 0 : return false;
3022 :
3023 0 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3024 :
3025 : /*
3026 : * Exit without arming the latch if it's already past time to apply this
3027 : * record
3028 : */
3029 0 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
3030 0 : if (msecs <= 0)
3031 0 : return false;
3032 :
3033 : while (true)
3034 : {
3035 0 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3036 :
3037 : /* This might change recovery_min_apply_delay. */
3038 0 : HandleStartupProcInterrupts();
3039 :
3040 0 : if (CheckForStandbyTrigger())
3041 0 : break;
3042 :
3043 : /*
3044 : * Recalculate delayUntil as recovery_min_apply_delay could have
3045 : * changed while waiting in this loop.
3046 : */
3047 0 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3048 :
3049 : /*
3050 : * Wait for difference between GetCurrentTimestamp() and delayUntil.
3051 : */
3052 0 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
3053 : delayUntil);
3054 :
3055 0 : if (msecs <= 0)
3056 0 : break;
3057 :
3058 0 : elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3059 :
3060 0 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3061 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
3062 : msecs,
3063 : WAIT_EVENT_RECOVERY_APPLY_DELAY);
3064 : }
3065 0 : return true;
3066 : }
3067 :
3068 : /*
3069 : * Get the current state of the recovery pause request.
3070 : */
3071 : RecoveryPauseState
3072 28 : GetRecoveryPauseState(void)
3073 : {
3074 : RecoveryPauseState state;
3075 :
3076 28 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3077 28 : state = XLogRecoveryCtl->recoveryPauseState;
3078 28 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3079 :
3080 28 : return state;
3081 : }
3082 :
3083 : /*
3084 : * Set the recovery pause state.
3085 : *
3086 : * If recovery pause is requested then sets the recovery pause state to
3087 : * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3088 : * to 'not paused' to resume the recovery. The recovery pause will be
3089 : * confirmed by the ConfirmRecoveryPaused.
3090 : */
3091 : void
3092 90 : SetRecoveryPause(bool recoveryPause)
3093 : {
3094 90 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3095 :
3096 90 : if (!recoveryPause)
3097 84 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
3098 6 : else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
3099 6 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
3100 :
3101 90 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3102 :
3103 90 : if (!recoveryPause)
3104 84 : ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
3105 90 : }
3106 :
3107 : /*
3108 : * Confirm the recovery pause by setting the recovery pause state to
3109 : * RECOVERY_PAUSED.
3110 : */
3111 : static void
3112 12 : ConfirmRecoveryPaused(void)
3113 : {
3114 : /* If recovery pause is requested then set it paused */
3115 12 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3116 12 : if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
3117 6 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
3118 12 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3119 12 : }
3120 :
3121 :
3122 : /*
3123 : * Attempt to read the next XLOG record.
3124 : *
3125 : * Before first call, the reader needs to be positioned to the first record
3126 : * by calling XLogPrefetcherBeginRead().
3127 : *
3128 : * If no valid record is available, returns NULL, or fails if emode is PANIC.
3129 : * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3130 : * record is available.
3131 : */
3132 : static XLogRecord *
3133 5323226 : ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
3134 : bool fetching_ckpt, TimeLineID replayTLI)
3135 : {
3136 : XLogRecord *record;
3137 5323226 : XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher);
3138 5323226 : XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3139 :
3140 : /* Pass through parameters to XLogPageRead */
3141 5323226 : private->fetching_ckpt = fetching_ckpt;
3142 5323226 : private->emode = emode;
3143 5323226 : private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3144 5323226 : private->replayTLI = replayTLI;
3145 :
3146 : /* This is the first attempt to read this page. */
3147 5323226 : lastSourceFailed = false;
3148 :
3149 : for (;;)
3150 180 : {
3151 : char *errormsg;
3152 :
3153 5323406 : record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3154 5323306 : if (record == NULL)
3155 : {
3156 : /*
3157 : * When we find that WAL ends in an incomplete record, keep track
3158 : * of that record. After recovery is done, we'll write a record
3159 : * to indicate to downstream WAL readers that that portion is to
3160 : * be ignored.
3161 : *
3162 : * However, when ArchiveRecoveryRequested = true, we're going to
3163 : * switch to a new timeline at the end of recovery. We will only
3164 : * copy WAL over to the new timeline up to the end of the last
3165 : * complete record, so if we did this, we would later create an
3166 : * overwrite contrecord in the wrong place, breaking everything.
3167 : */
3168 472 : if (!ArchiveRecoveryRequested &&
3169 208 : !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
3170 : {
3171 22 : abortedRecPtr = xlogreader->abortedRecPtr;
3172 22 : missingContrecPtr = xlogreader->missingContrecPtr;
3173 : }
3174 :
3175 472 : if (readFile >= 0)
3176 : {
3177 426 : close(readFile);
3178 426 : readFile = -1;
3179 : }
3180 :
3181 : /*
3182 : * We only end up here without a message when XLogPageRead()
3183 : * failed - in that case we already logged something. In
3184 : * StandbyMode that only happens if we have been triggered, so we
3185 : * shouldn't loop anymore in that case.
3186 : */
3187 472 : if (errormsg)
3188 426 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3189 : (errmsg_internal("%s", errormsg) /* already translated */ ));
3190 : }
3191 :
3192 : /*
3193 : * Check page TLI is one of the expected values.
3194 : */
3195 5322834 : else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3196 : {
3197 : char fname[MAXFNAMELEN];
3198 : XLogSegNo segno;
3199 : int32 offset;
3200 :
3201 0 : XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
3202 0 : offset = XLogSegmentOffset(xlogreader->latestPagePtr,
3203 : wal_segment_size);
3204 0 : XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3205 : wal_segment_size);
3206 0 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3207 : (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
3208 : xlogreader->latestPageTLI,
3209 : fname,
3210 : LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
3211 : offset)));
3212 0 : record = NULL;
3213 : }
3214 :
3215 5323306 : if (record)
3216 : {
3217 : /* Great, got a record */
3218 5323126 : return record;
3219 : }
3220 : else
3221 : {
3222 : /* No valid record available from this source */
3223 472 : lastSourceFailed = true;
3224 :
3225 : /*
3226 : * If archive recovery was requested, but we were still doing
3227 : * crash recovery, switch to archive recovery and retry using the
3228 : * offline archive. We have now replayed all the valid WAL in
3229 : * pg_wal, so we are presumably now consistent.
3230 : *
3231 : * We require that there's at least some valid WAL present in
3232 : * pg_wal, however (!fetching_ckpt). We could recover using the
3233 : * WAL from the archive, even if pg_wal is completely empty, but
3234 : * we'd have no idea how far we'd have to replay to reach
3235 : * consistency. So err on the safe side and give up.
3236 : */
3237 472 : if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3238 4 : !fetching_ckpt)
3239 : {
3240 4 : ereport(DEBUG1,
3241 : (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3242 4 : InArchiveRecovery = true;
3243 4 : if (StandbyModeRequested)
3244 4 : EnableStandbyMode();
3245 :
3246 4 : SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
3247 4 : minRecoveryPoint = xlogreader->EndRecPtr;
3248 4 : minRecoveryPointTLI = replayTLI;
3249 :
3250 4 : CheckRecoveryConsistency();
3251 :
3252 : /*
3253 : * Before we retry, reset lastSourceFailed and currentSource
3254 : * so that we will check the archive next.
3255 : */
3256 4 : lastSourceFailed = false;
3257 4 : currentSource = XLOG_FROM_ANY;
3258 :
3259 180 : continue;
3260 : }
3261 :
3262 : /* In standby mode, loop back to retry. Otherwise, give up. */
3263 468 : if (StandbyMode && !CheckForStandbyTrigger())
3264 176 : continue;
3265 : else
3266 292 : return NULL;
3267 : }
3268 : }
3269 : }
3270 :
3271 : /*
3272 : * Read the XLOG page containing targetPagePtr into readBuf (if not read
3273 : * already). Returns number of bytes read, if the page is read successfully,
3274 : * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3275 : * but only if they have not been previously reported.
3276 : *
3277 : * See XLogReaderRoutine.page_read for more details.
3278 : *
3279 : * While prefetching, xlogreader->nonblocking may be set. In that case,
3280 : * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3281 : *
3282 : * This is responsible for restoring files from archive as needed, as well
3283 : * as for waiting for the requested WAL record to arrive in standby mode.
3284 : *
3285 : * xlogreader->private_data->emode specifies the log level used for reporting
3286 : * "file not found" or "end of WAL" situations in archive recovery, or in
3287 : * standby mode when promotion is triggered. If set to WARNING or below,
3288 : * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3289 : * levels the ereport() won't return.
3290 : *
3291 : * In standby mode, if after a successful return of XLogPageRead() the
3292 : * caller finds the record it's interested in to be broken, it should
3293 : * ereport the error with the level determined by
3294 : * emode_for_corrupt_record(), and then set lastSourceFailed
3295 : * and call XLogPageRead() again with the same arguments. This lets
3296 : * XLogPageRead() to try fetching the record from another source, or to
3297 : * sleep and retry.
3298 : */
3299 : static int
3300 2721404 : XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
3301 : XLogRecPtr targetRecPtr, char *readBuf)
3302 : {
3303 2721404 : XLogPageReadPrivate *private =
3304 : (XLogPageReadPrivate *) xlogreader->private_data;
3305 2721404 : int emode = private->emode;
3306 : uint32 targetPageOff;
3307 : XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3308 : int r;
3309 :
3310 2721404 : XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3311 2721404 : targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3312 :
3313 : /*
3314 : * See if we need to switch to a new segment because the requested record
3315 : * is not in the currently open one.
3316 : */
3317 2721404 : if (readFile >= 0 &&
3318 2718120 : !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3319 : {
3320 : /*
3321 : * Request a restartpoint if we've replayed too much xlog since the
3322 : * last one.
3323 : */
3324 2490 : if (ArchiveRecoveryRequested && IsUnderPostmaster)
3325 : {
3326 2460 : if (XLogCheckpointNeeded(readSegNo))
3327 : {
3328 2248 : (void) GetRedoRecPtr();
3329 2248 : if (XLogCheckpointNeeded(readSegNo))
3330 2234 : RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
3331 : }
3332 : }
3333 :
3334 2490 : close(readFile);
3335 2490 : readFile = -1;
3336 2490 : readSource = XLOG_FROM_ANY;
3337 : }
3338 :
3339 2721404 : XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3340 :
3341 2721416 : retry:
3342 : /* See if we need to retrieve more data */
3343 2721416 : if (readFile < 0 ||
3344 2715630 : (readSource == XLOG_FROM_STREAM &&
3345 2691992 : flushedUpto < targetPagePtr + reqLen))
3346 : {
3347 16762 : if (readFile >= 0 &&
3348 10976 : xlogreader->nonblocking &&
3349 5366 : readSource == XLOG_FROM_STREAM &&
3350 5366 : flushedUpto < targetPagePtr + reqLen)
3351 5366 : return XLREAD_WOULDBLOCK;
3352 :
3353 11296 : switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3354 11396 : private->randAccess,
3355 11396 : private->fetching_ckpt,
3356 : targetRecPtr,
3357 : private->replayTLI,
3358 : xlogreader->EndRecPtr,
3359 11396 : xlogreader->nonblocking))
3360 : {
3361 1130 : case XLREAD_WOULDBLOCK:
3362 1130 : return XLREAD_WOULDBLOCK;
3363 80 : case XLREAD_FAIL:
3364 80 : if (readFile >= 0)
3365 0 : close(readFile);
3366 80 : readFile = -1;
3367 80 : readLen = 0;
3368 80 : readSource = XLOG_FROM_ANY;
3369 80 : return XLREAD_FAIL;
3370 10086 : case XLREAD_SUCCESS:
3371 10086 : break;
3372 : }
3373 2704654 : }
3374 :
3375 : /*
3376 : * At this point, we have the right segment open and if we're streaming we
3377 : * know the requested record is in it.
3378 : */
3379 : Assert(readFile != -1);
3380 :
3381 : /*
3382 : * If the current segment is being streamed from the primary, calculate
3383 : * how much of the current page we have received already. We know the
3384 : * requested record has been received, but this is for the benefit of
3385 : * future calls, to allow quick exit at the top of this function.
3386 : */
3387 2714740 : if (readSource == XLOG_FROM_STREAM)
3388 : {
3389 2688926 : if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3390 2684786 : readLen = XLOG_BLCKSZ;
3391 : else
3392 4140 : readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
3393 : targetPageOff;
3394 : }
3395 : else
3396 25814 : readLen = XLOG_BLCKSZ;
3397 :
3398 : /* Read the requested page */
3399 2714740 : readOff = targetPageOff;
3400 :
3401 2714740 : pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3402 2714740 : r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3403 2714740 : if (r != XLOG_BLCKSZ)
3404 : {
3405 : char fname[MAXFNAMELEN];
3406 0 : int save_errno = errno;
3407 :
3408 0 : pgstat_report_wait_end();
3409 0 : XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
3410 0 : if (r < 0)
3411 : {
3412 0 : errno = save_errno;
3413 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3414 : (errcode_for_file_access(),
3415 : errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
3416 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3417 : readOff)));
3418 : }
3419 : else
3420 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3421 : (errcode(ERRCODE_DATA_CORRUPTED),
3422 : errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
3423 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3424 : readOff, r, (Size) XLOG_BLCKSZ)));
3425 0 : goto next_record_is_invalid;
3426 : }
3427 2714740 : pgstat_report_wait_end();
3428 :
3429 : Assert(targetSegNo == readSegNo);
3430 : Assert(targetPageOff == readOff);
3431 : Assert(reqLen <= readLen);
3432 :
3433 2714740 : xlogreader->seg.ws_tli = curFileTLI;
3434 :
3435 : /*
3436 : * Check the page header immediately, so that we can retry immediately if
3437 : * it's not valid. This may seem unnecessary, because ReadPageInternal()
3438 : * validates the page header anyway, and would propagate the failure up to
3439 : * ReadRecord(), which would retry. However, there's a corner case with
3440 : * continuation records, if a record is split across two pages such that
3441 : * we would need to read the two pages from different sources. For
3442 : * example, imagine a scenario where a streaming replica is started up,
3443 : * and replay reaches a record that's split across two WAL segments. The
3444 : * first page is only available locally, in pg_wal, because it's already
3445 : * been recycled on the primary. The second page, however, is not present
3446 : * in pg_wal, and we should stream it from the primary. There is a
3447 : * recycled WAL segment present in pg_wal, with garbage contents, however.
3448 : * We would read the first page from the local WAL segment, but when
3449 : * reading the second page, we would read the bogus, recycled, WAL
3450 : * segment. If we didn't catch that case here, we would never recover,
3451 : * because ReadRecord() would retry reading the whole record from the
3452 : * beginning.
3453 : *
3454 : * Of course, this only catches errors in the page header, which is what
3455 : * happens in the case of a recycled WAL segment. Other kinds of errors or
3456 : * corruption still has the same problem. But this at least fixes the
3457 : * common case, which can happen as part of normal operation.
3458 : *
3459 : * Validating the page header is cheap enough that doing it twice
3460 : * shouldn't be a big deal from a performance point of view.
3461 : *
3462 : * When not in standby mode, an invalid page header should cause recovery
3463 : * to end, not retry reading the page, so we don't need to validate the
3464 : * page header here for the retry. Instead, ReadPageInternal() is
3465 : * responsible for the validation.
3466 : */
3467 2714740 : if (StandbyMode &&
3468 2695376 : !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3469 : {
3470 : /*
3471 : * Emit this error right now then retry this page immediately. Use
3472 : * errmsg_internal() because the message was already translated.
3473 : */
3474 14 : if (xlogreader->errormsg_buf[0])
3475 14 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3476 : (errmsg_internal("%s", xlogreader->errormsg_buf)));
3477 :
3478 : /* reset any error XLogReaderValidatePageHeader() might have set */
3479 14 : XLogReaderResetError(xlogreader);
3480 14 : goto next_record_is_invalid;
3481 : }
3482 :
3483 2714726 : return readLen;
3484 :
3485 14 : next_record_is_invalid:
3486 :
3487 : /*
3488 : * If we're reading ahead, give up fast. Retries and error reporting will
3489 : * be handled by a later read when recovery catches up to this point.
3490 : */
3491 14 : if (xlogreader->nonblocking)
3492 2 : return XLREAD_WOULDBLOCK;
3493 :
3494 12 : lastSourceFailed = true;
3495 :
3496 12 : if (readFile >= 0)
3497 12 : close(readFile);
3498 12 : readFile = -1;
3499 12 : readLen = 0;
3500 12 : readSource = XLOG_FROM_ANY;
3501 :
3502 : /* In standby-mode, keep trying */
3503 12 : if (StandbyMode)
3504 12 : goto retry;
3505 : else
3506 0 : return XLREAD_FAIL;
3507 : }
3508 :
3509 : /*
3510 : * Open the WAL segment containing WAL location 'RecPtr'.
3511 : *
3512 : * The segment can be fetched via restore_command, or via walreceiver having
3513 : * streamed the record, or it can already be present in pg_wal. Checking
3514 : * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3515 : * too, in case someone copies a new segment directly to pg_wal. That is not
3516 : * documented or recommended, though.
3517 : *
3518 : * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3519 : * prepare to read WAL starting from RedoStartLSN after this. If 'randAccess' is true, curFileTLI is reset to 0 before the archive/pg_wal lookup.
3520 : *
3521 : * 'RecPtr' might not point to the beginning of the record we're interested
3522 : * in, it might also point to the page or segment header. In that case,
3523 : * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3524 : * used to decide which timeline to stream the requested WAL from.
3525 : *
3526 : * 'replayLSN' is the current replay LSN, so that if we scan for new
3527 : * timelines, we can reject a switch to a timeline that branched off before
3528 : * this point. 'replayTLI' is handed to rescanLatestTimeLine() together with it.
3529 : *
3530 : * If the record is not immediately available, the function returns
3531 : * XLREAD_FAIL if we're not in standby mode. In standby mode, waits for it
3532 : * to become available.
3533 : *
3534 : * When the requested record becomes available, the function opens the file
3535 : * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3536 : * of standby mode is triggered by the user, and there is no more WAL
3537 : * available, returns XLREAD_FAIL.
3538 : *
3539 : * If nonblocking is true, then give up immediately if we can't satisfy the
3540 : * request, returning XLREAD_WOULDBLOCK instead of waiting.
3541 : */
3542 : static XLogPageReadResult
3543 11396 : WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3544 : bool fetching_ckpt, XLogRecPtr tliRecPtr,
3545 : TimeLineID replayTLI, XLogRecPtr replayLSN,
3546 : bool nonblocking)
3547 : {
3548 : static TimestampTz last_fail_time = 0; /* kept across calls: when the XLOG_FROM_STREAM failure path last ran to completion */
3549 : TimestampTz now;
3550 11396 : bool streaming_reply_sent = false; /* WalRcvForceReply() is issued at most once per call */
3551 :
3552 : /*-------
3553 : * Standby mode is implemented by a state machine:
3554 : *
3555 : * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3556 : * pg_wal (XLOG_FROM_PG_WAL)
3557 : * 2. Check for promotion trigger request
3558 : * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3559 : * 4. Rescan timelines
3560 : * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3561 : *
3562 : * Failure to read from the current source advances the state machine to
3563 : * the next state.
3564 : *
3565 : * 'currentSource' indicates the current state. There are no currentSource
3566 : * values for "check trigger", "rescan timelines", and "sleep" states,
3567 : * those actions are taken when reading from the previous source fails, as
3568 : * part of advancing to the next state.
3569 : *
3570 : * If standby mode is turned off while reading WAL from stream, we move
3571 : * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3572 : * the files (which would be required at end of recovery, e.g., timeline
3573 : * history file) from archive or pg_wal. We don't need to kill WAL receiver
3574 : * here because it's already stopped when standby mode is turned off at
3575 : * the end of recovery.
3576 : *-------
3577 : */
3578 11396 : if (!InArchiveRecovery)
3579 1666 : currentSource = XLOG_FROM_PG_WAL;
3580 9730 : else if (currentSource == XLOG_FROM_ANY ||
3581 9516 : (!StandbyMode && currentSource == XLOG_FROM_STREAM))
3582 : {
3583 214 : lastSourceFailed = false;
3584 214 : currentSource = XLOG_FROM_ARCHIVE;
3585 : }
3586 :
3587 : for (;;)
3588 9526 : {
3589 20922 : XLogSource oldSource = currentSource; /* only used for the DEBUG2 "switched WAL source" message */
3590 20922 : bool startWalReceiver = false; /* set when this iteration should (re)request streaming */
3591 :
3592 : /*
3593 : * First check if we failed to read from the current source, and
3594 : * advance the state machine if so. The failure to read might've
3595 : * happened outside this function, e.g. when a CRC check fails on a
3596 : * record, or within this loop.
3597 : */
3598 20922 : if (lastSourceFailed)
3599 : {
3600 : /*
3601 : * Don't allow any retry loops to occur during nonblocking
3602 : * readahead. Let the caller process everything that has been
3603 : * decoded already first.
3604 : */
3605 770 : if (nonblocking)
3606 138 : return XLREAD_WOULDBLOCK;
3607 :
3608 632 : switch (currentSource)
3609 : {
3610 398 : case XLOG_FROM_ARCHIVE:
3611 : case XLOG_FROM_PG_WAL:
3612 :
3613 : /*
3614 : * Check to see if promotion is requested. Note that we do
3615 : * this only after failure, so when you promote, we still
3616 : * finish replaying as much as we can from archive and
3617 : * pg_wal before failover.
3618 : */
3619 398 : if (StandbyMode && CheckForStandbyTrigger())
3620 : {
3621 40 : XLogShutdownWalRcv();
3622 40 : return XLREAD_FAIL;
3623 : }
3624 :
3625 : /*
3626 : * Not in standby mode, and we've now tried the archive
3627 : * and pg_wal.
3628 : */
3629 358 : if (!StandbyMode)
3630 40 : return XLREAD_FAIL;
3631 :
3632 : /*
3633 : * Move to XLOG_FROM_STREAM state, and set to start a
3634 : * walreceiver if necessary.
3635 : */
3636 318 : currentSource = XLOG_FROM_STREAM;
3637 318 : startWalReceiver = true;
3638 318 : break;
3639 :
3640 234 : case XLOG_FROM_STREAM:
3641 :
3642 : /*
3643 : * Failure while streaming. Most likely, we got here
3644 : * because streaming replication was terminated, or
3645 : * promotion was triggered. But we also get here if we
3646 : * find an invalid record in the WAL streamed from the
3647 : * primary, in which case something is seriously wrong.
3648 : * There's little chance that the problem will just go
3649 : * away, but PANIC is not good for availability either,
3650 : * especially in hot standby mode. So, we treat that the
3651 : * same as disconnection, and retry from archive/pg_wal
3652 : * again. The WAL in the archive should be identical to
3653 : * what was streamed, so it's unlikely that it helps, but
3654 : * one can hope...
3655 : */
3656 :
3657 : /*
3658 : * We should be able to move to XLOG_FROM_STREAM only in
3659 : * standby mode.
3660 : */
3661 : Assert(StandbyMode);
3662 :
3663 : /*
3664 : * Before we leave XLOG_FROM_STREAM state, make sure that
3665 : * walreceiver is not active, so that it won't overwrite
3666 : * WAL that we restore from archive.
3667 : */
3668 234 : XLogShutdownWalRcv();
3669 :
3670 : /*
3671 : * Before we sleep, re-scan for possible new timelines if
3672 : * we were requested to recover to the latest timeline.
3673 : */
3674 234 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
3675 : {
3676 234 : if (rescanLatestTimeLine(replayTLI, replayLSN))
3677 : {
3678 12 : currentSource = XLOG_FROM_ARCHIVE;
3679 12 : break; /* new target timeline: retry at once, skipping the sleep and leaving last_fail_time untouched */
3680 : }
3681 : }
3682 :
3683 : /*
3684 : * XLOG_FROM_STREAM is the last state in our state
3685 : * machine, so we've exhausted all the options for
3686 : * obtaining the requested WAL. We're going to loop back
3687 : * and retry from the archive, but if it hasn't been long
3688 : * since last attempt, sleep wal_retrieve_retry_interval
3689 : * milliseconds to avoid busy-waiting.
3690 : */
3691 222 : now = GetCurrentTimestamp();
3692 222 : if (!TimestampDifferenceExceeds(last_fail_time, now,
3693 : wal_retrieve_retry_interval))
3694 : {
3695 : long wait_time;
3696 :
3697 204 : wait_time = wal_retrieve_retry_interval -
3698 102 : TimestampDifferenceMilliseconds(last_fail_time, now);
3699 :
3700 102 : elog(LOG, "waiting for WAL to become available at %X/%X",
3701 : LSN_FORMAT_ARGS(RecPtr));
3702 :
3703 : /* Do background tasks that might benefit us later. */
3704 102 : KnownAssignedTransactionIdsIdleMaintenance();
3705 :
3706 102 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3707 : WL_LATCH_SET | WL_TIMEOUT |
3708 : WL_EXIT_ON_PM_DEATH,
3709 : wait_time,
3710 : WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3711 102 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3712 102 : now = GetCurrentTimestamp(); /* refresh so last_fail_time below records the post-sleep time */
3713 :
3714 : /* Handle interrupt signals of startup process */
3715 102 : HandleStartupProcInterrupts();
3716 : }
3717 202 : last_fail_time = now;
3718 202 : currentSource = XLOG_FROM_ARCHIVE;
3719 202 : break;
3720 :
3721 0 : default:
3722 0 : elog(ERROR, "unexpected WAL source %d", currentSource);
3723 : }
3724 : }
3725 20152 : else if (currentSource == XLOG_FROM_PG_WAL)
3726 : {
3727 : /*
3728 : * We just successfully read a file in pg_wal. We prefer files in
3729 : * the archive over ones in pg_wal, so try the next file again
3730 : * from the archive first.
3731 : */
3732 1662 : if (InArchiveRecovery)
3733 0 : currentSource = XLOG_FROM_ARCHIVE;
3734 : }
3735 :
3736 20684 : if (currentSource != oldSource)
3737 532 : elog(DEBUG2, "switched WAL source from %s to %s after %s",
3738 : xlogSourceNames[oldSource], xlogSourceNames[currentSource],
3739 : lastSourceFailed ? "failure" : "success");
3740 :
3741 : /*
3742 : * We've now handled possible failure. Try to read from the chosen
3743 : * source.
3744 : */
3745 20684 : lastSourceFailed = false;
3746 :
3747 20684 : switch (currentSource)
3748 : {
3749 2490 : case XLOG_FROM_ARCHIVE:
3750 : case XLOG_FROM_PG_WAL:
3751 :
3752 : /*
3753 : * WAL receiver must not be running when reading WAL from
3754 : * archive or pg_wal.
3755 : */
3756 : Assert(!WalRcvStreaming());
3757 :
3758 : /* Close any old file we might have open. */
3759 2490 : if (readFile >= 0)
3760 : {
3761 126 : close(readFile);
3762 126 : readFile = -1;
3763 : }
3764 : /* Reset curFileTLI if random fetch. */
3765 2490 : if (randAccess)
3766 1946 : curFileTLI = 0;
3767 :
3768 : /*
3769 : * Try to restore the file from archive, or read an existing
3770 : * file from pg_wal.
3771 : */
3772 2490 : readFile = XLogFileReadAnyTLI(readSegNo,
3773 2490 : currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
3774 : currentSource); /* XLOG_FROM_ANY makes XLogFileReadAnyTLI() try the archive first, then pg_wal */
3775 2490 : if (readFile >= 0)
3776 2176 : return XLREAD_SUCCESS; /* success! */
3777 :
3778 : /*
3779 : * Nope, not found in archive or pg_wal.
3780 : */
3781 314 : lastSourceFailed = true;
3782 314 : break;
3783 :
3784 18194 : case XLOG_FROM_STREAM:
3785 : {
3786 : bool havedata;
3787 :
3788 : /*
3789 : * We should be able to move to XLOG_FROM_STREAM only in
3790 : * standby mode.
3791 : */
3792 : Assert(StandbyMode);
3793 :
3794 : /*
3795 : * First, shutdown walreceiver if its restart has been
3796 : * requested -- but no point if we're already slated for
3797 : * starting it.
3798 : */
3799 18194 : if (pendingWalRcvRestart && !startWalReceiver)
3800 : {
3801 6 : XLogShutdownWalRcv();
3802 :
3803 : /*
3804 : * Re-scan for possible new timelines if we were
3805 : * requested to recover to the latest timeline.
3806 : */
3807 6 : if (recoveryTargetTimeLineGoal ==
3808 : RECOVERY_TARGET_TIMELINE_LATEST)
3809 6 : rescanLatestTimeLine(replayTLI, replayLSN);
3810 :
3811 6 : startWalReceiver = true;
3812 : }
3813 18194 : pendingWalRcvRestart = false;
3814 :
3815 : /*
3816 : * Launch walreceiver if needed.
3817 : *
3818 : * If fetching_ckpt is true, RecPtr points to the initial
3819 : * checkpoint location. In that case, we use RedoStartLSN
3820 : * as the streaming start position instead of RecPtr, so
3821 : * that when we later jump backwards to start redo at
3822 : * RedoStartLSN, we will have the logs streamed already.
3823 : */
3824 18194 : if (startWalReceiver &&
3825 324 : PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3826 : {
3827 : XLogRecPtr ptr;
3828 : TimeLineID tli;
3829 :
3830 292 : if (fetching_ckpt)
3831 : {
3832 0 : ptr = RedoStartLSN;
3833 0 : tli = RedoStartTLI;
3834 : }
3835 : else
3836 : {
3837 292 : ptr = RecPtr;
3838 :
3839 : /*
3840 : * Use the record begin position to determine the
3841 : * TLI, rather than the position we're reading.
3842 : */
3843 292 : tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3844 :
3845 292 : if (curFileTLI > 0 && tli < curFileTLI)
3846 0 : elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3847 : LSN_FORMAT_ARGS(tliRecPtr),
3848 : tli, curFileTLI);
3849 : }
3850 292 : curFileTLI = tli;
3851 292 : SetInstallXLogFileSegmentActive();
3852 292 : RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
3853 : PrimarySlotName,
3854 : wal_receiver_create_temp_slot);
3855 292 : flushedUpto = 0; /* so the "RecPtr < flushedUpto" shortcut below cannot succeed on a stale value */
3856 : }
3857 :
3858 : /*
3859 : * Check if WAL receiver is active or wait to start up.
3860 : */
3861 18194 : if (!WalRcvStreaming())
3862 : {
3863 174 : lastSourceFailed = true;
3864 174 : break;
3865 : }
3866 :
3867 : /*
3868 : * Walreceiver is active, so see if new data has arrived.
3869 : *
3870 : * We only advance XLogReceiptTime when we obtain fresh
3871 : * WAL from walreceiver and observe that we had already
3872 : * processed everything before the most recent "chunk"
3873 : * that it flushed to disk. In steady state where we are
3874 : * keeping up with the incoming data, XLogReceiptTime will
3875 : * be updated on each cycle. When we are behind,
3876 : * XLogReceiptTime will not advance, so the grace time
3877 : * allotted to conflicting queries will decrease.
3878 : */
3879 18020 : if (RecPtr < flushedUpto)
3880 3536 : havedata = true;
3881 : else
3882 : {
3883 : XLogRecPtr latestChunkStart;
3884 :
3885 14484 : flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3886 14484 : if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3887 : {
3888 6856 : havedata = true;
3889 6856 : if (latestChunkStart <= RecPtr)
3890 : {
3891 5704 : XLogReceiptTime = GetCurrentTimestamp();
3892 5704 : SetCurrentChunkStartTime(XLogReceiptTime);
3893 : }
3894 : }
3895 : else
3896 7628 : havedata = false;
3897 : }
3898 18020 : if (havedata)
3899 : {
3900 : /*
3901 : * Great, streamed far enough. Open the file if it's
3902 : * not open already. Also read the timeline history
3903 : * file if we haven't initialized timeline history
3904 : * yet; it should be streamed over and present in
3905 : * pg_wal by now. Use XLOG_FROM_STREAM so that source
3906 : * info is set correctly and XLogReceiptTime isn't
3907 : * changed.
3908 : *
3909 : * NB: We must set readTimeLineHistory based on
3910 : * recoveryTargetTLI, not receiveTLI. Normally they'll
3911 : * be the same, but if recovery_target_timeline is
3912 : * 'latest' and archiving is configured, then it's
3913 : * possible that we managed to retrieve one or more
3914 : * new timeline history files from the archive,
3915 : * updating recoveryTargetTLI.
3916 : */
3917 10392 : if (readFile < 0)
3918 : {
3919 2482 : if (!expectedTLEs)
3920 0 : expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
3921 2482 : readFile = XLogFileRead(readSegNo, receiveTLI,
3922 : XLOG_FROM_STREAM, false);
3923 : Assert(readFile >= 0); /* with notfoundOk = false, XLogFileRead() PANICs rather than return -1 for a failed open */
3924 : }
3925 : else
3926 : {
3927 : /* just make sure source info is correct... */
3928 7910 : readSource = XLOG_FROM_STREAM;
3929 7910 : XLogReceiptSource = XLOG_FROM_STREAM;
3930 7910 : return XLREAD_SUCCESS;
3931 : }
3932 2482 : break; /* file was just opened: go around the loop once more; the readFile >= 0 branch above is what returns success */
3933 : }
3934 :
3935 : /* In nonblocking mode, return rather than sleeping. */
3936 7628 : if (nonblocking)
3937 992 : return XLREAD_WOULDBLOCK;
3938 :
3939 : /*
3940 : * Data not here yet. Check for trigger, then wait for
3941 : * walreceiver to wake us up when new WAL arrives.
3942 : */
3943 6636 : if (CheckForStandbyTrigger())
3944 : {
3945 : /*
3946 : * Note that we don't return XLREAD_FAIL immediately
3947 : * here. After being triggered, we still want to
3948 : * replay all the WAL that was already streamed. It's
3949 : * in pg_wal now, so we just treat this as a failure,
3950 : * and the state machine will move on to replay the
3951 : * streamed WAL from pg_wal, and then recheck the
3952 : * trigger and exit replay.
3953 : */
3954 60 : lastSourceFailed = true;
3955 60 : break;
3956 : }
3957 :
3958 : /*
3959 : * Since we have replayed everything we have received so
3960 : * far and are about to start waiting for more WAL, let's
3961 : * tell the upstream server our replay location now so
3962 : * that pg_stat_replication doesn't show stale
3963 : * information.
3964 : */
3965 6576 : if (!streaming_reply_sent)
3966 : {
3967 5172 : WalRcvForceReply();
3968 5172 : streaming_reply_sent = true;
3969 : }
3970 :
3971 : /* Do any background tasks that might benefit us later. */
3972 6576 : KnownAssignedTransactionIdsIdleMaintenance();
3973 :
3974 : /* Update pg_stat_recovery_prefetch before sleeping. */
3975 6576 : XLogPrefetcherComputeStats(xlogprefetcher);
3976 :
3977 : /*
3978 : * Wait for more WAL to arrive, when we will be woken
3979 : * immediately by the WAL receiver.
3980 : */
3981 6576 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3982 : WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
3983 : -1L,
3984 : WAIT_EVENT_RECOVERY_WAL_STREAM);
3985 6576 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3986 6576 : break;
3987 : }
3988 :
3989 0 : default:
3990 0 : elog(ERROR, "unexpected WAL source %d", currentSource);
3991 : }
3992 :
3993 : /*
3994 : * Check for recovery pause here so that we can confirm more quickly
3995 : * that a requested pause has actually taken effect.
3996 : */
3997 9606 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
3998 : RECOVERY_NOT_PAUSED)
3999 4 : recoveryPausesHere(false);
4000 :
4001 : /*
4002 : * This possibly-long loop needs to handle interrupts of startup
4003 : * process.
4004 : */
4005 9606 : HandleStartupProcInterrupts();
4006 : }
4007 :
4008 : return XLREAD_FAIL; /* not reached */
4009 : }
4010 :
4011 :
4012 : /*
4013 : * Determine what log level should be used to report a corrupt WAL record
4014 : * in the current WAL page, previously read by XLogPageRead().
4015 : *
4016 : * 'emode' is the error mode that would be used to report a file-not-found
4017 : * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4018 : * we're retrying the exact same record that we've tried previously, only
4019 : * complain the first time to keep the noise down. However, we only do when
4020 : * reading from pg_wal, because we don't expect any invalid records in archive
4021 : * or in records streamed from the primary. Files in the archive should be complete,
4022 : * and we should never hit the end of WAL because we stop and wait for more WAL
4023 : * to arrive before replaying it.
4024 : *
4025 : * NOTE: This function remembers the RecPtr value it was last called with,
4026 : * to suppress repeated messages about the same record. Only call this when
4027 : * you are about to ereport(), or you might cause a later message to be
4028 : * erroneously suppressed.
4029 : */
4030 : static int
4031 440 : emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
4032 : {
4033 : static XLogRecPtr lastComplaint = 0;
4034 :
4035 440 : if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4036 : {
4037 440 : if (RecPtr == lastComplaint)
4038 72 : emode = DEBUG1;
4039 : else
4040 368 : lastComplaint = RecPtr;
4041 : }
4042 440 : return emode;
4043 : }
4044 :
4045 :
4046 : /*
4047 : * Subroutine to try to fetch and validate a prior checkpoint record.
4048 : */
4049 : static XLogRecord *
4050 1650 : ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
4051 : TimeLineID replayTLI)
4052 : {
4053 : XLogRecord *record;
4054 : uint8 info;
4055 :
4056 : Assert(xlogreader != NULL);
4057 :
4058 1650 : if (!XRecOffIsValid(RecPtr))
4059 : {
4060 0 : ereport(LOG,
4061 : (errmsg("invalid checkpoint location")));
4062 0 : return NULL;
4063 : }
4064 :
4065 1650 : XLogPrefetcherBeginRead(xlogprefetcher, RecPtr);
4066 1650 : record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4067 :
4068 1650 : if (record == NULL)
4069 : {
4070 0 : ereport(LOG,
4071 : (errmsg("invalid checkpoint record")));
4072 0 : return NULL;
4073 : }
4074 1650 : if (record->xl_rmid != RM_XLOG_ID)
4075 : {
4076 0 : ereport(LOG,
4077 : (errmsg("invalid resource manager ID in checkpoint record")));
4078 0 : return NULL;
4079 : }
4080 1650 : info = record->xl_info & ~XLR_INFO_MASK;
4081 1650 : if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4082 : info != XLOG_CHECKPOINT_ONLINE)
4083 : {
4084 0 : ereport(LOG,
4085 : (errmsg("invalid xl_info in checkpoint record")));
4086 0 : return NULL;
4087 : }
4088 1650 : if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
4089 : {
4090 0 : ereport(LOG,
4091 : (errmsg("invalid length of checkpoint record")));
4092 0 : return NULL;
4093 : }
4094 1650 : return record;
4095 : }
4096 :
4097 : /*
4098 : * Scan for new timelines that might have appeared in the archive since we
4099 : * started recovery.
4100 : *
4101 : * If there are any, the function changes recovery target TLI to the latest
4102 : * one and returns 'true'.
4103 : */
4104 : static bool
4105 240 : rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
4106 : {
4107 : List *newExpectedTLEs;
4108 : bool found;
4109 : ListCell *cell;
4110 : TimeLineID newtarget;
4111 240 : TimeLineID oldtarget = recoveryTargetTLI;
4112 240 : TimeLineHistoryEntry *currentTle = NULL;
4113 :
4114 240 : newtarget = findNewestTimeLine(recoveryTargetTLI);
4115 240 : if (newtarget == recoveryTargetTLI)
4116 : {
4117 : /* No new timelines found */
4118 228 : return false;
4119 : }
4120 :
4121 : /*
4122 : * Determine the list of expected TLIs for the new TLI
4123 : */
4124 :
4125 12 : newExpectedTLEs = readTimeLineHistory(newtarget);
4126 :
4127 : /*
4128 : * If the current timeline is not part of the history of the new timeline,
4129 : * we cannot proceed to it.
4130 : */
4131 12 : found = false;
4132 24 : foreach(cell, newExpectedTLEs)
4133 : {
4134 24 : currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4135 :
4136 24 : if (currentTle->tli == recoveryTargetTLI)
4137 : {
4138 12 : found = true;
4139 12 : break;
4140 : }
4141 : }
4142 12 : if (!found)
4143 : {
4144 0 : ereport(LOG,
4145 : (errmsg("new timeline %u is not a child of database system timeline %u",
4146 : newtarget,
4147 : replayTLI)));
4148 0 : return false;
4149 : }
4150 :
4151 : /*
4152 : * The current timeline was found in the history file, but check that the
4153 : * next timeline was forked off from it *after* the current recovery
4154 : * location.
4155 : */
4156 12 : if (currentTle->end < replayLSN)
4157 : {
4158 0 : ereport(LOG,
4159 : (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4160 : newtarget,
4161 : replayTLI,
4162 : LSN_FORMAT_ARGS(replayLSN))));
4163 0 : return false;
4164 : }
4165 :
4166 : /* The new timeline history seems valid. Switch target */
4167 12 : recoveryTargetTLI = newtarget;
4168 12 : list_free_deep(expectedTLEs);
4169 12 : expectedTLEs = newExpectedTLEs;
4170 :
4171 : /*
4172 : * As in StartupXLOG(), try to ensure we have all the history files
4173 : * between the old target and new target in pg_wal.
4174 : */
4175 12 : restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4176 :
4177 12 : ereport(LOG,
4178 : (errmsg("new target timeline is %u",
4179 : recoveryTargetTLI)));
4180 :
4181 12 : return true;
4182 : }
4183 :
4184 :
4185 : /*
4186 : * Open a logfile segment for reading (during recovery).
4187 : *
4188 : * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4189 : * Otherwise, it's assumed to be already available in pg_wal.
4190 : */
4191 : static int
4192 5766 : XLogFileRead(XLogSegNo segno, TimeLineID tli,
4193 : XLogSource source, bool notfoundOk)
4194 : {
4195 : char xlogfname[MAXFNAMELEN];
4196 : char activitymsg[MAXFNAMELEN + 16];
4197 : char path[MAXPGPATH];
4198 : int fd;
4199 :
4200 5766 : XLogFileName(xlogfname, tli, segno, wal_segment_size);
4201 :
4202 5766 : switch (source)
4203 : {
4204 844 : case XLOG_FROM_ARCHIVE:
4205 : /* Report recovery progress in PS display */
4206 844 : snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4207 : xlogfname);
4208 844 : set_ps_display(activitymsg);
4209 :
4210 844 : if (!RestoreArchivedFile(path, xlogfname,
4211 : "RECOVERYXLOG",
4212 : wal_segment_size,
4213 : InRedo))
4214 778 : return -1;
4215 66 : break;
4216 :
4217 4922 : case XLOG_FROM_PG_WAL:
4218 : case XLOG_FROM_STREAM:
4219 4922 : XLogFilePath(path, tli, segno, wal_segment_size);
4220 4922 : break;
4221 :
4222 0 : default:
4223 0 : elog(ERROR, "invalid XLogFileRead source %d", source);
4224 : }
4225 :
4226 : /*
4227 : * If the segment was fetched from archival storage, replace the existing
4228 : * xlog segment (if any) with the archival version.
4229 : */
4230 4988 : if (source == XLOG_FROM_ARCHIVE)
4231 : {
4232 : Assert(!IsInstallXLogFileSegmentActive());
4233 66 : KeepFileRestoredFromArchive(path, xlogfname);
4234 :
4235 : /*
4236 : * Set path to point at the new file in pg_wal.
4237 : */
4238 66 : snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4239 : }
4240 :
4241 4988 : fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4242 4988 : if (fd >= 0)
4243 : {
4244 : /* Success! */
4245 4658 : curFileTLI = tli;
4246 :
4247 : /* Report recovery progress in PS display */
4248 4658 : snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4249 : xlogfname);
4250 4658 : set_ps_display(activitymsg);
4251 :
4252 : /* Track source of data in assorted state variables */
4253 4658 : readSource = source;
4254 4658 : XLogReceiptSource = source;
4255 : /* In FROM_STREAM case, caller tracks receipt time, not me */
4256 4658 : if (source != XLOG_FROM_STREAM)
4257 2176 : XLogReceiptTime = GetCurrentTimestamp();
4258 :
4259 4658 : return fd;
4260 : }
4261 330 : if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4262 0 : ereport(PANIC,
4263 : (errcode_for_file_access(),
4264 : errmsg("could not open file \"%s\": %m", path)));
4265 330 : return -1;
4266 : }
4267 :
4268 : /*
4269 : * Open a logfile segment for reading (during recovery).
4270 : *
4271 : * This version searches for the segment with any TLI listed in expectedTLEs.
4272 : */
4273 : static int
4274 2490 : XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
4275 : {
4276 : char path[MAXPGPATH];
4277 : ListCell *cell;
4278 : int fd;
4279 : List *tles;
4280 :
4281 : /*
4282 : * Loop looking for a suitable timeline ID: we might need to read any of
4283 : * the timelines listed in expectedTLEs.
4284 : *
4285 : * We expect curFileTLI on entry to be the TLI of the preceding file in
4286 : * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4287 : * to go backwards; this prevents us from picking up the wrong file when a
4288 : * parent timeline extends to higher segment numbers than the child we
4289 : * want to read.
4290 : *
4291 : * If we haven't read the timeline history file yet, read it now, so that
4292 : * we know which TLIs to scan. We don't save the list in expectedTLEs,
4293 : * however, unless we actually find a valid segment. That way if there is
4294 : * neither a timeline history file nor a WAL segment in the archive, and
4295 : * streaming replication is set up, we'll read the timeline history file
4296 : * streamed from the primary when we start streaming, instead of
4297 : * recovering with a dummy history generated here.
4298 : */
4299 2490 : if (expectedTLEs)
4300 840 : tles = expectedTLEs;
4301 : else
4302 1650 : tles = readTimeLineHistory(recoveryTargetTLI);
4303 :
4304 2834 : foreach(cell, tles)
4305 : {
4306 2528 : TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
4307 2528 : TimeLineID tli = hent->tli;
4308 :
4309 2528 : if (tli < curFileTLI)
4310 8 : break; /* don't bother looking at too-old TLIs */
4311 :
4312 : /*
4313 : * Skip scanning the timeline ID that the logfile segment to read
4314 : * doesn't belong to
4315 : */
4316 2520 : if (hent->begin != InvalidXLogRecPtr)
4317 : {
4318 136 : XLogSegNo beginseg = 0;
4319 :
4320 136 : XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4321 :
4322 : /*
4323 : * The logfile segment that doesn't belong to the timeline is
4324 : * older or newer than the segment that the timeline started or
4325 : * ended at, respectively. It's sufficient to check only the
4326 : * starting segment of the timeline here. Since the timelines are
4327 : * scanned in descending order in this loop, any segments newer
4328 : * than the ending segment should belong to newer timeline and
4329 : * have already been read before. So it's not necessary to check
4330 : * the ending segment of the timeline here.
4331 : */
4332 136 : if (segno < beginseg)
4333 14 : continue;
4334 : }
4335 :
4336 2506 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
4337 : {
4338 844 : fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4339 844 : if (fd != -1)
4340 : {
4341 66 : elog(DEBUG1, "got WAL segment from archive");
4342 66 : if (!expectedTLEs)
4343 28 : expectedTLEs = tles;
4344 2176 : return fd;
4345 : }
4346 : }
4347 :
4348 2440 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
4349 : {
4350 2440 : fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4351 2440 : if (fd != -1)
4352 : {
4353 2110 : if (!expectedTLEs)
4354 1622 : expectedTLEs = tles;
4355 2110 : return fd;
4356 : }
4357 : }
4358 : }
4359 :
4360 : /* Couldn't find it. For simplicity, complain about front timeline */
4361 314 : XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
4362 314 : errno = ENOENT;
4363 314 : ereport(DEBUG2,
4364 : (errcode_for_file_access(),
4365 : errmsg("could not open file \"%s\": %m", path)));
4366 314 : return -1;
4367 : }
4368 :
4369 : /*
4370 : * Set flag to signal the walreceiver to restart. (The startup process calls
4371 : * this on noticing a relevant configuration change.)
4372 : */
4373 : void
4374 6 : StartupRequestWalReceiverRestart(void)
4375 : {
4376 6 : if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
4377 : {
4378 6 : ereport(LOG,
4379 : (errmsg("WAL receiver process shutdown requested")));
4380 :
4381 6 : pendingWalRcvRestart = true;
4382 : }
4383 6 : }
4384 :
4385 :
4386 : /*
4387 : * Has a standby promotion already been triggered?
4388 : *
4389 : * Unlike CheckForStandbyTrigger(), this works in any process
4390 : * that's connected to shared memory.
4391 : */
4392 : bool
4393 102 : PromoteIsTriggered(void)
4394 : {
4395 : /*
4396 : * We check shared state each time only until a standby promotion is
4397 : * triggered. We can't trigger a promotion again, so there's no need to
4398 : * keep checking after the shared variable has once been seen true.
4399 : */
4400 102 : if (LocalPromoteIsTriggered)
4401 82 : return true;
4402 :
4403 20 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4404 20 : LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
4405 20 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4406 :
4407 20 : return LocalPromoteIsTriggered;
4408 : }
4409 :
4410 : static void
4411 82 : SetPromoteIsTriggered(void)
4412 : {
4413 82 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4414 82 : XLogRecoveryCtl->SharedPromoteIsTriggered = true;
4415 82 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4416 :
4417 : /*
4418 : * Mark the recovery pause state as 'not paused' because the paused state
4419 : * ends and promotion continues if a promotion is triggered while recovery
4420 : * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4421 : * return 'paused' while a promotion is ongoing.
4422 : */
4423 82 : SetRecoveryPause(false);
4424 :
4425 82 : LocalPromoteIsTriggered = true;
4426 82 : }
4427 :
4428 : /*
4429 : * Check whether a promote request has arrived.
4430 : */
4431 : static bool
4432 7266 : CheckForStandbyTrigger(void)
4433 : {
4434 7266 : if (LocalPromoteIsTriggered)
4435 102 : return true;
4436 :
4437 7164 : if (IsPromoteSignaled() && CheckPromoteSignal())
4438 : {
4439 82 : ereport(LOG, (errmsg("received promote request")));
4440 82 : RemovePromoteSignalFiles();
4441 82 : ResetPromoteSignaled();
4442 82 : SetPromoteIsTriggered();
4443 82 : return true;
4444 : }
4445 :
4446 7082 : return false;
4447 : }
4448 :
4449 : /*
4450 : * Remove the files signaling a standby promotion request.
4451 : */
4452 : void
4453 1612 : RemovePromoteSignalFiles(void)
4454 : {
4455 1612 : unlink(PROMOTE_SIGNAL_FILE);
4456 1612 : }
4457 :
4458 : /*
4459 : * Check to see if a promote request has arrived.
4460 : */
4461 : bool
4462 1432 : CheckPromoteSignal(void)
4463 : {
4464 : struct stat stat_buf;
4465 :
4466 1432 : if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4467 164 : return true;
4468 :
4469 1268 : return false;
4470 : }
4471 :
4472 : /*
4473 : * Wake up startup process to replay newly arrived WAL, or to notice that
4474 : * failover has been requested.
4475 : */
4476 : void
4477 14854 : WakeupRecovery(void)
4478 : {
4479 14854 : SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4480 14854 : }
4481 :
4482 : /*
4483 : * Schedule a walreceiver wakeup in the main recovery loop.
4484 : */
4485 : void
4486 4 : XLogRequestWalReceiverReply(void)
4487 : {
4488 4 : doRequestWalReceiverReply = true;
4489 4 : }
4490 :
4491 : /*
4492 : * Is HotStandby active yet? This is only important in special backends
4493 : * since normal backends won't ever be able to connect until this returns
4494 : * true. Postmaster knows this by way of signal, not via shared memory.
4495 : *
4496 : * Unlike testing standbyState, this works in any process that's connected to
4497 : * shared memory. (And note that standbyState alone doesn't tell the truth
4498 : * anyway.)
4499 : */
4500 : bool
4501 306 : HotStandbyActive(void)
4502 : {
4503 : /*
4504 : * We check shared state each time only until Hot Standby is active. We
4505 : * can't de-activate Hot Standby, so there's no need to keep checking
4506 : * after the shared variable has once been seen true.
4507 : */
4508 306 : if (LocalHotStandbyActive)
4509 44 : return true;
4510 : else
4511 : {
4512 : /* spinlock is essential on machines with weak memory ordering! */
4513 262 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4514 262 : LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
4515 262 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4516 :
4517 262 : return LocalHotStandbyActive;
4518 : }
4519 : }
4520 :
4521 : /*
4522 : * Like HotStandbyActive(), but to be used only in WAL replay code,
4523 : * where we don't need to ask any other process what the state is.
4524 : */
4525 : static bool
4526 0 : HotStandbyActiveInReplay(void)
4527 : {
4528 : Assert(AmStartupProcess() || !IsPostmasterEnvironment);
4529 0 : return LocalHotStandbyActive;
4530 : }
4531 :
4532 : /*
4533 : * Get latest redo apply position.
4534 : *
4535 : * Exported to allow WALReceiver to read the pointer directly.
4536 : */
4537 : XLogRecPtr
4538 48132 : GetXLogReplayRecPtr(TimeLineID *replayTLI)
4539 : {
4540 : XLogRecPtr recptr;
4541 : TimeLineID tli;
4542 :
4543 48132 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4544 48132 : recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
4545 48132 : tli = XLogRecoveryCtl->lastReplayedTLI;
4546 48132 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4547 :
4548 48132 : if (replayTLI)
4549 4320 : *replayTLI = tli;
4550 48132 : return recptr;
4551 : }
4552 :
4553 :
4554 : /*
4555 : * Get position of last applied, or the record being applied.
4556 : *
4557 : * This is different from GetXLogReplayRecPtr() in that if a WAL
4558 : * record is currently being applied, this includes that record.
4559 : */
4560 : XLogRecPtr
4561 10100 : GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
4562 : {
4563 : XLogRecPtr recptr;
4564 : TimeLineID tli;
4565 :
4566 10100 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4567 10100 : recptr = XLogRecoveryCtl->replayEndRecPtr;
4568 10100 : tli = XLogRecoveryCtl->replayEndTLI;
4569 10100 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4570 :
4571 10100 : if (replayEndTLI)
4572 10100 : *replayEndTLI = tli;
4573 10100 : return recptr;
4574 : }
4575 :
4576 : /*
4577 : * Save timestamp of latest processed commit/abort record.
4578 : *
4579 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4580 : * seen by processes other than the startup process. Note in particular
4581 : * that CreateRestartPoint is executed in the checkpointer.
4582 : */
4583 : static void
4584 40556 : SetLatestXTime(TimestampTz xtime)
4585 : {
4586 40556 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4587 40556 : XLogRecoveryCtl->recoveryLastXTime = xtime;
4588 40556 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4589 40556 : }
4590 :
4591 : /*
4592 : * Fetch timestamp of latest processed commit/abort record.
4593 : */
4594 : TimestampTz
4595 652 : GetLatestXTime(void)
4596 : {
4597 : TimestampTz xtime;
4598 :
4599 652 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4600 652 : xtime = XLogRecoveryCtl->recoveryLastXTime;
4601 652 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4602 :
4603 652 : return xtime;
4604 : }
4605 :
4606 : /*
4607 : * Save timestamp of the next chunk of WAL records to apply.
4608 : *
4609 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4610 : * seen by all backends.
4611 : */
4612 : static void
4613 5704 : SetCurrentChunkStartTime(TimestampTz xtime)
4614 : {
4615 5704 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4616 5704 : XLogRecoveryCtl->currentChunkStartTime = xtime;
4617 5704 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4618 5704 : }
4619 :
4620 : /*
4621 : * Fetch timestamp of latest processed commit/abort record.
4622 : * Startup process maintains an accurate local copy in XLogReceiptTime
4623 : */
4624 : TimestampTz
4625 348 : GetCurrentChunkReplayStartTime(void)
4626 : {
4627 : TimestampTz xtime;
4628 :
4629 348 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4630 348 : xtime = XLogRecoveryCtl->currentChunkStartTime;
4631 348 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4632 :
4633 348 : return xtime;
4634 : }
4635 :
4636 : /*
4637 : * Returns time of receipt of current chunk of XLOG data, as well as
4638 : * whether it was received from streaming replication or from archives.
4639 : */
4640 : void
4641 58 : GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4642 : {
4643 : /*
4644 : * This must be executed in the startup process, since we don't export the
4645 : * relevant state to shared memory.
4646 : */
4647 : Assert(InRecovery);
4648 :
4649 58 : *rtime = XLogReceiptTime;
4650 58 : *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4651 58 : }
4652 :
4653 : /*
4654 : * Note that text field supplied is a parameter name and does not require
4655 : * translation
4656 : */
4657 : void
4658 1140 : RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4659 : {
4660 1140 : if (currValue < minValue)
4661 : {
4662 0 : if (HotStandbyActiveInReplay())
4663 : {
4664 0 : bool warned_for_promote = false;
4665 :
4666 0 : ereport(WARNING,
4667 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4668 : errmsg("hot standby is not possible because of insufficient parameter settings"),
4669 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4670 : param_name,
4671 : currValue,
4672 : minValue)));
4673 :
4674 0 : SetRecoveryPause(true);
4675 :
4676 0 : ereport(LOG,
4677 : (errmsg("recovery has paused"),
4678 : errdetail("If recovery is unpaused, the server will shut down."),
4679 : errhint("You can then restart the server after making the necessary configuration changes.")));
4680 :
4681 0 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
4682 : {
4683 0 : HandleStartupProcInterrupts();
4684 :
4685 0 : if (CheckForStandbyTrigger())
4686 : {
4687 0 : if (!warned_for_promote)
4688 0 : ereport(WARNING,
4689 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4690 : errmsg("promotion is not possible because of insufficient parameter settings"),
4691 :
4692 : /*
4693 : * Repeat the detail from above so it's easy to find
4694 : * in the log.
4695 : */
4696 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4697 : param_name,
4698 : currValue,
4699 : minValue),
4700 : errhint("Restart the server after making the necessary configuration changes.")));
4701 0 : warned_for_promote = true;
4702 : }
4703 :
4704 : /*
4705 : * If recovery pause is requested then set it paused. While
4706 : * we are in the loop, user might resume and pause again so
4707 : * set this every time.
4708 : */
4709 0 : ConfirmRecoveryPaused();
4710 :
4711 : /*
4712 : * We wait on a condition variable that will wake us as soon
4713 : * as the pause ends, but we use a timeout so we can check the
4714 : * above conditions periodically too.
4715 : */
4716 0 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
4717 : WAIT_EVENT_RECOVERY_PAUSE);
4718 : }
4719 0 : ConditionVariableCancelSleep();
4720 : }
4721 :
4722 0 : ereport(FATAL,
4723 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4724 : errmsg("recovery aborted because of insufficient parameter settings"),
4725 : /* Repeat the detail from above so it's easy to find in the log. */
4726 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4727 : param_name,
4728 : currValue,
4729 : minValue),
4730 : errhint("You can restart the server after making the necessary configuration changes.")));
4731 : }
4732 1140 : }
4733 :
4734 :
4735 : /*
4736 : * GUC check_hook for primary_slot_name
4737 : */
4738 : bool
4739 2268 : check_primary_slot_name(char **newval, void **extra, GucSource source)
4740 : {
4741 2268 : if (*newval && strcmp(*newval, "") != 0 &&
4742 282 : !ReplicationSlotValidateName(*newval, WARNING))
4743 0 : return false;
4744 :
4745 2268 : return true;
4746 : }
4747 :
4748 : /*
4749 : * Recovery target settings: Only one of the several recovery_target* settings
4750 : * may be set. Setting a second one results in an error. The global variable
4751 : * recoveryTarget tracks which kind of recovery target was chosen. Other
4752 : * variables store the actual target value (for example a string or a xid).
4753 : * The assign functions of the parameters check whether a competing parameter
4754 : * was already set. But we want to allow setting the same parameter multiple
4755 : * times. We also want to allow unsetting a parameter and setting a different
4756 : * one, so we unset recoveryTarget when the parameter is set to an empty
4757 : * string.
4758 : *
4759 : * XXX this code is broken by design. Throwing an error from a GUC assign
4760 : * hook breaks fundamental assumptions of guc.c. So long as all the variables
4761 : * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4762 : * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4763 : * that we have odd behaviors such as unexpected GUC ordering dependencies.
4764 : */
4765 :
4766 : static void
4767 : pg_attribute_noreturn()
4768 2 : error_multiple_recovery_targets(void)
4769 : {
4770 2 : ereport(ERROR,
4771 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4772 : errmsg("multiple recovery targets specified"),
4773 : errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4774 : }
4775 :
4776 : /*
4777 : * GUC check_hook for recovery_target
4778 : */
4779 : bool
4780 1988 : check_recovery_target(char **newval, void **extra, GucSource source)
4781 : {
4782 1988 : if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4783 : {
4784 0 : GUC_check_errdetail("The only allowed value is \"immediate\".");
4785 0 : return false;
4786 : }
4787 1988 : return true;
4788 : }
4789 :
4790 : /*
4791 : * GUC assign_hook for recovery_target
4792 : */
4793 : void
4794 1988 : assign_recovery_target(const char *newval, void *extra)
4795 : {
4796 1988 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4797 0 : recoveryTarget != RECOVERY_TARGET_IMMEDIATE)
4798 0 : error_multiple_recovery_targets();
4799 :
4800 1988 : if (newval && strcmp(newval, "") != 0)
4801 2 : recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
4802 : else
4803 1986 : recoveryTarget = RECOVERY_TARGET_UNSET;
4804 1988 : }
4805 :
4806 : /*
4807 : * GUC check_hook for recovery_target_lsn
4808 : */
4809 : bool
4810 1998 : check_recovery_target_lsn(char **newval, void **extra, GucSource source)
4811 : {
4812 1998 : if (strcmp(*newval, "") != 0)
4813 : {
4814 : XLogRecPtr lsn;
4815 : XLogRecPtr *myextra;
4816 16 : bool have_error = false;
4817 :
4818 16 : lsn = pg_lsn_in_internal(*newval, &have_error);
4819 16 : if (have_error)
4820 0 : return false;
4821 :
4822 16 : myextra = (XLogRecPtr *) guc_malloc(ERROR, sizeof(XLogRecPtr));
4823 16 : *myextra = lsn;
4824 16 : *extra = myextra;
4825 : }
4826 1998 : return true;
4827 : }
4828 :
4829 : /*
4830 : * GUC assign_hook for recovery_target_lsn
4831 : */
4832 : void
4833 1998 : assign_recovery_target_lsn(const char *newval, void *extra)
4834 : {
4835 1998 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4836 0 : recoveryTarget != RECOVERY_TARGET_LSN)
4837 0 : error_multiple_recovery_targets();
4838 :
4839 1998 : if (newval && strcmp(newval, "") != 0)
4840 : {
4841 16 : recoveryTarget = RECOVERY_TARGET_LSN;
4842 16 : recoveryTargetLSN = *((XLogRecPtr *) extra);
4843 : }
4844 : else
4845 1982 : recoveryTarget = RECOVERY_TARGET_UNSET;
4846 1998 : }
4847 :
4848 : /*
4849 : * GUC check_hook for recovery_target_name
4850 : */
4851 : bool
4852 2000 : check_recovery_target_name(char **newval, void **extra, GucSource source)
4853 : {
4854 : /* Use the value of newval directly */
4855 2000 : if (strlen(*newval) >= MAXFNAMELEN)
4856 : {
4857 0 : GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4858 : "recovery_target_name", MAXFNAMELEN - 1);
4859 0 : return false;
4860 : }
4861 2000 : return true;
4862 : }
4863 :
4864 : /*
4865 : * GUC assign_hook for recovery_target_name
4866 : */
4867 : void
4868 2000 : assign_recovery_target_name(const char *newval, void *extra)
4869 : {
4870 2000 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4871 0 : recoveryTarget != RECOVERY_TARGET_NAME)
4872 0 : error_multiple_recovery_targets();
4873 :
4874 2000 : if (newval && strcmp(newval, "") != 0)
4875 : {
4876 12 : recoveryTarget = RECOVERY_TARGET_NAME;
4877 12 : recoveryTargetName = newval;
4878 : }
4879 : else
4880 1988 : recoveryTarget = RECOVERY_TARGET_UNSET;
4881 2000 : }
4882 :
4883 : /*
4884 : * GUC check_hook for recovery_target_time
4885 : *
4886 : * The interpretation of the recovery_target_time string can depend on the
4887 : * time zone setting, so we need to wait until after all GUC processing is
4888 : * done before we can do the final parsing of the string. This check function
4889 : * only does a parsing pass to catch syntax errors, but we store the string
4890 : * and parse it again when we need to use it.
4891 : */
4892 : bool
4893 1992 : check_recovery_target_time(char **newval, void **extra, GucSource source)
4894 : {
4895 1992 : if (strcmp(*newval, "") != 0)
4896 : {
4897 : /* reject some special values */
4898 6 : if (strcmp(*newval, "now") == 0 ||
4899 6 : strcmp(*newval, "today") == 0 ||
4900 6 : strcmp(*newval, "tomorrow") == 0 ||
4901 6 : strcmp(*newval, "yesterday") == 0)
4902 : {
4903 0 : return false;
4904 : }
4905 :
4906 : /*
4907 : * parse timestamp value (see also timestamptz_in())
4908 : */
4909 : {
4910 6 : char *str = *newval;
4911 : fsec_t fsec;
4912 : struct pg_tm tt,
4913 6 : *tm = &tt;
4914 : int tz;
4915 : int dtype;
4916 : int nf;
4917 : int dterr;
4918 : char *field[MAXDATEFIELDS];
4919 : int ftype[MAXDATEFIELDS];
4920 : char workbuf[MAXDATELEN + MAXDATEFIELDS];
4921 : DateTimeErrorExtra dtextra;
4922 : TimestampTz timestamp;
4923 :
4924 6 : dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4925 : field, ftype, MAXDATEFIELDS, &nf);
4926 6 : if (dterr == 0)
4927 6 : dterr = DecodeDateTime(field, ftype, nf,
4928 : &dtype, tm, &fsec, &tz, &dtextra);
4929 6 : if (dterr != 0)
4930 0 : return false;
4931 6 : if (dtype != DTK_DATE)
4932 0 : return false;
4933 :
4934 6 : if (tm2timestamp(tm, fsec, &tz, ×tamp) != 0)
4935 : {
4936 0 : GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
4937 0 : return false;
4938 : }
4939 : }
4940 : }
4941 1992 : return true;
4942 : }
4943 :
4944 : /*
4945 : * GUC assign_hook for recovery_target_time
4946 : */
4947 : void
4948 1992 : assign_recovery_target_time(const char *newval, void *extra)
4949 : {
4950 1992 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4951 2 : recoveryTarget != RECOVERY_TARGET_TIME)
4952 2 : error_multiple_recovery_targets();
4953 :
4954 1990 : if (newval && strcmp(newval, "") != 0)
4955 4 : recoveryTarget = RECOVERY_TARGET_TIME;
4956 : else
4957 1986 : recoveryTarget = RECOVERY_TARGET_UNSET;
4958 1990 : }
4959 :
4960 : /*
4961 : * GUC check_hook for recovery_target_timeline
4962 : */
4963 : bool
4964 1988 : check_recovery_target_timeline(char **newval, void **extra, GucSource source)
4965 : {
4966 : RecoveryTargetTimeLineGoal rttg;
4967 : RecoveryTargetTimeLineGoal *myextra;
4968 :
4969 1988 : if (strcmp(*newval, "current") == 0)
4970 0 : rttg = RECOVERY_TARGET_TIMELINE_CONTROLFILE;
4971 1988 : else if (strcmp(*newval, "latest") == 0)
4972 1988 : rttg = RECOVERY_TARGET_TIMELINE_LATEST;
4973 : else
4974 : {
4975 0 : rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
4976 :
4977 0 : errno = 0;
4978 0 : strtoul(*newval, NULL, 0);
4979 0 : if (errno == EINVAL || errno == ERANGE)
4980 : {
4981 0 : GUC_check_errdetail("\"recovery_target_timeline\" is not a valid number.");
4982 0 : return false;
4983 : }
4984 : }
4985 :
4986 1988 : myextra = (RecoveryTargetTimeLineGoal *) guc_malloc(ERROR, sizeof(RecoveryTargetTimeLineGoal));
4987 1988 : *myextra = rttg;
4988 1988 : *extra = myextra;
4989 :
4990 1988 : return true;
4991 : }
4992 :
4993 : /*
4994 : * GUC assign_hook for recovery_target_timeline
4995 : */
4996 : void
4997 1988 : assign_recovery_target_timeline(const char *newval, void *extra)
4998 : {
4999 1988 : recoveryTargetTimeLineGoal = *((RecoveryTargetTimeLineGoal *) extra);
5000 1988 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
5001 0 : recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5002 : else
5003 1988 : recoveryTargetTLIRequested = 0;
5004 1988 : }
5005 :
5006 : /*
5007 : * GUC check_hook for recovery_target_xid
5008 : */
5009 : bool
5010 1988 : check_recovery_target_xid(char **newval, void **extra, GucSource source)
5011 : {
5012 1988 : if (strcmp(*newval, "") != 0)
5013 : {
5014 : TransactionId xid;
5015 : TransactionId *myextra;
5016 :
5017 2 : errno = 0;
5018 2 : xid = (TransactionId) strtou64(*newval, NULL, 0);
5019 2 : if (errno == EINVAL || errno == ERANGE)
5020 0 : return false;
5021 :
5022 2 : myextra = (TransactionId *) guc_malloc(ERROR, sizeof(TransactionId));
5023 2 : *myextra = xid;
5024 2 : *extra = myextra;
5025 : }
5026 1988 : return true;
5027 : }
5028 :
5029 : /*
5030 : * GUC assign_hook for recovery_target_xid
5031 : */
5032 : void
5033 1988 : assign_recovery_target_xid(const char *newval, void *extra)
5034 : {
5035 1988 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
5036 0 : recoveryTarget != RECOVERY_TARGET_XID)
5037 0 : error_multiple_recovery_targets();
5038 :
5039 1988 : if (newval && strcmp(newval, "") != 0)
5040 : {
5041 2 : recoveryTarget = RECOVERY_TARGET_XID;
5042 2 : recoveryTargetXid = *((TransactionId *) extra);
5043 : }
5044 : else
5045 1986 : recoveryTarget = RECOVERY_TARGET_UNSET;
5046 1988 : }
|