Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * xlogrecovery.c
4 : * Functions for WAL recovery, standby mode
5 : *
6 : * This source file contains functions controlling WAL recovery.
7 : * InitWalRecovery() initializes the system for crash or archive recovery,
8 : * or standby mode, depending on configuration options and the state of
9 : * the control file and possible backup label file. PerformWalRecovery()
10 : * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 : * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 : * and prepares information needed to initialize the WAL for writes. In
13 : * addition to these three main functions, there are a bunch of functions
14 : * for interrogating recovery state and controlling the recovery process.
15 : *
16 : *
17 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
18 : * Portions Copyright (c) 1994, Regents of the University of California
19 : *
20 : * src/backend/access/transam/xlogrecovery.c
21 : *
22 : *-------------------------------------------------------------------------
23 : */
24 :
25 : #include "postgres.h"
26 :
27 : #include <ctype.h>
28 : #include <time.h>
29 : #include <sys/stat.h>
30 : #include <sys/time.h>
31 : #include <unistd.h>
32 :
33 : #include "access/timeline.h"
34 : #include "access/transam.h"
35 : #include "access/xact.h"
36 : #include "access/xlog_internal.h"
37 : #include "access/xlogarchive.h"
38 : #include "access/xlogprefetcher.h"
39 : #include "access/xlogreader.h"
40 : #include "access/xlogrecovery.h"
41 : #include "access/xlogutils.h"
42 : #include "access/xlogwait.h"
43 : #include "backup/basebackup.h"
44 : #include "catalog/pg_control.h"
45 : #include "commands/tablespace.h"
46 : #include "common/file_utils.h"
47 : #include "miscadmin.h"
48 : #include "nodes/miscnodes.h"
49 : #include "pgstat.h"
50 : #include "postmaster/bgwriter.h"
51 : #include "postmaster/startup.h"
52 : #include "replication/slot.h"
53 : #include "replication/slotsync.h"
54 : #include "replication/walreceiver.h"
55 : #include "storage/fd.h"
56 : #include "storage/ipc.h"
57 : #include "storage/latch.h"
58 : #include "storage/pmsignal.h"
59 : #include "storage/procarray.h"
60 : #include "storage/spin.h"
61 : #include "storage/subsystems.h"
62 : #include "utils/datetime.h"
63 : #include "utils/fmgrprotos.h"
64 : #include "utils/guc_hooks.h"
65 : #include "utils/pgstat_internal.h"
66 : #include "utils/pg_lsn.h"
67 : #include "utils/ps_status.h"
68 : #include "utils/pg_rusage.h"
69 : #include "utils/wait_event.h"
70 :
71 : /* Unsupported old recovery command file names (relative to $PGDATA) */
72 : #define RECOVERY_COMMAND_FILE "recovery.conf"
73 : #define RECOVERY_COMMAND_DONE "recovery.done"
74 :
75 : /*
76 : * GUC support
   : *
   : * recovery_target_action_options pairs each accepted option name (pause,
   : * promote, shutdown) with its RECOVERY_TARGET_ACTION_* constant. The last
   : * entry has a NULL name and closes the table; presumably that is the
   : * end-of-list marker the enum GUC code looks for (TODO confirm).
77 : */
78 : const struct config_enum_entry recovery_target_action_options[] = {
79 : {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
80 : {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
81 : {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
82 : {NULL, 0, false}
83 : };
84 :
85 : /* options formerly taken from recovery.conf for archive recovery */
86 : char *recoveryRestoreCommand = NULL;
87 : char *recoveryEndCommand = NULL;
88 : char *archiveCleanupCommand = NULL;
89 : RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
90 : bool recoveryTargetInclusive = true;
91 : int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
92 : TransactionId recoveryTargetXid;
93 : char *recovery_target_time_string;
94 : TimestampTz recoveryTargetTime;
95 : const char *recoveryTargetName;
96 : XLogRecPtr recoveryTargetLSN;
97 : int recovery_min_apply_delay = 0;
98 :
99 : /* options formerly taken from recovery.conf for XLOG streaming */
100 : char *PrimaryConnInfo = NULL;
101 : char *PrimarySlotName = NULL;
102 : bool wal_receiver_create_temp_slot = false;
103 :
104 : /*
105 : * recoveryTargetTimeLineGoal: what the user requested, if any
106 : *
107 : * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
108 : *
109 : * recoveryTargetTLI: the currently understood target timeline; it can change during recovery
110 : *
111 : * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
112 : * the timelines of its known parents, newest first (so recoveryTargetTLI is
113 : * always the first list member). Only these TLIs are expected to be seen in
114 : * the WAL segments we read, and indeed only these TLIs will be considered as
115 : * candidate WAL files to open at all.
116 : *
117 : * curFileTLI: the TLI appearing in the name of the current input WAL file.
118 : * (This is not necessarily the same as the timeline from which we are
119 : * replaying WAL, which StartupXLOG calls replayTLI, because we could be
120 : * scanning data that was copied from an ancestor timeline when the current
121 : * file was created.) During a sequential scan we do not allow this value
122 : * to decrease.
123 : */
124 : RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
125 : TimeLineID recoveryTargetTLIRequested = 0;
126 : TimeLineID recoveryTargetTLI = 0;
127 : static List *expectedTLEs;
128 : static TimeLineID curFileTLI;
129 :
130 : /*
131 : * When ArchiveRecoveryRequested is set, archive recovery was requested,
132 : * i.e. signal files were present. When InArchiveRecovery is set, we are
133 : * currently recovering using offline XLOG archives. These variables are only
134 : * valid in the startup process.
135 : *
136 : * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
137 : * currently performing crash recovery using only XLOG files in pg_wal, but
138 : * will switch to using offline XLOG archives as soon as we reach the end of
139 : * WAL in pg_wal.
140 : */
141 : bool ArchiveRecoveryRequested = false;
142 : bool InArchiveRecovery = false;
143 :
144 : /*
145 : * When StandbyModeRequested is set, standby mode was requested, i.e.
146 : * standby.signal file was present. When StandbyMode is set, we are currently
147 : * in standby mode. These variables are only valid in the startup process.
148 : * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
149 : */
150 : static bool StandbyModeRequested = false;
151 : bool StandbyMode = false;
152 :
153 : /* was a signal file present at startup? */
154 : static bool standby_signal_file_found = false;
155 : static bool recovery_signal_file_found = false;
156 :
157 : /*
158 : * CheckPointLoc is the position of the checkpoint record that determines
159 : * where to start the replay. It comes from the backup label file or the
160 : * control file.
161 : *
162 : * RedoStartLSN is the checkpoint's REDO location, also from the backup label
163 : * file or the control file. In standby mode, XLOG streaming usually starts
164 : * from the position where an invalid record was found. But if we fail to
165 : * read even the initial checkpoint record, we use the REDO location instead
166 : * of the checkpoint location as the start position of XLOG streaming.
167 : * Otherwise we would have to jump backwards to the REDO location after
168 : * reading the checkpoint record, because the REDO record can precede the
169 : * checkpoint record.
170 : */
171 : static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
172 : static TimeLineID CheckPointTLI = 0;
173 : static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
174 : static TimeLineID RedoStartTLI = 0;
175 :
176 : /*
177 : * Local copy of SharedHotStandbyActive variable. False actually means "not
178 : * known, need to check the shared state".
179 : */
180 : static bool LocalHotStandbyActive = false;
181 :
182 : /*
183 : * Local copy of SharedPromoteIsTriggered variable. False actually means "not
184 : * known, need to check the shared state".
185 : */
186 : static bool LocalPromoteIsTriggered = false;
187 :
188 : /* Has the recovery code requested a walreceiver wakeup? */
189 : static bool doRequestWalReceiverReply;
190 :
191 : /* XLogReader object used to parse the WAL records */
192 : static XLogReaderState *xlogreader = NULL;
193 :
194 : /* XLogPrefetcher object used to consume WAL records with read-ahead */
195 : static XLogPrefetcher *xlogprefetcher = NULL;
196 :
197 : /* Parameters passed down from ReadRecord to the XLogPageRead callback.
    : *
    : * InitWalRecovery allocates one zeroed instance of this struct and hands
    : * it to XLogReaderAllocate as the reader private data.
    : */
198 : typedef struct XLogPageReadPrivate
199 : {
200 : int emode; /* message level given to ReadRecord (InitWalRecovery passes LOG) */
201 : bool fetching_ckpt; /* are we fetching a checkpoint record? */
202 : bool randAccess; /* NOTE(review): presumably marks a non-sequential read; confirm */
203 : TimeLineID replayTLI; /* replay timeline given to ReadRecord */
204 : } XLogPageReadPrivate;
205 :
206 : /* flag to tell XLogPageRead that we have started replaying */
207 : static bool InRedo = false;
208 :
209 : /*
210 : * Codes indicating where we got a WAL file from during recovery, or where
211 : * to attempt to get one.
    : *
    : * XLOG_FROM_ANY is explicitly zero; the remaining codes follow in
    : * declaration order.
212 : */
213 : typedef enum
214 : {
215 : XLOG_FROM_ANY = 0, /* request to read WAL from any source */
216 : XLOG_FROM_ARCHIVE, /* restored using restore_command */
217 : XLOG_FROM_PG_WAL, /* existing file in pg_wal */
218 : XLOG_FROM_STREAM, /* streamed from primary */
219 : } XLogSource;
220 :
221 : /* human-readable names for XLogSources, for debugging output
    : *
    : * The strings are listed in the same order as the XLogSource codes, so
    : * keep the two in step when adding a source. NOTE(review): presumably the
    : * array is indexed by an XLogSource value; confirm at the use sites.
    : */
222 : static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
223 :
224 : /*
225 : * readFile is -1 or a kernel FD for the log file segment that's currently
226 : * open for reading. readSegNo identifies the segment. readOff is the offset
227 : * of the page just read, readLen indicates how much of it has been read into
228 : * readBuf, and readSource indicates where we got the currently open file from.
229 : *
230 : * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
231 : * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
232 : * worthwhile, since the XLOG is not read by general-purpose sessions.
233 : */
234 : static int readFile = -1;
235 : static XLogSegNo readSegNo = 0;
236 : static uint32 readOff = 0;
237 : static uint32 readLen = 0;
238 : static XLogSource readSource = XLOG_FROM_ANY;
239 :
240 : /*
241 : * Keeps track of which source we're currently reading from. This is
242 : * different from readSource in that this is always set, even when we don't
243 : * currently have a WAL file open. If lastSourceFailed is set, our last
244 : * attempt to read from currentSource failed, and we should try another source
245 : * next.
246 : *
247 : * pendingWalRcvRestart is set when a config change occurs that requires a
248 : * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
249 : */
250 : static XLogSource currentSource = XLOG_FROM_ANY;
251 : static bool lastSourceFailed = false;
252 : static bool pendingWalRcvRestart = false;
253 :
254 : /*
255 : * These variables track when we last obtained some WAL data to process,
256 : * and where we got it from. (XLogReceiptSource is initially the same as
257 : * readSource, but readSource gets reset to zero when we don't have data
258 : * to process right now. It is also different from currentSource, which
259 : * also changes when we try to read from a source and fail, while
260 : * XLogReceiptSource tracks where we last successfully read some WAL.)
261 : */
262 : static TimestampTz XLogReceiptTime = 0;
263 : static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
264 :
265 : /* Local copy of WalRcv->flushedUpto */
266 : static XLogRecPtr flushedUpto = InvalidXLogRecPtr;
267 : static TimeLineID receiveTLI = 0;
268 :
269 : /*
270 : * Copy of minRecoveryPoint and backupEndPoint from the control file.
271 : *
272 : * In order to reach consistency, we must replay the WAL up to
273 : * minRecoveryPoint. If backupEndRequired is true, we must also reach
274 : * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
275 : * to backupStartPoint.
276 : *
277 : * Note: In archive recovery, after consistency has been reached, the
278 : * functions in xlog.c will start updating minRecoveryPoint in the control
279 : * file. But this copy of minRecoveryPoint variable reflects the value at the
280 : * beginning of recovery, and is *not* updated after consistency is reached.
281 : */
282 : static XLogRecPtr minRecoveryPoint;
283 : static TimeLineID minRecoveryPointTLI;
284 :
285 : static XLogRecPtr backupStartPoint;
286 : static XLogRecPtr backupEndPoint;
287 : static bool backupEndRequired = false;
288 :
289 : /*
290 : * Have we reached a consistent database state? In crash recovery, we have
291 : * to replay all the WAL, so reachedConsistency is never set. During archive
292 : * recovery, the database is consistent once minRecoveryPoint is reached.
293 : *
294 : * Consistent state means that the system is internally consistent, all
295 : * the WAL has been replayed up to a certain point, and importantly, there
296 : * is no trace of later actions on disk.
297 : *
298 : * This flag is used only by the startup process and postmaster. When
299 : * minRecoveryPoint is reached, the startup process sets it to true and
300 : * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
301 : * which then sets it to true upon receiving the signal.
302 : */
303 : bool reachedConsistency = false;
304 :
305 : /* Buffers dedicated to consistency checks of size BLCKSZ */
306 : static char *replay_image_masked = NULL;
307 : static char *primary_image_masked = NULL;
308 :
309 : XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
310 :
311 : static void XLogRecoveryShmemRequest(void *arg);
312 : static void XLogRecoveryShmemInit(void *arg);
313 : /*
    : * Shared memory callbacks for WAL recovery: request_fn asks for the
    : * XLogRecoveryCtl struct and init_fn zeroes and initializes it.
    : * NOTE(review): when each callback runs is decided by the shmem
    : * subsystem and is not visible in this file section.
    : */
314 : const ShmemCallbacks XLogRecoveryShmemCallbacks = {
315 : .request_fn = XLogRecoveryShmemRequest,
316 : .init_fn = XLogRecoveryShmemInit,
317 : };
318 :
319 : /*
320 : * abortedRecPtr is the start pointer of a broken record at end of WAL when
321 : * recovery completes; missingContrecPtr is the location of the first
322 : * contrecord that went missing. See CreateOverwriteContrecordRecord for
323 : * details.
324 : */
325 : static XLogRecPtr abortedRecPtr;
326 : static XLogRecPtr missingContrecPtr;
327 :
328 : /*
329 : * if recoveryStopsBefore/After returns true, it saves information of the stop
330 : * point here
331 : */
332 : static TransactionId recoveryStopXid;
333 : static TimestampTz recoveryStopTime;
334 : static XLogRecPtr recoveryStopLSN;
335 : static char recoveryStopName[MAXFNAMELEN];
336 : static bool recoveryStopAfter;
337 :
338 : /* prototypes for local functions */
339 : static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
340 :
341 : static void EnableStandbyMode(void);
342 : static void readRecoverySignalFile(void);
343 : static void validateRecoveryParameters(void);
344 : static bool read_backup_label(XLogRecPtr *checkPointLoc,
345 : TimeLineID *backupLabelTLI,
346 : bool *backupEndRequired, bool *backupFromStandby);
347 : static bool read_tablespace_map(List **tablespaces);
348 :
349 : static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
350 : static void CheckRecoveryConsistency(void);
351 : static void rm_redo_error_callback(void *arg);
352 : #ifdef WAL_DEBUG
353 : static void xlog_outrec(StringInfo buf, XLogReaderState *record);
354 : #endif
355 : static void xlog_block_info(StringInfo buf, XLogReaderState *record);
356 : static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
357 : TimeLineID prevTLI, TimeLineID replayTLI);
358 : static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
359 : static void verifyBackupPageConsistency(XLogReaderState *record);
360 :
361 : static bool recoveryStopsBefore(XLogReaderState *record);
362 : static bool recoveryStopsAfter(XLogReaderState *record);
363 : static char *getRecoveryStopReason(void);
364 : static void recoveryPausesHere(bool endOfRecovery);
365 : static bool recoveryApplyDelay(XLogReaderState *record);
366 : static void ConfirmRecoveryPaused(void);
367 :
368 : static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher,
369 : int emode, bool fetching_ckpt,
370 : TimeLineID replayTLI);
371 :
372 : static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
373 : int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
374 : static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
375 : bool randAccess,
376 : bool fetching_ckpt,
377 : XLogRecPtr tliRecPtr,
378 : TimeLineID replayTLI,
379 : XLogRecPtr replayLSN,
380 : bool nonblocking);
381 : static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
382 : static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
383 : XLogRecPtr RecPtr, TimeLineID replayTLI);
384 : static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
385 : static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
386 : XLogSource source, bool notfoundOk);
387 : static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source);
388 :
389 : static bool CheckForStandbyTrigger(void);
390 : static void SetPromoteIsTriggered(void);
391 : static bool HotStandbyActiveInReplay(void);
392 :
393 : static void SetCurrentChunkStartTime(TimestampTz xtime);
394 : static void SetLatestXTime(TimestampTz xtime);
395 :
396 : /*
397 : * Register shared memory for WAL recovery
    : *
    : * Requests a region named XLOG Recovery Ctl, sized for one
    : * XLogRecoveryCtlData, and passes the address of the XLogRecoveryCtl
    : * pointer, presumably so the shmem code can store the region address
    : * there (TODO confirm against ShmemRequestStruct). The arg parameter is
    : * not used.
398 : */
399 : static void
400 1248 : XLogRecoveryShmemRequest(void *arg)
401 : {
402 1248 : ShmemRequestStruct(.name = "XLOG Recovery Ctl",
403 : .size = sizeof(XLogRecoveryCtlData),
404 : .ptr = (void **) &XLogRecoveryCtl,
405 : );
406 1248 : }
407 : /*
    : * Initialize the WAL recovery shared state.
    : *
    : * Zeroes the whole XLogRecoveryCtlData that XLogRecoveryCtl points to,
    : * then sets up its spinlock (info_lck), its shared latch
    : * (recoveryWakeupLatch) and its condition variable (recoveryNotPausedCV).
    : * The memset comes first so the three init calls are not overwritten.
    : * The arg parameter is not used.
    : */
408 : static void
409 1245 : XLogRecoveryShmemInit(void *arg)
410 : {
411 1245 : memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
412 :
413 1245 : SpinLockInit(&XLogRecoveryCtl->info_lck);
414 1245 : InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
415 1245 : ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
416 1245 : }
417 :
418 : /*
419 : * A thin wrapper to enable StandbyMode and do other preparatory work as
420 : * needed.
    : *
    : * Sets the StandbyMode flag and turns off the startup progress timeout.
    : * This function does not look at StandbyModeRequested itself; the callers
    : * in InitWalRecovery test that flag before calling it.
421 : */
422 : static void
423 119 : EnableStandbyMode(void)
424 : {
425 119 : StandbyMode = true;
426 :
427 : /*
428 : * To avoid server log bloat, we don't report recovery progress in a
429 : * standby as it will always be in recovery unless promoted. We disable
430 : * startup progress timeout in standby mode to avoid calling
431 : * startup_progress_timeout_handler() unnecessarily.
432 : */
433 119 : disable_startup_progress_timeout();
434 119 : }
435 :
436 : /*
437 : * Prepare the system for WAL recovery, if needed.
438 : *
439 : * This is called by StartupXLOG() which coordinates the server startup
440 : * sequence. This function analyzes the control file and the backup label
441 : * file, if any, and figures out whether we need to perform crash recovery or
442 : * archive recovery, and how far we need to replay the WAL to reach a
443 : * consistent state.
444 : *
445 : * This doesn't yet change the on-disk state, except for creating the symlinks
446 : * from table space map file if any, and for fetching WAL files needed to find
447 : * the checkpoint record. On entry, the caller has already read the control
448 : * file into memory, and passes it as argument. This function updates it to
449 : * reflect the recovery state, and the caller is expected to write it back to
450 : * disk after initializing other subsystems, but before calling
451 : * PerformWalRecovery().
452 : *
453 : * This initializes some global variables like ArchiveRecoveryRequested, and
454 : * StandbyModeRequested and InRecovery.
455 : */
456 : void
457 1088 : InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
458 : bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
459 : {
460 : XLogPageReadPrivate *private;
461 : struct stat st;
462 : bool wasShutdown;
463 : XLogRecord *record;
464 : DBState dbstate_at_startup;
465 1088 : bool haveTblspcMap = false;
466 1088 : bool haveBackupLabel = false;
467 : CheckPoint checkPoint;
468 1088 : bool backupFromStandby = false;
469 :
470 1088 : dbstate_at_startup = ControlFile->state;
471 :
472 : /*
473 : * Initialize on the assumption we want to recover to the latest timeline
474 : * that's active according to pg_control.
475 : */
476 1088 : if (ControlFile->minRecoveryPointTLI >
477 1088 : ControlFile->checkPointCopy.ThisTimeLineID)
478 2 : recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
479 : else
480 1086 : recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
481 :
482 : /*
483 : * Check for signal files, and if so set up state for offline recovery
484 : */
485 1088 : readRecoverySignalFile();
486 1088 : validateRecoveryParameters();
487 :
488 : /*
489 : * Take ownership of the wakeup latch if we're going to sleep during
490 : * recovery, if required.
491 : */
492 1088 : if (ArchiveRecoveryRequested)
493 124 : OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
494 :
495 : /*
496 : * Set the WAL reading processor now, as it will be needed when reading
497 : * the checkpoint record required (backup_label or not).
498 : */
499 1088 : private = palloc0_object(XLogPageReadPrivate);
500 1088 : xlogreader =
501 1088 : XLogReaderAllocate(wal_segment_size, NULL,
502 1088 : XL_ROUTINE(.page_read = &XLogPageRead,
503 : .segment_open = NULL,
504 : .segment_close = wal_segment_close),
505 : private);
506 1088 : if (!xlogreader)
507 0 : ereport(ERROR,
508 : (errcode(ERRCODE_OUT_OF_MEMORY),
509 : errmsg("out of memory"),
510 : errdetail("Failed while allocating a WAL reading processor.")));
511 1088 : xlogreader->system_identifier = ControlFile->system_identifier;
512 :
513 : /*
514 : * Set the WAL decode buffer size. This limits how far ahead we can read
515 : * in the WAL.
516 : */
517 1088 : XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size);
518 :
519 : /* Create a WAL prefetcher. */
520 1088 : xlogprefetcher = XLogPrefetcherAllocate(xlogreader);
521 :
522 : /*
523 : * Allocate two page buffers dedicated to WAL consistency checks. We do
524 : * it this way, rather than just making static arrays, for two reasons:
525 : * (1) no need to waste the storage in most instantiations of the backend;
526 : * (2) a static char array isn't guaranteed to have any particular
527 : * alignment, whereas palloc() will provide MAXALIGN'd storage.
528 : */
529 1088 : replay_image_masked = (char *) palloc(BLCKSZ);
530 1088 : primary_image_masked = (char *) palloc(BLCKSZ);
531 :
532 : /*
533 : * Read the backup_label file. We want to run this part of the recovery
534 : * process after checking for signal files and after performing validation
535 : * of the recovery parameters.
536 : */
537 1088 : if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
538 : &backupFromStandby))
539 : {
540 86 : List *tablespaces = NIL;
541 :
542 : /*
543 : * Archive recovery was requested, and thanks to the backup label
544 : * file, we know how far we need to replay to reach consistency. Enter
545 : * archive recovery directly.
546 : */
547 86 : InArchiveRecovery = true;
548 86 : if (StandbyModeRequested)
549 74 : EnableStandbyMode();
550 :
551 : /*
552 : * Omitting backup_label when creating a new replica, PITR node etc.
553 : * unfortunately is a common cause of corruption. Logging that
554 : * backup_label was used makes it a bit easier to exclude that as the
555 : * cause of observed corruption.
556 : *
557 : * Do so before we try to read the checkpoint record (which can fail),
558 : * as otherwise it can be hard to understand why a checkpoint other
559 : * than ControlFile->checkPoint is used.
560 : */
561 86 : ereport(LOG,
562 : errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
563 : LSN_FORMAT_ARGS(RedoStartLSN),
564 : LSN_FORMAT_ARGS(CheckPointLoc),
565 : CheckPointTLI));
566 :
567 : /*
568 : * When a backup_label file is present, we want to roll forward from
569 : * the checkpoint it identifies, rather than using pg_control.
570 : */
571 86 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
572 : CheckPointTLI);
573 86 : if (record != NULL)
574 : {
575 86 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
576 86 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
577 86 : ereport(DEBUG1,
578 : errmsg_internal("checkpoint record is at %X/%08X",
579 : LSN_FORMAT_ARGS(CheckPointLoc)));
580 86 : InRecovery = true; /* force recovery even if SHUTDOWNED */
581 :
582 : /*
583 : * Make sure that REDO location exists. This may not be the case
584 : * if there was a crash during an online backup, which left a
585 : * backup_label around that references a WAL segment that's
586 : * already been archived.
587 : */
588 86 : if (checkPoint.redo < CheckPointLoc)
589 : {
590 86 : XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
591 86 : if (!ReadRecord(xlogprefetcher, LOG, false,
592 : checkPoint.ThisTimeLineID))
593 0 : ereport(FATAL,
594 : errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
595 : LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
596 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
597 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
598 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
599 : DataDir, DataDir, DataDir, DataDir));
600 : }
601 : }
602 : else
603 : {
604 0 : ereport(FATAL,
605 : errmsg("could not locate required checkpoint record at %X/%08X",
606 : LSN_FORMAT_ARGS(CheckPointLoc)),
607 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
608 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
609 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
610 : DataDir, DataDir, DataDir, DataDir));
611 : wasShutdown = false; /* keep compiler quiet */
612 : }
613 :
614 : /* Read the tablespace_map file if present and create symlinks. */
615 86 : if (read_tablespace_map(&tablespaces))
616 : {
617 : ListCell *lc;
618 :
619 4 : foreach(lc, tablespaces)
620 : {
621 2 : tablespaceinfo *ti = lfirst(lc);
622 : char *linkloc;
623 :
624 2 : linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
625 :
626 : /*
627 : * Remove the existing symlink if any and Create the symlink
628 : * under PGDATA.
629 : */
630 2 : remove_tablespace_symlink(linkloc);
631 :
632 2 : if (symlink(ti->path, linkloc) < 0)
633 0 : ereport(ERROR,
634 : (errcode_for_file_access(),
635 : errmsg("could not create symbolic link \"%s\": %m",
636 : linkloc)));
637 :
638 2 : pfree(ti->path);
639 2 : pfree(ti);
640 : }
641 :
642 : /* tell the caller to delete it later */
643 2 : haveTblspcMap = true;
644 : }
645 :
646 : /* tell the caller to delete it later */
647 86 : haveBackupLabel = true;
648 : }
649 : else
650 : {
651 : /* No backup_label file has been found if we are here. */
652 :
653 : /*
654 : * If tablespace_map file is present without backup_label file, there
655 : * is no use of such file. There is no harm in retaining it, but it
656 : * is better to get rid of the map file so that we don't have any
657 : * redundant file in data directory and it will avoid any sort of
658 : * confusion. It seems prudent though to just rename the file out of
659 : * the way rather than delete it completely, also we ignore any error
660 : * that occurs in rename operation as even if map file is present
661 : * without backup_label file, it is harmless.
662 : */
663 1002 : if (stat(TABLESPACE_MAP, &st) == 0)
664 : {
665 1 : unlink(TABLESPACE_MAP_OLD);
666 1 : if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
667 1 : ereport(LOG,
668 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
669 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
670 : errdetail("File \"%s\" was renamed to \"%s\".",
671 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
672 : else
673 0 : ereport(LOG,
674 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
675 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
676 : errdetail("Could not rename file \"%s\" to \"%s\": %m.",
677 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
678 : }
679 :
680 : /*
681 : * It's possible that archive recovery was requested, but we don't
682 : * know how far we need to replay the WAL before we reach consistency.
683 : * This can happen for example if a base backup is taken from a
684 : * running server using an atomic filesystem snapshot, without calling
685 : * pg_backup_start/stop. Or if you just kill a running primary server
686 : * and put it into archive recovery by creating a recovery signal
687 : * file.
688 : *
689 : * Our strategy in that case is to perform crash recovery first,
690 : * replaying all the WAL present in pg_wal, and only enter archive
691 : * recovery after that.
692 : *
693 : * But usually we already know how far we need to replay the WAL (up
694 : * to minRecoveryPoint, up to backupEndPoint, or until we see an
695 : * end-of-backup record), and we can enter archive recovery directly.
696 : */
697 1002 : if (ArchiveRecoveryRequested &&
698 45 : (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) ||
699 9 : ControlFile->backupEndRequired ||
700 9 : XLogRecPtrIsValid(ControlFile->backupEndPoint) ||
701 9 : ControlFile->state == DB_SHUTDOWNED))
702 : {
703 44 : InArchiveRecovery = true;
704 44 : if (StandbyModeRequested)
705 44 : EnableStandbyMode();
706 : }
707 :
708 : /*
709 : * For the same reason as when starting up with backup_label present,
710 : * emit a log message when we continue initializing from a base
711 : * backup.
712 : */
713 1002 : if (XLogRecPtrIsValid(ControlFile->backupStartPoint))
714 0 : ereport(LOG,
715 : errmsg("restarting backup recovery with redo LSN %X/%08X",
716 : LSN_FORMAT_ARGS(ControlFile->backupStartPoint)));
717 :
718 : /* Get the last valid checkpoint record. */
719 1002 : CheckPointLoc = ControlFile->checkPoint;
720 1002 : CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
721 1002 : RedoStartLSN = ControlFile->checkPointCopy.redo;
722 1002 : RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
723 1002 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
724 : CheckPointTLI);
725 1002 : if (record != NULL)
726 : {
727 1001 : ereport(DEBUG1,
728 : errmsg_internal("checkpoint record is at %X/%08X",
729 : LSN_FORMAT_ARGS(CheckPointLoc)));
730 : }
731 : else
732 : {
733 : /*
734 : * We used to attempt to go back to a secondary checkpoint record
735 : * here, but only when not in standby mode. We now just fail if we
736 : * can't read the last checkpoint because this allows us to
737 : * simplify processing around checkpoints.
738 : */
739 1 : ereport(FATAL,
740 : errmsg("could not locate a valid checkpoint record at %X/%08X",
741 : LSN_FORMAT_ARGS(CheckPointLoc)));
742 : }
743 1001 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
744 1001 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
745 :
746 : /* Make sure that REDO location exists. */
747 1001 : if (checkPoint.redo < CheckPointLoc)
748 : {
749 45 : XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
750 45 : if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID))
751 1 : ereport(FATAL,
752 : errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
753 : LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)));
754 : }
755 : }
756 :
757 1086 : if (ArchiveRecoveryRequested)
758 : {
759 124 : if (StandbyModeRequested)
760 119 : ereport(LOG,
761 : (errmsg("entering standby mode")));
762 5 : else if (recoveryTarget == RECOVERY_TARGET_XID)
763 0 : ereport(LOG,
764 : (errmsg("starting point-in-time recovery to XID %u",
765 : recoveryTargetXid)));
766 5 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
767 0 : ereport(LOG,
768 : (errmsg("starting point-in-time recovery to %s",
769 : timestamptz_to_str(recoveryTargetTime))));
770 5 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
771 3 : ereport(LOG,
772 : (errmsg("starting point-in-time recovery to \"%s\"",
773 : recoveryTargetName)));
774 2 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
775 0 : ereport(LOG,
776 : errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
777 : LSN_FORMAT_ARGS(recoveryTargetLSN)));
778 2 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
779 0 : ereport(LOG,
780 : (errmsg("starting point-in-time recovery to earliest consistent point")));
781 : else
782 2 : ereport(LOG,
783 : (errmsg("starting archive recovery")));
784 : }
785 :
786 : /*
787 : * If the location of the checkpoint record is not on the expected
788 : * timeline in the history of the requested timeline, we cannot proceed:
789 : * the backup is not part of the history of the requested timeline.
790 : */
791 : Assert(expectedTLEs); /* was initialized by reading checkpoint
792 : * record */
793 1086 : if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
794 : CheckPointTLI)
795 : {
796 : XLogRecPtr switchpoint;
797 :
798 : /*
799 : * tliSwitchPoint will throw an error if the checkpoint's timeline is
800 : * not in expectedTLEs at all.
801 : */
802 0 : switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
803 0 : ereport(FATAL,
804 : (errmsg("requested timeline %u is not a child of this server's history",
805 : recoveryTargetTLI),
806 : /* translator: %s is a backup_label file or a pg_control file */
807 : errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
808 : haveBackupLabel ? "backup_label" : "pg_control",
809 : LSN_FORMAT_ARGS(CheckPointLoc),
810 : CheckPointTLI,
811 : LSN_FORMAT_ARGS(switchpoint))));
812 : }
813 :
814 : /*
815 : * The min recovery point should be part of the requested timeline's
816 : * history, too.
817 : */
818 1086 : if (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) &&
819 44 : tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
820 44 : ControlFile->minRecoveryPointTLI)
821 0 : ereport(FATAL,
822 : errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
823 : recoveryTargetTLI,
824 : LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
825 : ControlFile->minRecoveryPointTLI));
826 :
827 1086 : ereport(DEBUG1,
828 : errmsg_internal("redo record is at %X/%08X; shutdown %s",
829 : LSN_FORMAT_ARGS(checkPoint.redo),
830 : wasShutdown ? "true" : "false"));
831 1086 : ereport(DEBUG1,
832 : (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
833 : U64FromFullTransactionId(checkPoint.nextXid),
834 : checkPoint.nextOid)));
835 1086 : ereport(DEBUG1,
836 : (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %" PRIu64,
837 : checkPoint.nextMulti, checkPoint.nextMultiOffset)));
838 1086 : ereport(DEBUG1,
839 : (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
840 : checkPoint.oldestXid, checkPoint.oldestXidDB)));
841 1086 : ereport(DEBUG1,
842 : (errmsg_internal("oldest MultiXactId: %u, in database %u",
843 : checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
844 1086 : ereport(DEBUG1,
845 : (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
846 : checkPoint.oldestCommitTsXid,
847 : checkPoint.newestCommitTsXid)));
848 1086 : if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
849 0 : ereport(PANIC,
850 : (errmsg("invalid next transaction ID")));
851 :
852 : /* sanity check */
853 1086 : if (checkPoint.redo > CheckPointLoc)
854 0 : ereport(PANIC,
855 : (errmsg("invalid redo in checkpoint record")));
856 :
857 : /*
858 : * Check whether we need to force recovery from WAL. If it appears to
859 : * have been a clean shutdown and we did not have a recovery signal file,
860 : * then assume no recovery needed.
861 : */
862 1086 : if (checkPoint.redo < CheckPointLoc)
863 : {
864 130 : if (wasShutdown)
865 0 : ereport(PANIC,
866 : (errmsg("invalid redo record in shutdown checkpoint")));
867 130 : InRecovery = true;
868 : }
869 956 : else if (ControlFile->state != DB_SHUTDOWNED)
870 94 : InRecovery = true;
871 862 : else if (ArchiveRecoveryRequested)
872 : {
873 : /* force recovery due to presence of recovery signal file */
874 8 : InRecovery = true;
875 : }
876 :
877 : /*
878 : * If recovery is needed, update our in-memory copy of pg_control to show
879 : * that we are recovering and to show the selected checkpoint as the place
880 : * we are starting from. We also mark pg_control with any minimum recovery
881 : * stop point obtained from a backup history file.
882 : *
883 : * We don't write the changes to disk yet, though. Only do that after
884 : * initializing various subsystems.
885 : */
886 1086 : if (InRecovery)
887 : {
888 232 : if (InArchiveRecovery)
889 : {
890 130 : ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
891 : }
892 : else
893 : {
894 102 : ereport(LOG,
895 : (errmsg("database system was not properly shut down; "
896 : "automatic recovery in progress")));
897 102 : if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
898 2 : ereport(LOG,
899 : (errmsg("crash recovery starts in timeline %u "
900 : "and has target timeline %u",
901 : ControlFile->checkPointCopy.ThisTimeLineID,
902 : recoveryTargetTLI)));
903 102 : ControlFile->state = DB_IN_CRASH_RECOVERY;
904 : }
905 232 : ControlFile->checkPoint = CheckPointLoc;
906 232 : ControlFile->checkPointCopy = checkPoint;
907 232 : if (InArchiveRecovery)
908 : {
909 : /* initialize minRecoveryPoint if not set yet */
910 130 : if (ControlFile->minRecoveryPoint < checkPoint.redo)
911 : {
912 88 : ControlFile->minRecoveryPoint = checkPoint.redo;
913 88 : ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
914 : }
915 : }
916 :
917 : /*
918 : * Set backupStartPoint if we're starting recovery from a base backup.
919 : *
920 : * Also set backupEndPoint and use minRecoveryPoint as the backup end
921 : * location if we're starting recovery from a base backup which was
922 : * taken from a standby. In this case, the database system status in
923 : * pg_control must indicate that the database was already in recovery.
924 : * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
925 : * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
926 : * before reaching this point; e.g. because restore_command or
927 : * primary_conninfo were faulty.
928 : *
929 : * Any other state indicates that the backup somehow became corrupted
930 : * and we can't sensibly continue with recovery.
931 : */
932 232 : if (haveBackupLabel)
933 : {
934 86 : ControlFile->backupStartPoint = checkPoint.redo;
935 86 : ControlFile->backupEndRequired = backupEndRequired;
936 :
937 86 : if (backupFromStandby)
938 : {
939 6 : if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
940 : dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
941 0 : ereport(FATAL,
942 : (errmsg("backup_label contains data inconsistent with control file"),
943 : errhint("This means that the backup is corrupted and you will "
944 : "have to use another backup for recovery.")));
945 6 : ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
946 : }
947 : }
948 : }
949 :
950 : /* remember these, so that we know when we have reached consistency */
951 1086 : backupStartPoint = ControlFile->backupStartPoint;
952 1086 : backupEndRequired = ControlFile->backupEndRequired;
953 1086 : backupEndPoint = ControlFile->backupEndPoint;
954 1086 : if (InArchiveRecovery)
955 : {
956 130 : minRecoveryPoint = ControlFile->minRecoveryPoint;
957 130 : minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
958 : }
959 : else
960 : {
961 956 : minRecoveryPoint = InvalidXLogRecPtr;
962 956 : minRecoveryPointTLI = 0;
963 : }
964 :
965 : /*
966 : * Start recovery assuming that the final record isn't lost.
967 : */
968 1086 : abortedRecPtr = InvalidXLogRecPtr;
969 1086 : missingContrecPtr = InvalidXLogRecPtr;
970 :
971 1086 : *wasShutdown_ptr = wasShutdown;
972 1086 : *haveBackupLabel_ptr = haveBackupLabel;
973 1086 : *haveTblspcMap_ptr = haveTblspcMap;
974 1086 : }
975 :
976 : /*
977 : * See if there are any recovery signal files and if so, set state for
978 : * recovery.
979 : *
980 : * See if there is a recovery command file (recovery.conf), and if so
981 : * throw an ERROR since as of PG12 we no longer recognize that.
982 : */
983 : static void
984 1088 : readRecoverySignalFile(void)
985 : {
986 : struct stat stat_buf;
987 :
988 1088 : if (IsBootstrapProcessingMode())
989 964 : return;
990 :
991 : /*
992 : * Check for old recovery API file: recovery.conf
993 : */
994 1031 : if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
995 0 : ereport(FATAL,
996 : (errcode_for_file_access(),
997 : errmsg("using recovery command file \"%s\" is not supported",
998 : RECOVERY_COMMAND_FILE)));
999 :
1000 : /*
1001 : * Remove unused .done file, if present. Ignore if absent.
1002 : */
1003 1031 : unlink(RECOVERY_COMMAND_DONE);
1004 :
1005 : /*
1006 : * Check for recovery signal files and if found, fsync them since they
1007 : * represent server state information. We don't sweat too much about the
1008 : * possibility of fsync failure, however.
1009 : */
1010 1031 : if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1011 : {
1012 : int fd;
1013 :
1014 119 : fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1015 : S_IRUSR | S_IWUSR);
1016 119 : if (fd >= 0)
1017 : {
1018 119 : (void) pg_fsync(fd);
1019 119 : close(fd);
1020 : }
1021 119 : standby_signal_file_found = true;
1022 : }
1023 :
1024 1031 : if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1025 : {
1026 : int fd;
1027 :
1028 6 : fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1029 : S_IRUSR | S_IWUSR);
1030 6 : if (fd >= 0)
1031 : {
1032 6 : (void) pg_fsync(fd);
1033 6 : close(fd);
1034 : }
1035 6 : recovery_signal_file_found = true;
1036 : }
1037 :
1038 : /*
1039 : * If both signal files are present, standby signal file takes precedence.
1040 : * If neither is present then we won't enter archive recovery.
1041 : */
1042 1031 : StandbyModeRequested = false;
1043 1031 : ArchiveRecoveryRequested = false;
1044 1031 : if (standby_signal_file_found)
1045 : {
1046 119 : StandbyModeRequested = true;
1047 119 : ArchiveRecoveryRequested = true;
1048 : }
1049 912 : else if (recovery_signal_file_found)
1050 : {
1051 5 : StandbyModeRequested = false;
1052 5 : ArchiveRecoveryRequested = true;
1053 : }
1054 : else
1055 907 : return;
1056 :
1057 : /*
1058 : * We don't support standby mode in standalone backends; that requires
1059 : * other processes such as the WAL receiver to be alive.
1060 : */
1061 124 : if (StandbyModeRequested && !IsUnderPostmaster)
1062 0 : ereport(FATAL,
1063 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1064 : errmsg("standby mode is not supported by single-user servers")));
1065 : }
1066 :
1067 : static void
1068 1088 : validateRecoveryParameters(void)
1069 : {
1070 1088 : if (!ArchiveRecoveryRequested)
1071 964 : return;
1072 :
1073 : /*
1074 : * Check for compulsory parameters
1075 : */
1076 124 : if (StandbyModeRequested)
1077 : {
1078 119 : if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1079 12 : (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1080 2 : ereport(WARNING,
1081 : (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1082 : errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1083 : }
1084 : else
1085 : {
1086 5 : if (recoveryRestoreCommand == NULL ||
1087 5 : strcmp(recoveryRestoreCommand, "") == 0)
1088 0 : ereport(FATAL,
1089 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1090 : errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1091 : }
1092 :
1093 : /*
1094 : * Override any inconsistent requests. Note that this is a change of
1095 : * behaviour in 9.5; prior to this we simply ignored a request to pause if
1096 : * hot_standby = off, which was surprising behaviour.
1097 : */
1098 124 : if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
1099 117 : !EnableHotStandby)
1100 3 : recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
1101 :
1102 : /*
1103 : * Final parsing of recovery_target_time string; see also
1104 : * check_recovery_target_time().
1105 : */
1106 124 : if (recoveryTarget == RECOVERY_TARGET_TIME)
1107 : {
1108 0 : recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
1109 : CStringGetDatum(recovery_target_time_string),
1110 : ObjectIdGetDatum(InvalidOid),
1111 : Int32GetDatum(-1)));
1112 : }
1113 :
1114 : /*
1115 : * If user specified recovery_target_timeline, validate it or compute the
1116 : * "latest" value. We can't do this until after we've gotten the restore
1117 : * command and set InArchiveRecovery, because we need to fetch timeline
1118 : * history files from the archive.
1119 : */
1120 124 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
1121 : {
1122 0 : TimeLineID rtli = recoveryTargetTLIRequested;
1123 :
1124 : /* Timeline 1 does not have a history file, all else should */
1125 0 : if (rtli != 1 && !existsTimeLineHistory(rtli))
1126 0 : ereport(FATAL,
1127 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1128 : errmsg("recovery target timeline %u does not exist",
1129 : rtli)));
1130 0 : recoveryTargetTLI = rtli;
1131 : }
1132 124 : else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
1133 : {
1134 : /* We start the "latest" search from pg_control's timeline */
1135 124 : recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
1136 : }
1137 : else
1138 : {
1139 : /*
1140 : * else we just use the recoveryTargetTLI as already read from
1141 : * ControlFile
1142 : */
1143 : Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
1144 : }
1145 : }
1146 :
1147 : /*
1148 : * read_backup_label: check to see if a backup_label file is present
1149 : *
1150 : * If we see a backup_label during recovery, we assume that we are recovering
1151 : * from a backup dump file, and we therefore roll forward from the checkpoint
1152 : * identified by the label file, NOT what pg_control says. This avoids the
1153 : * problem that pg_control might have been archived one or more checkpoints
1154 : * later than the start of the dump, and so if we rely on it as the start
1155 : * point, we will fail to restore a consistent database state.
1156 : *
1157 : * Returns true if a backup_label was found (and fills the checkpoint
1158 : * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1159 : * returns false if not. If this backup_label came from a streamed backup,
1160 : * *backupEndRequired is set to true. If this backup_label was created during
1161 : * recovery, *backupFromStandby is set to true.
1162 : *
1163 : * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1164 : * and TLI read from the backup file.
1165 : */
1166 : static bool
1167 1088 : read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1168 : bool *backupEndRequired, bool *backupFromStandby)
1169 : {
1170 : char startxlogfilename[MAXFNAMELEN];
1171 : TimeLineID tli_from_walseg,
1172 : tli_from_file;
1173 : FILE *lfp;
1174 : char ch;
1175 : char backuptype[20];
1176 : char backupfrom[20];
1177 : char backuplabel[MAXPGPATH];
1178 : char backuptime[128];
1179 : uint32 hi,
1180 : lo;
1181 :
1182 : /* suppress possible uninitialized-variable warnings */
1183 1088 : *checkPointLoc = InvalidXLogRecPtr;
1184 1088 : *backupLabelTLI = 0;
1185 1088 : *backupEndRequired = false;
1186 1088 : *backupFromStandby = false;
1187 :
1188 : /*
1189 : * See if label file is present
1190 : */
1191 1088 : lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1192 1088 : if (!lfp)
1193 : {
1194 1002 : if (errno != ENOENT)
1195 0 : ereport(FATAL,
1196 : (errcode_for_file_access(),
1197 : errmsg("could not read file \"%s\": %m",
1198 : BACKUP_LABEL_FILE)));
1199 1002 : return false; /* it's not there, all is fine */
1200 : }
1201 :
1202 : /*
1203 : * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1204 : * is pretty crude, but we are not expecting any variability in the file
1205 : * format).
1206 : */
1207 86 : if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
1208 86 : &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1209 0 : ereport(FATAL,
1210 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1211 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1212 86 : RedoStartLSN = ((uint64) hi) << 32 | lo;
1213 86 : RedoStartTLI = tli_from_walseg;
1214 86 : if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
1215 86 : &hi, &lo, &ch) != 3 || ch != '\n')
1216 0 : ereport(FATAL,
1217 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1218 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1219 86 : *checkPointLoc = ((uint64) hi) << 32 | lo;
1220 86 : *backupLabelTLI = tli_from_walseg;
1221 :
1222 : /*
1223 : * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1224 : * which could mean either pg_basebackup or the pg_backup_start/stop
1225 : * method was used) or if this label came from somewhere else (the only
1226 : * other option today being from pg_rewind). If this was a streamed
1227 : * backup then we know that we need to play through until we get to the
1228 : * end of the WAL which was generated during the backup (at which point we
1229 : * will have reached consistency and backupEndRequired will be reset to be
1230 : * false).
1231 : */
1232 86 : if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1233 : {
1234 86 : if (strcmp(backuptype, "streamed") == 0)
1235 85 : *backupEndRequired = true;
1236 : }
1237 :
1238 : /*
1239 : * BACKUP FROM lets us know if this was from a primary or a standby. If
1240 : * it was from a standby, we'll double-check that the control file state
1241 : * matches that of a standby.
1242 : */
1243 86 : if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1244 : {
1245 86 : if (strcmp(backupfrom, "standby") == 0)
1246 6 : *backupFromStandby = true;
1247 : }
1248 :
1249 : /*
1250 : * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1251 : * but checking for their presence is useful for debugging and the next
1252 : * sanity checks. Cope also with the fact that the result buffers have a
1253 : * pre-allocated size, hence if the backup_label file has been generated
1254 : * with strings longer than the maximum assumed here an incorrect parsing
1255 : * happens. That's fine as only minor consistency checks are done
1256 : * afterwards.
1257 : */
1258 86 : if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1259 86 : ereport(DEBUG1,
1260 : (errmsg_internal("backup time %s in file \"%s\"",
1261 : backuptime, BACKUP_LABEL_FILE)));
1262 :
1263 86 : if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1264 85 : ereport(DEBUG1,
1265 : (errmsg_internal("backup label %s in file \"%s\"",
1266 : backuplabel, BACKUP_LABEL_FILE)));
1267 :
1268 : /*
1269 : * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1270 : * it as a sanity check if present.
1271 : */
1272 86 : if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1273 : {
1274 85 : if (tli_from_walseg != tli_from_file)
1275 0 : ereport(FATAL,
1276 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1277 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1278 : errdetail("Timeline ID parsed is %u, but expected %u.",
1279 : tli_from_file, tli_from_walseg)));
1280 :
1281 85 : ereport(DEBUG1,
1282 : (errmsg_internal("backup timeline %u in file \"%s\"",
1283 : tli_from_file, BACKUP_LABEL_FILE)));
1284 : }
1285 :
1286 86 : if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
1287 0 : ereport(FATAL,
1288 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1289 : errmsg("this is an incremental backup, not a data directory"),
1290 : errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1291 :
1292 86 : if (ferror(lfp) || FreeFile(lfp))
1293 0 : ereport(FATAL,
1294 : (errcode_for_file_access(),
1295 : errmsg("could not read file \"%s\": %m",
1296 : BACKUP_LABEL_FILE)));
1297 :
1298 86 : return true;
1299 : }
1300 :
1301 : /*
1302 : * read_tablespace_map: check to see if a tablespace_map file is present
1303 : *
1304 : * If we see a tablespace_map file during recovery, we assume that we are
1305 : * recovering from a backup dump file, and we therefore need to create symlinks
1306 : * as per the information present in tablespace_map file.
1307 : *
1308 : * Returns true if a tablespace_map file was found (and fills *tablespaces
1309 : * with a tablespaceinfo struct for each tablespace listed in the file);
1310 : * returns false if not.
1311 : */
1312 : static bool
1313 86 : read_tablespace_map(List **tablespaces)
1314 : {
1315 : tablespaceinfo *ti;
1316 : FILE *lfp;
1317 : char str[MAXPGPATH];
1318 : int ch,
1319 : i,
1320 : n;
1321 : bool was_backslash;
1322 :
1323 : /*
1324 : * See if tablespace_map file is present
1325 : */
1326 86 : lfp = AllocateFile(TABLESPACE_MAP, "r");
1327 86 : if (!lfp)
1328 : {
1329 84 : if (errno != ENOENT)
1330 0 : ereport(FATAL,
1331 : (errcode_for_file_access(),
1332 : errmsg("could not read file \"%s\": %m",
1333 : TABLESPACE_MAP)));
1334 84 : return false; /* it's not there, all is fine */
1335 : }
1336 :
1337 : /*
1338 : * Read and parse the link name and path lines from tablespace_map file
1339 : * (this code is pretty crude, but we are not expecting any variability in
1340 : * the file format). De-escape any backslashes that were inserted.
1341 : */
1342 2 : i = 0;
1343 2 : was_backslash = false;
1344 77 : while ((ch = fgetc(lfp)) != EOF)
1345 : {
1346 75 : if (!was_backslash && (ch == '\n' || ch == '\r'))
1347 2 : {
1348 : char *endp;
1349 :
1350 2 : if (i == 0)
1351 0 : continue; /* \r immediately followed by \n */
1352 :
1353 : /*
1354 : * The de-escaped line should contain an OID followed by exactly
1355 : * one space followed by a path. The path might start with
1356 : * spaces, so don't be too liberal about parsing.
1357 : */
1358 2 : str[i] = '\0';
1359 2 : n = 0;
1360 12 : while (str[n] && str[n] != ' ')
1361 10 : n++;
1362 2 : if (n < 1 || n >= i - 1)
1363 0 : ereport(FATAL,
1364 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1365 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1366 2 : str[n++] = '\0';
1367 :
1368 2 : ti = palloc0_object(tablespaceinfo);
1369 2 : errno = 0;
1370 2 : ti->oid = strtoul(str, &endp, 10);
1371 2 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1372 0 : ereport(FATAL,
1373 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1374 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1375 2 : ti->path = pstrdup(str + n);
1376 2 : *tablespaces = lappend(*tablespaces, ti);
1377 :
1378 2 : i = 0;
1379 2 : continue;
1380 : }
1381 73 : else if (!was_backslash && ch == '\\')
1382 0 : was_backslash = true;
1383 : else
1384 : {
1385 73 : if (i < sizeof(str) - 1)
1386 73 : str[i++] = ch;
1387 73 : was_backslash = false;
1388 : }
1389 : }
1390 :
1391 2 : if (i != 0 || was_backslash) /* last line not terminated? */
1392 0 : ereport(FATAL,
1393 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1394 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1395 :
1396 2 : if (ferror(lfp) || FreeFile(lfp))
1397 0 : ereport(FATAL,
1398 : (errcode_for_file_access(),
1399 : errmsg("could not read file \"%s\": %m",
1400 : TABLESPACE_MAP)));
1401 :
1402 2 : return true;
1403 : }
1404 :
1405 : /*
1406 : * Finish WAL recovery.
1407 : *
1408 : * This does not close the 'xlogreader' yet, because in some cases the caller
1409 : * still wants to re-read the last checkpoint record by calling
1410 : * ReadCheckpointRecord().
1411 : *
1412 : * Returns the position of the last valid or applied record, after which new
1413 : * WAL should be appended, information about why recovery was ended, and some
1414 : * other things. See the EndOfWalRecoveryInfo struct for details.
1415 : */
1416 : EndOfWalRecoveryInfo *
1417 1018 : FinishWalRecovery(void)
1418 : {
1419 1018 : EndOfWalRecoveryInfo *result = palloc_object(EndOfWalRecoveryInfo);
1420 : XLogRecPtr lastRec;
1421 : TimeLineID lastRecTLI;
1422 : XLogRecPtr endOfLog;
1423 :
1424 : /*
1425 : * Kill WAL receiver, if it's still running, before we continue to write
1426 : * the startup checkpoint and aborted-contrecord records. It will trump
1427 : * over these records and subsequent ones if it's still alive when we
1428 : * start writing WAL.
1429 : */
1430 1018 : XLogShutdownWalRcv();
1431 :
1432 : /*
1433 : * Shutdown the slot sync worker to drop any temporary slots acquired by
1434 : * it and to prevent it from keep trying to fetch the failover slots.
1435 : *
1436 : * We do not update the 'synced' column in 'pg_replication_slots' system
1437 : * view from true to false here, as any failed update could leave 'synced'
1438 : * column false for some slots. This could cause issues during slot sync
1439 : * after restarting the server as a standby. While updating the 'synced'
1440 : * column after switching to the new timeline is an option, it does not
1441 : * simplify the handling for the 'synced' column. Therefore, we retain the
1442 : * 'synced' column as true after promotion as it may provide useful
1443 : * information about the slot origin.
1444 : */
1445 1018 : ShutDownSlotSync();
1446 :
1447 : /*
1448 : * We are now done reading the xlog from stream. Turn off streaming
1449 : * recovery to force fetching the files (which would be required at end of
1450 : * recovery, e.g., timeline history file) from archive or pg_wal.
1451 : *
1452 : * Note that standby mode must be turned off after killing WAL receiver,
1453 : * i.e., calling XLogShutdownWalRcv().
1454 : */
1455 : Assert(!WalRcvStreaming());
1456 1018 : StandbyMode = false;
1457 :
1458 : /*
1459 : * Determine where to start writing WAL next.
1460 : *
1461 : * Re-fetch the last valid or last applied record, so we can identify the
1462 : * exact endpoint of what we consider the valid portion of WAL. There may
1463 : * be an incomplete continuation record after that, in which case
1464 : * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1465 : * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1466 : * it is intentionally missing. See CreateOverwriteContrecordRecord().
1467 : *
1468 : * An important side-effect of this is to load the last page into
1469 : * xlogreader. The caller uses it to initialize the WAL for writing.
1470 : */
1471 1018 : if (!InRecovery)
1472 : {
1473 853 : lastRec = CheckPointLoc;
1474 853 : lastRecTLI = CheckPointTLI;
1475 : }
1476 : else
1477 : {
1478 165 : lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
1479 165 : lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1480 : }
1481 1018 : XLogPrefetcherBeginRead(xlogprefetcher, lastRec);
1482 1018 : (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1483 1018 : endOfLog = xlogreader->EndRecPtr;
1484 :
1485 : /*
1486 : * Remember the TLI in the filename of the XLOG segment containing the
1487 : * end-of-log. It could be different from the timeline that endOfLog
1488 : * nominally belongs to, if there was a timeline switch in that segment,
1489 : * and we were reading the old WAL from a segment belonging to a higher
1490 : * timeline.
1491 : */
1492 1018 : result->endOfLogTLI = xlogreader->seg.ws_tli;
1493 :
1494 1018 : if (ArchiveRecoveryRequested)
1495 : {
1496 : /*
1497 : * We are no longer in archive recovery state.
1498 : *
1499 : * We are now done reading the old WAL. Turn off archive fetching if
1500 : * it was active.
1501 : */
1502 : Assert(InArchiveRecovery);
1503 57 : InArchiveRecovery = false;
1504 :
1505 : /*
1506 : * If the ending log segment is still open, close it (to avoid
1507 : * problems on Windows with trying to rename or delete an open file).
1508 : */
1509 57 : if (readFile >= 0)
1510 : {
1511 57 : close(readFile);
1512 57 : readFile = -1;
1513 : }
1514 : }
1515 :
1516 : /*
1517 : * Copy the last partial block to the caller, for initializing the WAL
1518 : * buffer for appending new WAL.
1519 : */
1520 1018 : if (endOfLog % XLOG_BLCKSZ != 0)
1521 : {
1522 : char *page;
1523 : int len;
1524 : XLogRecPtr pageBeginPtr;
1525 :
1526 996 : pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1527 : Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1528 :
1529 : /* Copy the valid part of the last block */
1530 996 : len = endOfLog % XLOG_BLCKSZ;
1531 996 : page = palloc(len);
1532 996 : memcpy(page, xlogreader->readBuf, len);
1533 :
1534 996 : result->lastPageBeginPtr = pageBeginPtr;
1535 996 : result->lastPage = page;
1536 : }
1537 : else
1538 : {
1539 : /* There is no partial block to copy. */
1540 22 : result->lastPageBeginPtr = endOfLog;
1541 22 : result->lastPage = NULL;
1542 : }
1543 :
1544 : /*
1545 : * Create a comment for the history file to explain why and where timeline
1546 : * changed.
1547 : */
1548 1018 : result->recoveryStopReason = getRecoveryStopReason();
1549 :
1550 1018 : result->lastRec = lastRec;
1551 1018 : result->lastRecTLI = lastRecTLI;
1552 1018 : result->endOfLog = endOfLog;
1553 :
1554 1018 : result->abortedRecPtr = abortedRecPtr;
1555 1018 : result->missingContrecPtr = missingContrecPtr;
1556 :
1557 1018 : result->standby_signal_file_found = standby_signal_file_found;
1558 1018 : result->recovery_signal_file_found = recovery_signal_file_found;
1559 :
1560 1018 : return result;
1561 : }
1562 :
1563 : /*
1564 : * Clean up the WAL reader and leftovers from restoring WAL from archive
1565 : */
1566 : void
1567 1018 : ShutdownWalRecovery(void)
1568 : {
1569 : char recoveryPath[MAXPGPATH];
1570 :
1571 : /* Final update of pg_stat_recovery_prefetch. */
1572 1018 : XLogPrefetcherComputeStats(xlogprefetcher);
1573 :
1574 : /* Shut down xlogreader */
1575 1018 : if (readFile >= 0)
1576 : {
1577 961 : close(readFile);
1578 961 : readFile = -1;
1579 : }
1580 1018 : pfree(xlogreader->private_data);
1581 1018 : XLogReaderFree(xlogreader);
1582 1018 : XLogPrefetcherFree(xlogprefetcher);
1583 :
1584 1018 : if (ArchiveRecoveryRequested)
1585 : {
1586 : /*
1587 : * Since there might be a partial WAL segment named RECOVERYXLOG, get
1588 : * rid of it.
1589 : */
1590 57 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1591 57 : unlink(recoveryPath); /* ignore any error */
1592 :
1593 : /* Get rid of any remaining recovered timeline-history file, too */
1594 57 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1595 57 : unlink(recoveryPath); /* ignore any error */
1596 : }
1597 :
1598 : /*
1599 : * We don't need the latch anymore. It's not strictly necessary to disown
1600 : * it, but let's do it for the sake of tidiness.
1601 : */
1602 1018 : if (ArchiveRecoveryRequested)
1603 57 : DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
1604 1018 : }
1605 :
1606 : /*
1607 : * Perform WAL recovery.
1608 : *
1609 : * If the system was shut down cleanly, this is never called.
1610 : */
1611 : void
1612 231 : PerformWalRecovery(void)
1613 : {
1614 : XLogRecord *record;
1615 231 : bool reachedRecoveryTarget = false;
1616 : TimeLineID replayTLI;
1617 :
1618 : /*
1619 : * Initialize shared variables for tracking progress of WAL replay, as if
1620 : * we had just replayed the record before the REDO location (or the
1621 : * checkpoint record itself, if it's a shutdown checkpoint).
1622 : */
1623 231 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1624 231 : if (RedoStartLSN < CheckPointLoc)
1625 : {
1626 129 : XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
1627 129 : XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
1628 129 : XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
1629 : }
1630 : else
1631 : {
1632 102 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
1633 102 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
1634 102 : XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
1635 : }
1636 231 : XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
1637 231 : XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
1638 231 : XLogRecoveryCtl->recoveryLastXTime = 0;
1639 231 : XLogRecoveryCtl->currentChunkStartTime = 0;
1640 231 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
1641 231 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1642 :
1643 : /* Also ensure XLogReceiptTime has a sane value */
1644 231 : XLogReceiptTime = GetCurrentTimestamp();
1645 :
1646 : /*
1647 : * Let postmaster know we've started redo now, so that it can launch the
1648 : * archiver if necessary.
1649 : */
1650 231 : if (IsUnderPostmaster)
1651 222 : SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
1652 :
1653 : /*
1654 : * Allow read-only connections immediately if we're consistent already.
1655 : */
1656 231 : CheckRecoveryConsistency();
1657 :
1658 : /*
1659 : * Find the first record that logically follows the checkpoint --- it
1660 : * might physically precede it, though.
1661 : */
1662 231 : if (RedoStartLSN < CheckPointLoc)
1663 : {
1664 : /* back up to find the record */
1665 129 : replayTLI = RedoStartTLI;
1666 129 : XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
1667 129 : record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1668 :
1669 : /*
1670 : * If a checkpoint record's redo pointer points back to an earlier
1671 : * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1672 : * record.
1673 : */
1674 129 : if (record->xl_rmid != RM_XLOG_ID ||
1675 129 : (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1676 0 : ereport(FATAL,
1677 : errmsg("unexpected record type found at redo point %X/%08X",
1678 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
1679 : }
1680 : else
1681 : {
1682 : /* just have to read next record after CheckPoint */
1683 : Assert(xlogreader->ReadRecPtr == CheckPointLoc);
1684 102 : replayTLI = CheckPointTLI;
1685 102 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1686 : }
1687 :
1688 231 : if (record != NULL)
1689 : {
1690 : TimestampTz xtime;
1691 : PGRUsage ru0;
1692 :
1693 222 : pg_rusage_init(&ru0);
1694 :
1695 222 : InRedo = true;
1696 :
1697 222 : RmgrStartup();
1698 :
1699 222 : ereport(LOG,
1700 : errmsg("redo starts at %X/%08X",
1701 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
1702 :
1703 : /* Prepare to report progress of the redo phase. */
1704 222 : if (!StandbyMode)
1705 108 : begin_startup_progress_phase();
1706 :
1707 : /*
1708 : * main redo apply loop
1709 : */
1710 : do
1711 : {
1712 2948815 : if (!StandbyMode)
1713 327568 : ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
1714 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
1715 :
1716 : #ifdef WAL_DEBUG
1717 : if (XLOG_DEBUG)
1718 : {
1719 : StringInfoData buf;
1720 :
1721 : initStringInfo(&buf);
1722 : appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
1723 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1724 : LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
1725 : xlog_outrec(&buf, xlogreader);
1726 : appendStringInfoString(&buf, " - ");
1727 : xlog_outdesc(&buf, xlogreader);
1728 : elog(LOG, "%s", buf.data);
1729 : pfree(buf.data);
1730 : }
1731 : #endif
1732 :
1733 : /* Handle interrupt signals of startup process */
1734 2948815 : ProcessStartupProcInterrupts();
1735 :
1736 : /*
1737 : * Pause WAL replay, if requested by a hot-standby session via
1738 : * SetRecoveryPause().
1739 : *
1740 : * Note that we intentionally don't take the info_lck spinlock
1741 : * here. We might therefore read a slightly stale value of the
1742 : * recoveryPause flag, but it can't be very stale (no worse than
1743 : * the last spinlock we did acquire). Since a pause request is a
1744 : * pretty asynchronous thing anyway, possibly responding to it one
1745 : * WAL record later than we otherwise would is a minor issue, so
1746 : * it doesn't seem worth adding another spinlock cycle to prevent
1747 : * that.
1748 : */
1749 2948815 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1750 : RECOVERY_NOT_PAUSED)
1751 0 : recoveryPausesHere(false);
1752 :
1753 : /*
1754 : * Have we reached our recovery target?
1755 : */
1756 2948815 : if (recoveryStopsBefore(xlogreader))
1757 : {
1758 2 : reachedRecoveryTarget = true;
1759 2 : break;
1760 : }
1761 :
1762 : /*
1763 : * If we've been asked to lag the primary, wait on latch until
1764 : * enough time has passed.
1765 : */
1766 2948813 : if (recoveryApplyDelay(xlogreader))
1767 : {
1768 : /*
1769 : * We test for paused recovery again here. If user sets
1770 : * delayed apply, it may be because they expect to pause
1771 : * recovery in case of problems, so we must test again here
1772 : * otherwise pausing during the delay-wait wouldn't work.
1773 : */
1774 17 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1775 : RECOVERY_NOT_PAUSED)
1776 1 : recoveryPausesHere(false);
1777 : }
1778 :
1779 : /*
1780 : * Apply the record
1781 : */
1782 2948813 : ApplyWalRecord(xlogreader, record, &replayTLI);
1783 :
1784 : /*
1785 : * Wake up processes waiting for standby replay, write, or flush
1786 : * LSN to reach current replay position. Replay implies that the
1787 : * WAL was already written and flushed to disk, so write and flush
1788 : * waiters can be woken at the replay position too.
1789 : */
1790 2948811 : WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_REPLAY,
1791 2948811 : XLogRecoveryCtl->lastReplayedEndRecPtr);
1792 2948811 : WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_WRITE,
1793 2948811 : XLogRecoveryCtl->lastReplayedEndRecPtr);
1794 2948811 : WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_FLUSH,
1795 2948811 : XLogRecoveryCtl->lastReplayedEndRecPtr);
1796 :
1797 : /* Exit loop if we reached inclusive recovery target */
1798 2948811 : if (recoveryStopsAfter(xlogreader))
1799 : {
1800 5 : reachedRecoveryTarget = true;
1801 5 : break;
1802 : }
1803 :
1804 : /* Else, try to fetch the next WAL record */
1805 2948806 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1806 2948743 : } while (record != NULL);
1807 :
1808 : /*
1809 : * end of main redo apply loop
1810 : */
1811 :
1812 157 : if (reachedRecoveryTarget)
1813 : {
1814 7 : if (!reachedConsistency)
1815 0 : ereport(FATAL,
1816 : (errmsg("requested recovery stop point is before consistent recovery point")));
1817 :
1818 : /*
1819 : * This is the last point where we can restart recovery with a new
1820 : * recovery target, if we shutdown and begin again. After this,
1821 : * Resource Managers may choose to do permanent corrective actions
1822 : * at end of recovery.
1823 : */
1824 7 : switch (recoveryTargetAction)
1825 : {
1826 0 : case RECOVERY_TARGET_ACTION_SHUTDOWN:
1827 :
1828 : /*
1829 : * exit with special return code to request shutdown of
1830 : * postmaster. Log messages issued from postmaster.
1831 : */
1832 0 : proc_exit(3);
1833 :
1834 1 : case RECOVERY_TARGET_ACTION_PAUSE:
1835 1 : SetRecoveryPause(true);
1836 1 : recoveryPausesHere(true);
1837 :
1838 : /* drop into promote */
1839 : pg_fallthrough;
1840 :
1841 7 : case RECOVERY_TARGET_ACTION_PROMOTE:
1842 7 : break;
1843 : }
1844 : }
1845 :
1846 157 : RmgrCleanup();
1847 :
1848 157 : ereport(LOG,
1849 : errmsg("redo done at %X/%08X system usage: %s",
1850 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1851 : pg_rusage_show(&ru0)));
1852 157 : xtime = GetLatestXTime();
1853 157 : if (xtime)
1854 40 : ereport(LOG,
1855 : (errmsg("last completed transaction was at log time %s",
1856 : timestamptz_to_str(xtime))));
1857 :
1858 157 : InRedo = false;
1859 : }
1860 : else
1861 : {
1862 : /* there are no WAL records following the checkpoint */
1863 9 : ereport(LOG,
1864 : (errmsg("redo is not required")));
1865 : }
1866 :
1867 : /*
1868 : * This check is intentionally after the above log messages that indicate
1869 : * how far recovery went.
1870 : */
1871 166 : if (ArchiveRecoveryRequested &&
1872 58 : recoveryTarget != RECOVERY_TARGET_UNSET &&
1873 8 : !reachedRecoveryTarget)
1874 1 : ereport(FATAL,
1875 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
1876 : errmsg("recovery ended before configured recovery target was reached")));
1877 165 : }
1878 :
1879 : /*
1880 : * Subroutine of PerformWalRecovery, to apply one WAL record.
1881 : */
1882 : static void
1883 2948813 : ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
1884 : {
1885 : ErrorContextCallback errcallback;
1886 2948813 : bool switchedTLI = false;
1887 :
1888 : /* Setup error traceback support for ereport() */
1889 2948813 : errcallback.callback = rm_redo_error_callback;
1890 2948813 : errcallback.arg = xlogreader;
1891 2948813 : errcallback.previous = error_context_stack;
1892 2948813 : error_context_stack = &errcallback;
1893 :
1894 : /*
1895 : * TransamVariables->nextXid must be beyond record's xid.
1896 : */
1897 2948813 : AdvanceNextFullTransactionIdPastXid(record->xl_xid);
1898 :
1899 : /*
1900 : * Before replaying this record, check if this record causes the current
1901 : * timeline to change. The record is already considered to be part of the
1902 : * new timeline, so we update replayTLI before replaying it. That's
1903 : * important so that replayEndTLI, which is recorded as the minimum
1904 : * recovery point's TLI if recovery stops after this record, is set
1905 : * correctly.
1906 : */
1907 2948813 : if (record->xl_rmid == RM_XLOG_ID)
1908 : {
1909 115286 : TimeLineID newReplayTLI = *replayTLI;
1910 115286 : TimeLineID prevReplayTLI = *replayTLI;
1911 115286 : uint8 info = record->xl_info & ~XLR_INFO_MASK;
1912 :
1913 115286 : if (info == XLOG_CHECKPOINT_SHUTDOWN)
1914 : {
1915 : CheckPoint checkPoint;
1916 :
1917 44 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1918 44 : newReplayTLI = checkPoint.ThisTimeLineID;
1919 44 : prevReplayTLI = checkPoint.PrevTimeLineID;
1920 : }
1921 115242 : else if (info == XLOG_END_OF_RECOVERY)
1922 : {
1923 : xl_end_of_recovery xlrec;
1924 :
1925 12 : memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1926 12 : newReplayTLI = xlrec.ThisTimeLineID;
1927 12 : prevReplayTLI = xlrec.PrevTimeLineID;
1928 : }
1929 :
1930 115286 : if (newReplayTLI != *replayTLI)
1931 : {
1932 : /* Check that it's OK to switch to this TLI */
1933 13 : checkTimeLineSwitch(xlogreader->EndRecPtr,
1934 : newReplayTLI, prevReplayTLI, *replayTLI);
1935 :
1936 : /* Following WAL records should be run with new TLI */
1937 13 : *replayTLI = newReplayTLI;
1938 13 : switchedTLI = true;
1939 : }
1940 : }
1941 :
1942 : /*
1943 : * Update shared replayEndRecPtr before replaying this record, so that
1944 : * XLogFlush will update minRecoveryPoint correctly.
1945 : */
1946 2948813 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1947 2948813 : XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
1948 2948813 : XLogRecoveryCtl->replayEndTLI = *replayTLI;
1949 2948813 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1950 :
1951 : /*
1952 : * If we are attempting to enter Hot Standby mode, process XIDs we see
1953 : */
1954 2948813 : if (standbyState >= STANDBY_INITIALIZED &&
1955 2640938 : TransactionIdIsValid(record->xl_xid))
1956 2578840 : RecordKnownAssignedTransactionIds(record->xl_xid);
1957 :
1958 : /*
1959 : * Some XLOG record types that are related to recovery are processed
1960 : * directly here, rather than in xlog_redo()
1961 : */
1962 2948813 : if (record->xl_rmid == RM_XLOG_ID)
1963 115286 : xlogrecovery_redo(xlogreader, *replayTLI);
1964 :
1965 : /* Now apply the WAL record itself */
1966 2948813 : GetRmgr(record->xl_rmid).rm_redo(xlogreader);
1967 :
1968 : /*
1969 : * After redo, check whether the backup pages associated with the WAL
1970 : * record are consistent with the existing pages. This check is done only
1971 : * if consistency check is enabled for this record.
1972 : */
1973 2948811 : if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
1974 2264676 : verifyBackupPageConsistency(xlogreader);
1975 :
1976 : /* Pop the error context stack */
1977 2948811 : error_context_stack = errcallback.previous;
1978 :
1979 : /*
1980 : * Update lastReplayedEndRecPtr after this record has been successfully
1981 : * replayed.
1982 : */
1983 2948811 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1984 2948811 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
1985 2948811 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
1986 2948811 : XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
1987 2948811 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1988 :
1989 : /* ------
1990 : * Wakeup walsenders:
1991 : *
1992 : * On the standby, the WAL is flushed first (which will only wake up
1993 : * physical walsenders) and then applied, which will only wake up logical
1994 : * walsenders.
1995 : *
1996 : * Indeed, logical walsenders on standby can't decode and send data until
1997 : * it's been applied.
1998 : *
1999 : * Physical walsenders don't need to be woken up during replay unless
2000 : * cascading replication is allowed and time line change occurred (so that
2001 : * they can notice that they are on a new time line).
2002 : *
2003 : * That's why the wake up conditions are for:
2004 : *
2005 : * - physical walsenders in case of new time line and cascade
2006 : * replication is allowed
2007 : * - logical walsenders in case cascade replication is allowed (could not
2008 : * be created otherwise)
2009 : * ------
2010 : */
2011 2948811 : if (AllowCascadeReplication())
2012 2696909 : WalSndWakeup(switchedTLI, true);
2013 :
2014 : /*
2015 : * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2016 : * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2017 : * a reply to the primary.
2018 : */
2019 2948811 : if (doRequestWalReceiverReply)
2020 : {
2021 2 : doRequestWalReceiverReply = false;
2022 2 : WalRcvRequestApplyReply();
2023 : }
2024 :
2025 : /* Allow read-only connections if we're consistent now */
2026 2948811 : CheckRecoveryConsistency();
2027 :
2028 : /* Is this a timeline switch? */
2029 2948811 : if (switchedTLI)
2030 : {
2031 : /*
2032 : * Before we continue on the new timeline, clean up any (possibly
2033 : * bogus) future WAL segments on the old timeline.
2034 : */
2035 13 : RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
2036 :
2037 : /* Reset the prefetcher. */
2038 13 : XLogPrefetchReconfigure();
2039 : }
2040 2948811 : }
2041 :
2042 : /*
2043 : * Some XLOG RM record types that are directly related to WAL recovery are
2044 : * handled here rather than in the xlog_redo()
2045 : */
2046 : static void
2047 115286 : xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
2048 : {
2049 115286 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2050 115286 : XLogRecPtr lsn = record->EndRecPtr;
2051 :
2052 : Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2053 :
2054 115286 : if (info == XLOG_OVERWRITE_CONTRECORD)
2055 : {
2056 : /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2057 : xl_overwrite_contrecord xlrec;
2058 :
2059 1 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2060 1 : if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2061 0 : elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
2062 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2063 : LSN_FORMAT_ARGS(record->overwrittenRecPtr));
2064 :
2065 : /* We have safely skipped the aborted record */
2066 1 : abortedRecPtr = InvalidXLogRecPtr;
2067 1 : missingContrecPtr = InvalidXLogRecPtr;
2068 :
2069 1 : ereport(LOG,
2070 : errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
2071 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2072 : timestamptz_to_str(xlrec.overwrite_time)));
2073 :
2074 : /* Verifying the record should only happen once */
2075 1 : record->overwrittenRecPtr = InvalidXLogRecPtr;
2076 : }
2077 115285 : else if (info == XLOG_BACKUP_END)
2078 : {
2079 : XLogRecPtr startpoint;
2080 :
2081 102 : memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2082 :
2083 102 : if (backupStartPoint == startpoint)
2084 : {
2085 : /*
2086 : * We have reached the end of base backup, the point where
2087 : * pg_backup_stop() was done. The data on disk is now consistent
2088 : * (assuming we have also reached minRecoveryPoint). Set
2089 : * backupEndPoint to the current LSN, so that the next call to
2090 : * CheckRecoveryConsistency() will notice it and do the
2091 : * end-of-backup processing.
2092 : */
2093 84 : elog(DEBUG1, "end of backup record reached");
2094 :
2095 84 : backupEndPoint = lsn;
2096 : }
2097 : else
2098 18 : elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
2099 : LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
2100 : }
2101 115286 : }
2102 :
2103 : /*
2104 : * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2105 : * directories.
2106 : *
2107 : * Replay of database creation XLOG records for databases that were later
2108 : * dropped can create fake directories in pg_tblspc. By the time consistency
2109 : * is reached these directories should have been removed; here we verify
2110 : * that this did indeed happen. This is to be called at the point where
2111 : * consistent state is reached.
2112 : *
2113 : * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2114 : * useful for testing purposes, and also allows for an escape hatch in case
2115 : * things go south.
2116 : */
2117 : static void
2118 130 : CheckTablespaceDirectory(void)
2119 : {
2120 : DIR *dir;
2121 : struct dirent *de;
2122 :
2123 130 : dir = AllocateDir(PG_TBLSPC_DIR);
2124 397 : while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2125 : {
2126 : char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2127 :
2128 : /* Skip entries of non-oid names */
2129 267 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2130 260 : continue;
2131 :
2132 7 : snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2133 :
2134 7 : if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2135 4 : ereport(allow_in_place_tablespaces ? WARNING : PANIC,
2136 : (errcode(ERRCODE_DATA_CORRUPTED),
2137 : errmsg("unexpected directory entry \"%s\" found in %s",
2138 : de->d_name, PG_TBLSPC_DIR),
2139 : errdetail("All directory entries in %s/ should be symbolic links.",
2140 : PG_TBLSPC_DIR),
2141 : errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2142 : }
2143 130 : }
2144 :
2145 : /*
2146 : * Checks if recovery has reached a consistent state. When consistency is
2147 : * reached and we have a valid starting standby snapshot, tell postmaster
2148 : * that it can start accepting read-only connections.
2149 : */
2150 : static void
2151 2949043 : CheckRecoveryConsistency(void)
2152 : {
2153 : XLogRecPtr lastReplayedEndRecPtr;
2154 : TimeLineID lastReplayedTLI;
2155 :
2156 : /*
2157 : * During crash recovery, we don't reach a consistent state until we've
2158 : * replayed all the WAL.
2159 : */
2160 2949043 : if (!XLogRecPtrIsValid(minRecoveryPoint))
2161 322455 : return;
2162 :
2163 : Assert(InArchiveRecovery);
2164 :
2165 : /*
2166 : * assume that we are called in the startup process, and hence don't need
2167 : * a lock to read lastReplayedEndRecPtr
2168 : */
2169 2626588 : lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2170 2626588 : lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2171 :
2172 : /*
2173 : * Have we reached the point where our base backup was completed?
2174 : */
2175 2626588 : if (XLogRecPtrIsValid(backupEndPoint) &&
2176 121 : backupEndPoint <= lastReplayedEndRecPtr)
2177 : {
2178 86 : XLogRecPtr saveBackupStartPoint = backupStartPoint;
2179 86 : XLogRecPtr saveBackupEndPoint = backupEndPoint;
2180 :
2181 86 : elog(DEBUG1, "end of backup reached");
2182 :
2183 : /*
2184 : * We have reached the end of base backup, as indicated by pg_control.
2185 : * Update the control file accordingly.
2186 : */
2187 86 : ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2188 86 : backupStartPoint = InvalidXLogRecPtr;
2189 86 : backupEndPoint = InvalidXLogRecPtr;
2190 86 : backupEndRequired = false;
2191 :
2192 86 : ereport(LOG,
2193 : errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
2194 : LSN_FORMAT_ARGS(saveBackupStartPoint),
2195 : LSN_FORMAT_ARGS(saveBackupEndPoint)));
2196 : }
2197 :
2198 : /*
2199 : * Have we passed our safe starting point? Note that minRecoveryPoint is
2200 : * known to be incorrectly set if recovering from a backup, until the
2201 : * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2202 : * All we know prior to that is that we're not consistent yet.
2203 : */
2204 2626588 : if (!reachedConsistency && !backupEndRequired &&
2205 7965 : minRecoveryPoint <= lastReplayedEndRecPtr)
2206 : {
2207 : /*
2208 : * Check to see if the XLOG sequence contained any unresolved
2209 : * references to uninitialized pages.
2210 : */
2211 130 : XLogCheckInvalidPages();
2212 :
2213 : /*
2214 : * Check that pg_tblspc doesn't contain any real directories. Replay
2215 : * of Database/CREATE_* records may have created fictitious tablespace
2216 : * directories that should have been removed by the time consistency
2217 : * was reached.
2218 : */
2219 130 : CheckTablespaceDirectory();
2220 :
2221 130 : reachedConsistency = true;
2222 130 : SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
2223 130 : ereport(LOG,
2224 : errmsg("consistent recovery state reached at %X/%08X",
2225 : LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
2226 : }
2227 :
2228 : /*
2229 : * Have we got a valid starting snapshot that will allow queries to be
2230 : * run? If so, we can tell postmaster that the database is consistent now,
2231 : * enabling connections.
2232 : */
2233 2626588 : if (standbyState == STANDBY_SNAPSHOT_READY &&
2234 2626332 : !LocalHotStandbyActive &&
2235 121 : reachedConsistency &&
2236 : IsUnderPostmaster)
2237 : {
2238 121 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2239 121 : XLogRecoveryCtl->SharedHotStandbyActive = true;
2240 121 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2241 :
2242 121 : LocalHotStandbyActive = true;
2243 :
2244 121 : SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
2245 : }
2246 : }
2247 :
2248 : /*
2249 : * Error context callback for errors occurring during rm_redo().
2250 : */
2251 : static void
2252 156 : rm_redo_error_callback(void *arg)
2253 : {
2254 156 : XLogReaderState *record = (XLogReaderState *) arg;
2255 : StringInfoData buf;
2256 :
2257 156 : initStringInfo(&buf);
2258 156 : xlog_outdesc(&buf, record);
2259 156 : xlog_block_info(&buf, record);
2260 :
2261 : /* translator: %s is a WAL record description */
2262 156 : errcontext("WAL redo at %X/%08X for %s",
2263 156 : LSN_FORMAT_ARGS(record->ReadRecPtr),
2264 : buf.data);
2265 :
2266 156 : pfree(buf.data);
2267 156 : }
2268 :
2269 : /*
2270 : * Returns a string describing an XLogRecord, consisting of its identity
2271 : * optionally followed by a colon, a space, and a further description.
2272 : */
2273 : void
2274 156 : xlog_outdesc(StringInfo buf, XLogReaderState *record)
2275 : {
2276 156 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2277 156 : uint8 info = XLogRecGetInfo(record);
2278 : const char *id;
2279 :
2280 156 : appendStringInfoString(buf, rmgr.rm_name);
2281 156 : appendStringInfoChar(buf, '/');
2282 :
2283 156 : id = rmgr.rm_identify(info);
2284 156 : if (id == NULL)
2285 0 : appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2286 : else
2287 156 : appendStringInfo(buf, "%s: ", id);
2288 :
2289 156 : rmgr.rm_desc(buf, record);
2290 156 : }
2291 :
2292 : #ifdef WAL_DEBUG
2293 :
2294 : static void
2295 : xlog_outrec(StringInfo buf, XLogReaderState *record)
2296 : {
2297 : appendStringInfo(buf, "prev %X/%08X; xid %u",
2298 : LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
2299 : XLogRecGetXid(record));
2300 :
2301 : appendStringInfo(buf, "; len %u",
2302 : XLogRecGetDataLen(record));
2303 :
2304 : xlog_block_info(buf, record);
2305 : }
2306 : #endif /* WAL_DEBUG */
2307 :
2308 : /*
2309 : * Returns a string giving information about all the blocks in an
2310 : * XLogRecord.
2311 : */
2312 : static void
2313 156 : xlog_block_info(StringInfo buf, XLogReaderState *record)
2314 : {
2315 : int block_id;
2316 :
2317 : /* decode block references */
2318 211 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2319 : {
2320 : RelFileLocator rlocator;
2321 : ForkNumber forknum;
2322 : BlockNumber blk;
2323 :
2324 55 : if (!XLogRecGetBlockTagExtended(record, block_id,
2325 : &rlocator, &forknum, &blk, NULL))
2326 0 : continue;
2327 :
2328 55 : if (forknum != MAIN_FORKNUM)
2329 7 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2330 : block_id,
2331 : rlocator.spcOid, rlocator.dbOid,
2332 : rlocator.relNumber,
2333 : forknum,
2334 : blk);
2335 : else
2336 48 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2337 : block_id,
2338 : rlocator.spcOid, rlocator.dbOid,
2339 : rlocator.relNumber,
2340 : blk);
2341 55 : if (XLogRecHasBlockImage(record, block_id))
2342 34 : appendStringInfoString(buf, " FPW");
2343 : }
2344 156 : }
2345 :
2346 :
2347 : /*
2348 : * Check that it's OK to switch to new timeline during recovery.
2349 : *
2350 : * 'lsn' is the address of the shutdown checkpoint record we're about to
2351 : * replay. (Currently, timeline can only change at a shutdown checkpoint).
2352 : */
2353 : static void
2354 13 : checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
2355 : TimeLineID replayTLI)
2356 : {
2357 : /* Check that the record agrees on what the current (old) timeline is */
2358 13 : if (prevTLI != replayTLI)
2359 0 : ereport(PANIC,
2360 : (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2361 : prevTLI, replayTLI)));
2362 :
2363 : /*
2364 : * The new timeline better be in the list of timelines we expect to see,
2365 : * according to the timeline history. It should also not decrease.
2366 : */
2367 13 : if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2368 0 : ereport(PANIC,
2369 : (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2370 : newTLI, replayTLI)));
2371 :
2372 : /*
2373 : * If we have not yet reached min recovery point, and we're about to
2374 : * switch to a timeline greater than the timeline of the min recovery
2375 : * point: trouble. After switching to the new timeline, we could not
2376 : * possibly visit the min recovery point on the correct timeline anymore.
2377 : * This can happen if there is a newer timeline in the archive that
2378 : * branched before the timeline the min recovery point is on, and you
2379 : * attempt to do PITR to the new timeline.
2380 : */
2381 13 : if (XLogRecPtrIsValid(minRecoveryPoint) &&
2382 11 : lsn < minRecoveryPoint &&
2383 1 : newTLI > minRecoveryPointTLI)
2384 0 : ereport(PANIC,
2385 : errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
2386 : newTLI,
2387 : LSN_FORMAT_ARGS(minRecoveryPoint),
2388 : minRecoveryPointTLI));
2389 :
2390 : /* Looks good */
2391 13 : }
2392 :
2393 :
2394 : /*
2395 : * Extract timestamp from WAL record.
2396 : *
2397 : * If the record contains a timestamp, returns true, and saves the timestamp
2398 : * in *recordXtime. If the record type has no timestamp, returns false.
2399 : * Currently, only transaction commit/abort records and restore points contain
2400 : * timestamps.
2401 : */
2402 : static bool
2403 47294 : getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
2404 : {
2405 47294 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2406 47294 : uint8 xact_info = info & XLOG_XACT_OPMASK;
2407 47294 : uint8 rmid = XLogRecGetRmid(record);
2408 :
2409 47294 : if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2410 : {
2411 2 : *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2412 2 : return true;
2413 : }
2414 47292 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2415 : xact_info == XLOG_XACT_COMMIT_PREPARED))
2416 : {
2417 43326 : *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2418 43326 : return true;
2419 : }
2420 3966 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2421 : xact_info == XLOG_XACT_ABORT_PREPARED))
2422 : {
2423 3966 : *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2424 3966 : return true;
2425 : }
2426 0 : return false;
2427 : }
2428 :
2429 : /*
2430 : * Checks whether the current buffer page and backup page stored in the
2431 : * WAL record are consistent or not. Before comparing the two pages, a
2432 : * masking can be applied to the pages to ignore certain areas like hint bits,
2433 : * unused space between pd_lower and pd_upper among other things. This
2434 : * function should be called once WAL replay has been completed for a
2435 : * given record.
2436 : */
2437 : static void
2438 2264676 : verifyBackupPageConsistency(XLogReaderState *record)
2439 : {
2440 2264676 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2441 : RelFileLocator rlocator;
2442 : ForkNumber forknum;
2443 : BlockNumber blkno;
2444 : int block_id;
2445 :
2446 : /* Records with no backup blocks have no need for consistency checks. */
2447 2264676 : if (!XLogRecHasAnyBlockRefs(record))
2448 53 : return;
2449 :
2450 : Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2451 :
2452 4702296 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2453 : {
2454 : Buffer buf;
2455 : Page page;
2456 :
2457 2437673 : if (!XLogRecGetBlockTagExtended(record, block_id,
2458 : &rlocator, &forknum, &blkno, NULL))
2459 : {
2460 : /*
2461 : * WAL record doesn't contain a block reference with the given id.
2462 : * Do nothing.
2463 : */
2464 2089 : continue;
2465 : }
2466 :
2467 : Assert(XLogRecHasBlockImage(record, block_id));
2468 :
2469 2435584 : if (XLogRecBlockImageApply(record, block_id))
2470 : {
2471 : /*
2472 : * WAL record has already applied the page, so bypass the
2473 : * consistency check as that would result in comparing the full
2474 : * page stored in the record with itself.
2475 : */
2476 29201 : continue;
2477 : }
2478 :
2479 : /*
2480 : * Read the contents from the current buffer and store it in a
2481 : * temporary page.
2482 : */
2483 2406383 : buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2484 : RBM_NORMAL_NO_LOG,
2485 : InvalidBuffer);
2486 2406383 : if (!BufferIsValid(buf))
2487 0 : continue;
2488 :
2489 2406383 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2490 2406383 : page = BufferGetPage(buf);
2491 :
2492 : /*
2493 : * Take a copy of the local page where WAL has been applied to have a
2494 : * comparison base before masking it...
2495 : */
2496 2406383 : memcpy(replay_image_masked, page, BLCKSZ);
2497 :
2498 : /* No need for this page anymore now that a copy is in. */
2499 2406383 : UnlockReleaseBuffer(buf);
2500 :
2501 : /*
2502 : * If the block LSN is already ahead of this WAL record, we can't
2503 : * expect contents to match. This can happen if recovery is
2504 : * restarted.
2505 : */
2506 2406383 : if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2507 0 : continue;
2508 :
2509 : /*
2510 : * Read the contents from the backup copy, stored in WAL record and
2511 : * store it in a temporary page. There is no need to allocate a new
2512 : * page here, a local buffer is fine to hold its contents and a mask
2513 : * can be directly applied on it.
2514 : */
2515 2406383 : if (!RestoreBlockImage(record, block_id, primary_image_masked))
2516 0 : ereport(ERROR,
2517 : (errcode(ERRCODE_INTERNAL_ERROR),
2518 : errmsg_internal("%s", record->errormsg_buf)));
2519 :
2520 : /*
2521 : * If masking function is defined, mask both the primary and replay
2522 : * images
2523 : */
2524 2406383 : if (rmgr.rm_mask != NULL)
2525 : {
2526 2406383 : rmgr.rm_mask(replay_image_masked, blkno);
2527 2406383 : rmgr.rm_mask(primary_image_masked, blkno);
2528 : }
2529 :
2530 : /* Time to compare the primary and replay images. */
2531 2406383 : if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2532 : {
2533 0 : elog(FATAL,
2534 : "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2535 : rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2536 : forknum, blkno);
2537 : }
2538 : }
2539 : }
2540 :
2541 : /*
2542 : * For point-in-time recovery, this function decides whether we want to
2543 : * stop applying the XLOG before the current record.
2544 : *
2545 : * Returns true if we are stopping, false otherwise. If stopping, some
2546 : * information is saved in recoveryStopXid et al for use in annotating the
2547 : * new timeline's history file.
2548 : */
2549 : static bool
2550 2948815 : recoveryStopsBefore(XLogReaderState *record)
2551 : {
2552 2948815 : bool stopsHere = false;
2553 : uint8 xact_info;
2554 : bool isCommit;
2555 2948815 : TimestampTz recordXtime = 0;
2556 : TransactionId recordXid;
2557 :
2558 : /*
2559 : * Ignore recovery target settings when not in archive recovery (meaning
2560 : * we are in crash recovery).
2561 : */
2562 2948815 : if (!ArchiveRecoveryRequested)
2563 307861 : return false;
2564 :
2565 : /* Check if we should stop as soon as reaching consistency */
2566 2640954 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2567 : {
2568 0 : ereport(LOG,
2569 : (errmsg("recovery stopping after reaching consistency")));
2570 :
2571 0 : recoveryStopAfter = false;
2572 0 : recoveryStopXid = InvalidTransactionId;
2573 0 : recoveryStopLSN = InvalidXLogRecPtr;
2574 0 : recoveryStopTime = 0;
2575 0 : recoveryStopName[0] = '\0';
2576 0 : return true;
2577 : }
2578 :
2579 : /* Check if target LSN has been reached */
2580 2640954 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2581 8553 : !recoveryTargetInclusive &&
2582 481 : record->ReadRecPtr >= recoveryTargetLSN)
2583 : {
2584 2 : recoveryStopAfter = false;
2585 2 : recoveryStopXid = InvalidTransactionId;
2586 2 : recoveryStopLSN = record->ReadRecPtr;
2587 2 : recoveryStopTime = 0;
2588 2 : recoveryStopName[0] = '\0';
2589 2 : ereport(LOG,
2590 : errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
2591 : LSN_FORMAT_ARGS(recoveryStopLSN)));
2592 2 : return true;
2593 : }
2594 :
2595 : /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2596 2640952 : if (XLogRecGetRmid(record) != RM_XACT_ID)
2597 2617019 : return false;
2598 :
2599 23933 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2600 :
2601 23933 : if (xact_info == XLOG_XACT_COMMIT)
2602 : {
2603 21623 : isCommit = true;
2604 21623 : recordXid = XLogRecGetXid(record);
2605 : }
2606 2310 : else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2607 : {
2608 26 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2609 : xl_xact_parsed_commit parsed;
2610 :
2611 26 : isCommit = true;
2612 26 : ParseCommitRecord(XLogRecGetInfo(record),
2613 : xlrec,
2614 : &parsed);
2615 26 : recordXid = parsed.twophase_xid;
2616 : }
2617 2284 : else if (xact_info == XLOG_XACT_ABORT)
2618 : {
2619 1968 : isCommit = false;
2620 1968 : recordXid = XLogRecGetXid(record);
2621 : }
2622 316 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2623 : {
2624 15 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2625 : xl_xact_parsed_abort parsed;
2626 :
2627 15 : isCommit = false;
2628 15 : ParseAbortRecord(XLogRecGetInfo(record),
2629 : xlrec,
2630 : &parsed);
2631 15 : recordXid = parsed.twophase_xid;
2632 : }
2633 : else
2634 301 : return false;
2635 :
2636 23632 : if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
2637 : {
2638 : /*
2639 : * There can be only one transaction end record with this exact
2640 : * transactionid
2641 : *
2642 : * when testing for an xid, we MUST test for equality only, since
2643 : * transactions are numbered in the order they start, not the order
2644 : * they complete. A higher numbered xid will complete before you about
2645 : * 50% of the time...
2646 : */
2647 0 : stopsHere = (recordXid == recoveryTargetXid);
2648 : }
2649 :
2650 : /*
2651 : * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2652 : * We don't expect getRecordTimestamp ever to fail, since we already know
2653 : * this is a commit or abort record; but test its result anyway.
2654 : */
2655 23632 : if (getRecordTimestamp(record, &recordXtime) &&
2656 23632 : recoveryTarget == RECOVERY_TARGET_TIME)
2657 : {
2658 : /*
2659 : * There can be many transactions that share the same commit time, so
2660 : * we stop after the last one, if we are inclusive, or stop at the
2661 : * first one if we are exclusive
2662 : */
2663 0 : if (recoveryTargetInclusive)
2664 0 : stopsHere = (recordXtime > recoveryTargetTime);
2665 : else
2666 0 : stopsHere = (recordXtime >= recoveryTargetTime);
2667 : }
2668 :
2669 23632 : if (stopsHere)
2670 : {
2671 0 : recoveryStopAfter = false;
2672 0 : recoveryStopXid = recordXid;
2673 0 : recoveryStopTime = recordXtime;
2674 0 : recoveryStopLSN = InvalidXLogRecPtr;
2675 0 : recoveryStopName[0] = '\0';
2676 :
2677 0 : if (isCommit)
2678 : {
2679 0 : ereport(LOG,
2680 : (errmsg("recovery stopping before commit of transaction %u, time %s",
2681 : recoveryStopXid,
2682 : timestamptz_to_str(recoveryStopTime))));
2683 : }
2684 : else
2685 : {
2686 0 : ereport(LOG,
2687 : (errmsg("recovery stopping before abort of transaction %u, time %s",
2688 : recoveryStopXid,
2689 : timestamptz_to_str(recoveryStopTime))));
2690 : }
2691 : }
2692 :
2693 23632 : return stopsHere;
2694 : }
2695 :
2696 : /*
2697 : * Same as recoveryStopsBefore, but called after applying the record.
2698 : *
2699 : * We also track the timestamp of the latest applied COMMIT/ABORT
2700 : * record in XLogRecoveryCtl->recoveryLastXTime.
2701 : */
2702 : static bool
2703 2948811 : recoveryStopsAfter(XLogReaderState *record)
2704 : {
2705 : uint8 info;
2706 : uint8 xact_info;
2707 : uint8 rmid;
2708 2948811 : TimestampTz recordXtime = 0;
2709 :
2710 : /*
2711 : * Ignore recovery target settings when not in archive recovery (meaning
2712 : * we are in crash recovery).
2713 : */
2714 2948811 : if (!ArchiveRecoveryRequested)
2715 307861 : return false;
2716 :
2717 2640950 : info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2718 2640950 : rmid = XLogRecGetRmid(record);
2719 :
2720 : /*
2721 : * There can be many restore points that share the same name; we stop at
2722 : * the first one.
2723 : */
2724 2640950 : if (recoveryTarget == RECOVERY_TARGET_NAME &&
2725 20 : rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2726 : {
2727 : xl_restore_point *recordRestorePointData;
2728 :
2729 3 : recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2730 :
2731 3 : if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2732 : {
2733 2 : recoveryStopAfter = true;
2734 2 : recoveryStopXid = InvalidTransactionId;
2735 2 : recoveryStopLSN = InvalidXLogRecPtr;
2736 2 : (void) getRecordTimestamp(record, &recoveryStopTime);
2737 2 : strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2738 :
2739 2 : ereport(LOG,
2740 : (errmsg("recovery stopping at restore point \"%s\", time %s",
2741 : recoveryStopName,
2742 : timestamptz_to_str(recoveryStopTime))));
2743 2 : return true;
2744 : }
2745 : }
2746 :
2747 : /* Check if the target LSN has been reached */
2748 2640948 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2749 8072 : recoveryTargetInclusive &&
2750 8072 : record->ReadRecPtr >= recoveryTargetLSN)
2751 : {
2752 3 : recoveryStopAfter = true;
2753 3 : recoveryStopXid = InvalidTransactionId;
2754 3 : recoveryStopLSN = record->ReadRecPtr;
2755 3 : recoveryStopTime = 0;
2756 3 : recoveryStopName[0] = '\0';
2757 3 : ereport(LOG,
2758 : errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
2759 : LSN_FORMAT_ARGS(recoveryStopLSN)));
2760 3 : return true;
2761 : }
2762 :
2763 2640945 : if (rmid != RM_XACT_ID)
2764 2617014 : return false;
2765 :
2766 23931 : xact_info = info & XLOG_XACT_OPMASK;
2767 :
2768 23931 : if (xact_info == XLOG_XACT_COMMIT ||
2769 2284 : xact_info == XLOG_XACT_COMMIT_PREPARED ||
2770 316 : xact_info == XLOG_XACT_ABORT ||
2771 : xact_info == XLOG_XACT_ABORT_PREPARED)
2772 : {
2773 : TransactionId recordXid;
2774 :
2775 : /* Update the last applied transaction timestamp */
2776 23630 : if (getRecordTimestamp(record, &recordXtime))
2777 23630 : SetLatestXTime(recordXtime);
2778 :
2779 : /* Extract the XID of the committed/aborted transaction */
2780 23630 : if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2781 : {
2782 26 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2783 : xl_xact_parsed_commit parsed;
2784 :
2785 26 : ParseCommitRecord(XLogRecGetInfo(record),
2786 : xlrec,
2787 : &parsed);
2788 26 : recordXid = parsed.twophase_xid;
2789 : }
2790 23604 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2791 : {
2792 15 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2793 : xl_xact_parsed_abort parsed;
2794 :
2795 15 : ParseAbortRecord(XLogRecGetInfo(record),
2796 : xlrec,
2797 : &parsed);
2798 15 : recordXid = parsed.twophase_xid;
2799 : }
2800 : else
2801 23589 : recordXid = XLogRecGetXid(record);
2802 :
2803 : /*
2804 : * There can be only one transaction end record with this exact
2805 : * transactionid
2806 : *
2807 : * when testing for an xid, we MUST test for equality only, since
2808 : * transactions are numbered in the order they start, not the order
2809 : * they complete. A higher numbered xid will complete before you about
2810 : * 50% of the time...
2811 : */
2812 23630 : if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
2813 0 : recordXid == recoveryTargetXid)
2814 : {
2815 0 : recoveryStopAfter = true;
2816 0 : recoveryStopXid = recordXid;
2817 0 : recoveryStopTime = recordXtime;
2818 0 : recoveryStopLSN = InvalidXLogRecPtr;
2819 0 : recoveryStopName[0] = '\0';
2820 :
2821 0 : if (xact_info == XLOG_XACT_COMMIT ||
2822 : xact_info == XLOG_XACT_COMMIT_PREPARED)
2823 : {
2824 0 : ereport(LOG,
2825 : (errmsg("recovery stopping after commit of transaction %u, time %s",
2826 : recoveryStopXid,
2827 : timestamptz_to_str(recoveryStopTime))));
2828 : }
2829 0 : else if (xact_info == XLOG_XACT_ABORT ||
2830 : xact_info == XLOG_XACT_ABORT_PREPARED)
2831 : {
2832 0 : ereport(LOG,
2833 : (errmsg("recovery stopping after abort of transaction %u, time %s",
2834 : recoveryStopXid,
2835 : timestamptz_to_str(recoveryStopTime))));
2836 : }
2837 0 : return true;
2838 : }
2839 : }
2840 :
2841 : /* Check if we should stop as soon as reaching consistency */
2842 23931 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2843 : {
2844 0 : ereport(LOG,
2845 : (errmsg("recovery stopping after reaching consistency")));
2846 :
2847 0 : recoveryStopAfter = true;
2848 0 : recoveryStopXid = InvalidTransactionId;
2849 0 : recoveryStopTime = 0;
2850 0 : recoveryStopLSN = InvalidXLogRecPtr;
2851 0 : recoveryStopName[0] = '\0';
2852 0 : return true;
2853 : }
2854 :
2855 23931 : return false;
2856 : }
2857 :
2858 : /*
2859 : * Create a comment for the history file to explain why and where
2860 : * timeline changed.
2861 : */
2862 : static char *
2863 1018 : getRecoveryStopReason(void)
2864 : {
2865 : char reason[200];
2866 :
2867 1018 : if (recoveryTarget == RECOVERY_TARGET_XID)
2868 0 : snprintf(reason, sizeof(reason),
2869 : "%s transaction %u",
2870 0 : recoveryStopAfter ? "after" : "before",
2871 : recoveryStopXid);
2872 1018 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
2873 0 : snprintf(reason, sizeof(reason),
2874 : "%s %s\n",
2875 0 : recoveryStopAfter ? "after" : "before",
2876 : timestamptz_to_str(recoveryStopTime));
2877 1018 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
2878 6 : snprintf(reason, sizeof(reason),
2879 : "%s LSN %X/%08X\n",
2880 6 : recoveryStopAfter ? "after" : "before",
2881 6 : LSN_FORMAT_ARGS(recoveryStopLSN));
2882 1012 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
2883 3 : snprintf(reason, sizeof(reason),
2884 : "at restore point \"%s\"",
2885 : recoveryStopName);
2886 1009 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
2887 0 : snprintf(reason, sizeof(reason), "reached consistency");
2888 : else
2889 1009 : snprintf(reason, sizeof(reason), "no recovery target specified");
2890 :
2891 1018 : return pstrdup(reason);
2892 : }
2893 :
2894 : /*
2895 : * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2896 : *
2897 : * endOfRecovery is true if the recovery target is reached and
2898 : * the paused state starts at the end of recovery because of
2899 : * recovery_target_action=pause, and false otherwise.
2900 : */
2901 : static void
2902 9 : recoveryPausesHere(bool endOfRecovery)
2903 : {
2904 : /* Don't pause unless users can connect! */
2905 9 : if (!LocalHotStandbyActive)
2906 0 : return;
2907 :
2908 : /* Don't pause after standby promotion has been triggered */
2909 9 : if (LocalPromoteIsTriggered)
2910 0 : return;
2911 :
2912 9 : if (endOfRecovery)
2913 1 : ereport(LOG,
2914 : (errmsg("pausing at the end of recovery"),
2915 : errhint("Execute pg_wal_replay_resume() to promote.")));
2916 : else
2917 8 : ereport(LOG,
2918 : (errmsg("recovery has paused"),
2919 : errhint("Execute pg_wal_replay_resume() to continue.")));
2920 :
2921 : /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2922 30 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
2923 : {
2924 23 : ProcessStartupProcInterrupts();
2925 23 : if (CheckForStandbyTrigger())
2926 2 : return;
2927 :
2928 : /*
2929 : * If recovery pause is requested then set it paused. While we are in
2930 : * the loop, user might resume and pause again so set this every time.
2931 : */
2932 21 : ConfirmRecoveryPaused();
2933 :
2934 : /*
2935 : * We wait on a condition variable that will wake us as soon as the
2936 : * pause ends, but we use a timeout so we can check the above exit
2937 : * condition periodically too.
2938 : */
2939 21 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
2940 : WAIT_EVENT_RECOVERY_PAUSE);
2941 : }
2942 7 : ConditionVariableCancelSleep();
2943 : }
2944 :
2945 : /*
2946 : * When recovery_min_apply_delay is set, we wait long enough to make sure
2947 : * certain record types are applied at least that interval behind the primary.
2948 : *
2949 : * Returns true if we waited.
2950 : *
2951 : * Note that the delay is calculated between the WAL record log time and
2952 : * the current time on standby. We would prefer to keep track of when this
2953 : * standby received each WAL record, which would allow a more consistent
2954 : * approach and one not affected by time synchronisation issues, but that
2955 : * is significantly more effort and complexity for little actual gain in
2956 : * usability.
2957 : */
2958 : static bool
2959 2948813 : recoveryApplyDelay(XLogReaderState *record)
2960 : {
2961 : uint8 xact_info;
2962 : TimestampTz xtime;
2963 : TimestampTz delayUntil;
2964 : long msecs;
2965 :
2966 : /* nothing to do if no delay configured */
2967 2948813 : if (recovery_min_apply_delay <= 0)
2968 2948660 : return false;
2969 :
2970 : /* no delay is applied on a database not yet consistent */
2971 153 : if (!reachedConsistency)
2972 4 : return false;
2973 :
2974 : /* nothing to do if crash recovery is requested */
2975 149 : if (!ArchiveRecoveryRequested)
2976 0 : return false;
2977 :
2978 : /*
2979 : * Is it a COMMIT record?
2980 : *
2981 : * We deliberately choose not to delay aborts since they have no effect on
2982 : * MVCC. We already allow replay of records that don't have a timestamp,
2983 : * so there is already opportunity for issues caused by early conflicts on
2984 : * standbys.
2985 : */
2986 149 : if (XLogRecGetRmid(record) != RM_XACT_ID)
2987 119 : return false;
2988 :
2989 30 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2990 :
2991 30 : if (xact_info != XLOG_XACT_COMMIT &&
2992 : xact_info != XLOG_XACT_COMMIT_PREPARED)
2993 0 : return false;
2994 :
2995 30 : if (!getRecordTimestamp(record, &xtime))
2996 0 : return false;
2997 :
2998 30 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
2999 :
3000 : /*
3001 : * Exit without arming the latch if it's already past time to apply this
3002 : * record
3003 : */
3004 30 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
3005 30 : if (msecs <= 0)
3006 13 : return false;
3007 :
3008 : while (true)
3009 : {
3010 45 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3011 :
3012 : /* This might change recovery_min_apply_delay. */
3013 45 : ProcessStartupProcInterrupts();
3014 :
3015 45 : if (CheckForStandbyTrigger())
3016 0 : break;
3017 :
3018 : /*
3019 : * Recalculate delayUntil as recovery_min_apply_delay could have
3020 : * changed while waiting in this loop.
3021 : */
3022 45 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3023 :
3024 : /*
3025 : * Wait for difference between GetCurrentTimestamp() and delayUntil.
3026 : */
3027 45 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
3028 : delayUntil);
3029 :
3030 45 : if (msecs <= 0)
3031 17 : break;
3032 :
3033 28 : elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3034 :
3035 28 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3036 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
3037 : msecs,
3038 : WAIT_EVENT_RECOVERY_APPLY_DELAY);
3039 : }
3040 17 : return true;
3041 : }
3042 :
3043 : /*
3044 : * Get the current state of the recovery pause request.
3045 : */
3046 : RecoveryPauseState
3047 39 : GetRecoveryPauseState(void)
3048 : {
3049 : RecoveryPauseState state;
3050 :
3051 39 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3052 39 : state = XLogRecoveryCtl->recoveryPauseState;
3053 39 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3054 :
3055 39 : return state;
3056 : }
3057 :
3058 : /*
3059 : * Set the recovery pause state.
3060 : *
3061 : * If recovery pause is requested then sets the recovery pause state to
3062 : * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3063 : * to 'not paused' to resume the recovery. The recovery pause will be
3064 : * confirmed by the ConfirmRecoveryPaused.
3065 : */
3066 : void
3067 66 : SetRecoveryPause(bool recoveryPause)
3068 : {
3069 66 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3070 :
3071 66 : if (!recoveryPause)
3072 57 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
3073 9 : else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
3074 9 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
3075 :
3076 66 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3077 :
3078 66 : if (!recoveryPause)
3079 57 : ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
3080 66 : }
3081 :
3082 : /*
3083 : * Confirm the recovery pause by setting the recovery pause state to
3084 : * RECOVERY_PAUSED.
3085 : */
3086 : static void
3087 21 : ConfirmRecoveryPaused(void)
3088 : {
3089 : /* If recovery pause is requested then set it paused */
3090 21 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3091 21 : if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
3092 9 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
3093 21 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3094 21 : }
3095 :
3096 :
3097 : /*
3098 : * Attempt to read the next XLOG record.
3099 : *
3100 : * Before first call, the reader needs to be positioned to the first record
3101 : * by calling XLogPrefetcherBeginRead().
3102 : *
3103 : * If no valid record is available, returns NULL, or fails if emode is PANIC.
3104 : * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3105 : * record is available.
3106 : */
3107 : static XLogRecord *
3108 2951274 : ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
3109 : bool fetching_ckpt, TimeLineID replayTLI)
3110 : {
3111 : XLogRecord *record;
3112 2951274 : XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher);
3113 2951274 : XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3114 :
3115 : Assert(AmStartupProcess() || !IsUnderPostmaster);
3116 :
3117 : /* Pass through parameters to XLogPageRead */
3118 2951274 : private->fetching_ckpt = fetching_ckpt;
3119 2951274 : private->emode = emode;
3120 2951274 : private->randAccess = !XLogRecPtrIsValid(xlogreader->ReadRecPtr);
3121 2951274 : private->replayTLI = replayTLI;
3122 :
3123 : /* This is the first attempt to read this page. */
3124 2951274 : lastSourceFailed = false;
3125 :
3126 : for (;;)
3127 162 : {
3128 : char *errormsg;
3129 :
3130 2951436 : record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3131 2951373 : if (record == NULL)
3132 : {
3133 : /*
3134 : * When we find that WAL ends in an incomplete record, keep track
3135 : * of that record. After recovery is done, we'll write a record
3136 : * to indicate to downstream WAL readers that that portion is to
3137 : * be ignored.
3138 : *
3139 : * However, when ArchiveRecoveryRequested = true, we're going to
3140 : * switch to a new timeline at the end of recovery. We will only
3141 : * copy WAL over to the new timeline up to the end of the last
3142 : * complete record, so if we did this, we would later create an
3143 : * overwrite contrecord in the wrong place, breaking everything.
3144 : */
3145 323 : if (!ArchiveRecoveryRequested &&
3146 110 : XLogRecPtrIsValid(xlogreader->abortedRecPtr))
3147 : {
3148 11 : abortedRecPtr = xlogreader->abortedRecPtr;
3149 11 : missingContrecPtr = xlogreader->missingContrecPtr;
3150 : }
3151 :
3152 323 : if (readFile >= 0)
3153 : {
3154 296 : close(readFile);
3155 296 : readFile = -1;
3156 : }
3157 :
3158 : /*
3159 : * We only end up here without a message when XLogPageRead()
3160 : * failed - in that case we already logged something. In
3161 : * StandbyMode that only happens if we have been triggered, so we
3162 : * shouldn't loop anymore in that case.
3163 : */
3164 323 : if (errormsg)
3165 296 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3166 : (errmsg_internal("%s", errormsg) /* already translated */ ));
3167 : }
3168 :
3169 : /*
3170 : * Check page TLI is one of the expected values.
3171 : */
3172 2951050 : else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3173 : {
3174 : char fname[MAXFNAMELEN];
3175 : XLogSegNo segno;
3176 : int32 offset;
3177 :
3178 0 : XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
3179 0 : offset = XLogSegmentOffset(xlogreader->latestPagePtr,
3180 : wal_segment_size);
3181 0 : XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3182 : wal_segment_size);
3183 0 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3184 : errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
3185 : xlogreader->latestPageTLI,
3186 : fname,
3187 : LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
3188 : offset));
3189 0 : record = NULL;
3190 : }
3191 :
3192 2951373 : if (record)
3193 : {
3194 : /* Great, got a record */
3195 2951211 : return record;
3196 : }
3197 : else
3198 : {
3199 : /* No valid record available from this source */
3200 323 : lastSourceFailed = true;
3201 :
3202 : /*
3203 : * If archive recovery was requested, but we were still doing
3204 : * crash recovery, switch to archive recovery and retry using the
3205 : * offline archive. We have now replayed all the valid WAL in
3206 : * pg_wal, so we are presumably now consistent.
3207 : *
3208 : * We require that there's at least some valid WAL present in
3209 : * pg_wal, however (!fetching_ckpt). We could recover using the
3210 : * WAL from the archive, even if pg_wal is completely empty, but
3211 : * we'd have no idea how far we'd have to replay to reach
3212 : * consistency. So err on the safe side and give up.
3213 : */
3214 323 : if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3215 1 : !fetching_ckpt)
3216 : {
3217 1 : ereport(DEBUG1,
3218 : (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3219 1 : InArchiveRecovery = true;
3220 1 : if (StandbyModeRequested)
3221 1 : EnableStandbyMode();
3222 :
3223 1 : SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
3224 1 : minRecoveryPoint = xlogreader->EndRecPtr;
3225 1 : minRecoveryPointTLI = replayTLI;
3226 :
3227 1 : CheckRecoveryConsistency();
3228 :
3229 : /*
3230 : * Before we retry, reset lastSourceFailed and currentSource
3231 : * so that we will check the archive next.
3232 : */
3233 1 : lastSourceFailed = false;
3234 1 : currentSource = XLOG_FROM_ANY;
3235 :
3236 162 : continue;
3237 : }
3238 :
3239 : /* In standby mode, loop back to retry. Otherwise, give up. */
3240 322 : if (StandbyMode && !CheckForStandbyTrigger())
3241 161 : continue;
3242 : else
3243 161 : return NULL;
3244 : }
3245 : }
3246 : }
3247 :
3248 : /*
3249 : * Read the XLOG page containing targetPagePtr into readBuf (if not read
3250 : * already). Returns number of bytes read, if the page is read successfully,
3251 : * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3252 : * but only if they have not been previously reported.
3253 : *
3254 : * See XLogReaderRoutine.page_read for more details.
3255 : *
3256 : * While prefetching, xlogreader->nonblocking may be set. In that case,
3257 : * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3258 : *
3259 : * This is responsible for restoring files from archive as needed, as well
3260 : * as for waiting for the requested WAL record to arrive in standby mode.
3261 : *
3262 : * xlogreader->private_data->emode specifies the log level used for reporting
3263 : * "file not found" or "end of WAL" situations in archive recovery, or in
3264 : * standby mode when promotion is triggered. If set to WARNING or below,
3265 : * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3266 : * levels the ereport() won't return.
3267 : *
3268 : * In standby mode, if after a successful return of XLogPageRead() the
3269 : * caller finds the record it's interested in to be broken, it should
3270 : * ereport the error with the level determined by
3271 : * emode_for_corrupt_record(), and then set lastSourceFailed
3272 : * and call XLogPageRead() again with the same arguments. This lets
3273 : * XLogPageRead() to try fetching the record from another source, or to
3274 : * sleep and retry.
3275 : */
3276 : static int
3277 1524198 : XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
3278 : XLogRecPtr targetRecPtr, char *readBuf)
3279 : {
3280 1524198 : XLogPageReadPrivate *private =
3281 : (XLogPageReadPrivate *) xlogreader->private_data;
3282 1524198 : int emode = private->emode;
3283 : uint32 targetPageOff;
3284 : XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3285 : int r;
3286 : instr_time io_start;
3287 :
3288 : Assert(AmStartupProcess() || !IsUnderPostmaster);
3289 :
3290 1524198 : XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3291 1524198 : targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3292 :
3293 : /*
3294 : * See if we need to switch to a new segment because the requested record
3295 : * is not in the currently open one.
3296 : */
3297 1524198 : if (readFile >= 0 &&
3298 1522330 : !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3299 : {
3300 : /*
3301 : * Request a restartpoint if we've replayed too much xlog since the
3302 : * last one.
3303 : */
3304 1514 : if (ArchiveRecoveryRequested && IsUnderPostmaster)
3305 : {
3306 1497 : if (XLogCheckpointNeeded(readSegNo))
3307 : {
3308 1360 : (void) GetRedoRecPtr();
3309 1360 : if (XLogCheckpointNeeded(readSegNo))
3310 1352 : RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
3311 : }
3312 : }
3313 :
3314 1514 : close(readFile);
3315 1514 : readFile = -1;
3316 1514 : readSource = XLOG_FROM_ANY;
3317 : }
3318 :
3319 1524198 : XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3320 :
3321 1524201 : retry:
3322 : /* See if we need to retrieve more data */
3323 1524201 : if (readFile < 0 ||
3324 1520816 : (readSource == XLOG_FROM_STREAM &&
3325 1506975 : flushedUpto < targetPagePtr + reqLen))
3326 : {
3327 37069 : if (readFile >= 0 &&
3328 33684 : xlogreader->nonblocking &&
3329 16661 : readSource == XLOG_FROM_STREAM &&
3330 16661 : flushedUpto < targetPagePtr + reqLen)
3331 16661 : return XLREAD_WOULDBLOCK;
3332 :
3333 20345 : switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3334 20408 : private->randAccess,
3335 20408 : private->fetching_ckpt,
3336 : targetRecPtr,
3337 : private->replayTLI,
3338 : xlogreader->EndRecPtr,
3339 20408 : xlogreader->nonblocking))
3340 : {
3341 435 : case XLREAD_WOULDBLOCK:
3342 435 : return XLREAD_WOULDBLOCK;
3343 52 : case XLREAD_FAIL:
3344 52 : if (readFile >= 0)
3345 0 : close(readFile);
3346 52 : readFile = -1;
3347 52 : readLen = 0;
3348 52 : readSource = XLOG_FROM_ANY;
3349 52 : return XLREAD_FAIL;
3350 19858 : case XLREAD_SUCCESS:
3351 19858 : break;
3352 : }
3353 : }
3354 :
3355 : /*
3356 : * At this point, we have the right segment open and if we're streaming we
3357 : * know the requested record is in it.
3358 : */
3359 : Assert(readFile != -1);
3360 :
3361 : /*
3362 : * If the current segment is being streamed from the primary, calculate
3363 : * how much of the current page we have received already. We know the
3364 : * requested record has been received, but this is for the benefit of
3365 : * future calls, to allow quick exit at the top of this function.
3366 : */
3367 1506990 : if (readSource == XLOG_FROM_STREAM)
3368 : {
3369 1491387 : if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3370 1479471 : readLen = XLOG_BLCKSZ;
3371 : else
3372 11916 : readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
3373 : targetPageOff;
3374 : }
3375 : else
3376 15603 : readLen = XLOG_BLCKSZ;
3377 :
3378 : /* Read the requested page */
3379 1506990 : readOff = targetPageOff;
3380 :
3381 : /* Measure I/O timing when reading segment */
3382 1506990 : io_start = pgstat_prepare_io_time(track_wal_io_timing);
3383 :
3384 1506990 : pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3385 1506990 : r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (pgoff_t) readOff);
3386 1506990 : if (r != XLOG_BLCKSZ)
3387 : {
3388 : char fname[MAXFNAMELEN];
3389 0 : int save_errno = errno;
3390 :
3391 0 : pgstat_report_wait_end();
3392 :
3393 0 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3394 : io_start, 1, r);
3395 :
3396 0 : XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
3397 0 : if (r < 0)
3398 : {
3399 0 : errno = save_errno;
3400 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3401 : (errcode_for_file_access(),
3402 : errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
3403 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3404 : readOff)));
3405 : }
3406 : else
3407 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3408 : (errcode(ERRCODE_DATA_CORRUPTED),
3409 : errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
3410 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3411 : readOff, r, (Size) XLOG_BLCKSZ)));
3412 0 : goto next_record_is_invalid;
3413 : }
3414 1506990 : pgstat_report_wait_end();
3415 :
3416 1506990 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3417 : io_start, 1, r);
3418 :
3419 : Assert(targetSegNo == readSegNo);
3420 : Assert(targetPageOff == readOff);
3421 : Assert(reqLen <= readLen);
3422 :
3423 1506990 : xlogreader->seg.ws_tli = curFileTLI;
3424 :
3425 : /*
3426 : * Check the page header immediately, so that we can retry immediately if
3427 : * it's not valid. This may seem unnecessary, because ReadPageInternal()
3428 : * validates the page header anyway, and would propagate the failure up to
3429 : * ReadRecord(), which would retry. However, there's a corner case with
3430 : * continuation records, if a record is split across two pages such that
3431 : * we would need to read the two pages from different sources across two
3432 : * WAL segments.
3433 : *
3434 : * The first page is only available locally, in pg_wal, because it's
3435 : * already been recycled on the primary. The second page, however, is not
3436 : * present in pg_wal, and we should stream it from the primary. There is a
3437 : * recycled WAL segment present in pg_wal, with garbage contents, however.
3438 : * We would read the first page from the local WAL segment, but when
3439 : * reading the second page, we would read the bogus, recycled, WAL
3440 : * segment. If we didn't catch that case here, we would never recover,
3441 : * because ReadRecord() would retry reading the whole record from the
3442 : * beginning.
3443 : *
3444 : * Of course, this only catches errors in the page header, which is what
3445 : * happens in the case of a recycled WAL segment. Other kinds of errors or
3446 : * corruption still has the same problem. But this at least fixes the
3447 : * common case, which can happen as part of normal operation.
3448 : *
3449 : * Validating the page header is cheap enough that doing it twice
3450 : * shouldn't be a big deal from a performance point of view.
3451 : *
3452 : * When not in standby mode, an invalid page header should cause recovery
3453 : * to end, not retry reading the page, so we don't need to validate the
3454 : * page header here for the retry. Instead, ReadPageInternal() is
3455 : * responsible for the validation.
3456 : */
3457 1506990 : if (StandbyMode &&
3458 1495240 : (targetPagePtr % wal_segment_size) == 0 &&
3459 1481 : !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3460 : {
3461 : /*
3462 : * Emit this error right now then retry this page immediately. Use
3463 : * errmsg_internal() because the message was already translated.
3464 : */
3465 4 : if (xlogreader->errormsg_buf[0])
3466 4 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3467 : (errmsg_internal("%s", xlogreader->errormsg_buf)));
3468 :
3469 : /* reset any error XLogReaderValidatePageHeader() might have set */
3470 4 : XLogReaderResetError(xlogreader);
3471 4 : goto next_record_is_invalid;
3472 : }
3473 :
3474 1506986 : return readLen;
3475 :
3476 4 : next_record_is_invalid:
3477 :
3478 : /*
3479 : * If we're reading ahead, give up fast. Retries and error reporting will
3480 : * be handled by a later read when recovery catches up to this point.
3481 : */
3482 4 : if (xlogreader->nonblocking)
3483 1 : return XLREAD_WOULDBLOCK;
3484 :
3485 3 : lastSourceFailed = true;
3486 :
3487 3 : if (readFile >= 0)
3488 3 : close(readFile);
3489 3 : readFile = -1;
3490 3 : readLen = 0;
3491 3 : readSource = XLOG_FROM_ANY;
3492 :
3493 : /* In standby-mode, keep trying */
3494 3 : if (StandbyMode)
3495 3 : goto retry;
3496 : else
3497 0 : return XLREAD_FAIL;
3498 : }
3499 :
3500 : /*
3501 : * Open the WAL segment containing WAL location 'RecPtr'.
3502 : *
3503 : * The segment can be fetched via restore_command, or via walreceiver having
3504 : * streamed the record, or it can already be present in pg_wal. Checking
3505 : * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3506 : * too, in case someone copies a new segment directly to pg_wal. That is not
3507 : * documented or recommended, though.
3508 : *
3509 : * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3510 : * prepare to read WAL starting from RedoStartLSN after this.
 *
 * If 'randAccess' is true, curFileTLI is reset to 0 each time we attempt to
 * read the segment from archive or pg_wal.
3511 : *
3512 : * 'RecPtr' might not point to the beginning of the record we're interested
3513 : * in, it might also point to the page or segment header. In that case,
3514 : * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3515 : * used to decide which timeline to stream the requested WAL from.
3516 : *
3517 : * 'replayLSN' is the current replay LSN, so that if we scan for new
3518 : * timelines, we can reject a switch to a timeline that branched off before
3519 : * this point.
 * 'replayTLI' is passed along to rescanLatestTimeLine() together with
 * 'replayLSN'.
3520 : *
3521 : * If the record is not immediately available, the function returns XLREAD_FAIL
3522 : * if we're not in standby mode. In standby mode, the function waits for it to
3523 : * become available.
3524 : *
3525 : * When the requested record becomes available, the function opens the file
3526 : * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3527 : * of standby mode is triggered by the user, and there is no more WAL
3528 : * available, returns XLREAD_FAIL.
3529 : *
3530 : * If nonblocking is true, then give up immediately if we can't satisfy the
3531 : * request, returning XLREAD_WOULDBLOCK instead of waiting.
3532 : */
3533 : static XLogPageReadResult
3534 20408 : WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3535 : bool fetching_ckpt, XLogRecPtr tliRecPtr,
3536 : TimeLineID replayTLI, XLogRecPtr replayLSN,
3537 : bool nonblocking)
3538 : {
/*
 * Persists across calls. Updated at the end of the XLOG_FROM_STREAM failure
 * branch below and used there to decide whether (and how long) to sleep
 * before retrying from the archive.
 */
3539 : static TimestampTz last_fail_time = 0;
3540 : TimestampTz now;
/*
 * Set once WalRcvRequestApplyReply() has been called during this invocation,
 * so the reply is requested at most once per call.
 */
3541 20408 : bool streaming_reply_sent = false;
3542 :
3543 : /*-------
3544 : * Standby mode is implemented by a state machine:
3545 : *
3546 : * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3547 : * pg_wal (XLOG_FROM_PG_WAL)
3548 : * 2. Check for promotion trigger request
3549 : * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3550 : * 4. Rescan timelines
3551 : * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3552 : *
3553 : * Failure to read from the current source advances the state machine to
3554 : * the next state.
3555 : *
3556 : * 'currentSource' indicates the current state. There are no currentSource
3557 : * values for "check trigger", "rescan timelines", and "sleep" states,
3558 : * those actions are taken when reading from the previous source fails, as
3559 : * part of advancing to the next state.
3560 : *
3561 : * If standby mode is turned off while reading WAL from stream, we move
3562 : * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3563 : * the files (which would be required at end of recovery, e.g., timeline
3564 : * history file) from archive or pg_wal. We don't need to kill WAL receiver
3565 : * here because it's already stopped when standby mode is turned off at
3566 : * the end of recovery.
3567 : *-------
3568 : */
3569 20408 : if (!InArchiveRecovery)
3570 1081 : currentSource = XLOG_FROM_PG_WAL;
3571 19327 : else if (currentSource == XLOG_FROM_ANY ||
3572 19196 : (!StandbyMode && currentSource == XLOG_FROM_STREAM))
3573 : {
3574 131 : lastSourceFailed = false;
3575 131 : currentSource = XLOG_FROM_ARCHIVE;
3576 : }
3577 :
3578 : for (;;)
3579 17890 : {
/*
 * Remember the source this iteration started with, so that a change can be
 * reported in the DEBUG2 "switched WAL source" message below.
 */
3580 38298 : XLogSource oldSource = currentSource;
/*
 * Set when this iteration decides that streaming should be (re)requested in
 * the XLOG_FROM_STREAM read branch below.
 */
3581 38298 : bool startWalReceiver = false;
3582 :
3583 : /*
3584 : * First check if we failed to read from the current source, and
3585 : * advance the state machine if so. The failure to read might've
3586 : * happened outside this function, e.g when a CRC check fails on a
3587 : * record, or within this loop.
3588 : */
3589 38298 : if (lastSourceFailed)
3590 : {
3591 : /*
3592 : * Don't allow any retry loops to occur during nonblocking
3593 : * readahead. Let the caller process everything that has been
3594 : * decoded already first.
3595 : */
3596 606 : if (nonblocking)
3597 88 : return XLREAD_WOULDBLOCK;
3598 :
3599 518 : switch (currentSource)
3600 : {
3601 308 : case XLOG_FROM_ARCHIVE:
3602 : case XLOG_FROM_PG_WAL:
3603 :
3604 : /*
3605 : * Check to see if promotion is requested. Note that we do
3606 : * this only after failure, so when you promote, we still
3607 : * finish replaying as much as we can from archive and
3608 : * pg_wal before failover.
3609 : */
3610 308 : if (StandbyMode && CheckForStandbyTrigger())
3611 : {
3612 22 : XLogShutdownWalRcv();
3613 22 : return XLREAD_FAIL;
3614 : }
3615 :
3616 : /*
3617 : * Not in standby mode, and we've now tried the archive
3618 : * and pg_wal.
3619 : */
3620 286 : if (!StandbyMode)
3621 30 : return XLREAD_FAIL;
3622 :
3623 : /*
3624 : * Move to XLOG_FROM_STREAM state, and set to start a
3625 : * walreceiver if necessary.
3626 : */
3627 256 : currentSource = XLOG_FROM_STREAM;
3628 256 : startWalReceiver = true;
3629 256 : break;
3630 :
3631 210 : case XLOG_FROM_STREAM:
3632 :
3633 : /*
3634 : * Failure while streaming. Most likely, we got here
3635 : * because streaming replication was terminated, or
3636 : * promotion was triggered. But we also get here if we
3637 : * find an invalid record in the WAL streamed from the
3638 : * primary, in which case something is seriously wrong.
3639 : * There's little chance that the problem will just go
3640 : * away, but PANIC is not good for availability either,
3641 : * especially in hot standby mode. So, we treat that the
3642 : * same as disconnection, and retry from archive/pg_wal
3643 : * again. The WAL in the archive should be identical to
3644 : * what was streamed, so it's unlikely that it helps, but
3645 : * one can hope...
3646 : */
3647 :
3648 : /*
3649 : * We should be able to move to XLOG_FROM_STREAM only in
3650 : * standby mode.
3651 : */
3652 : Assert(StandbyMode);
3653 :
3654 : /*
3655 : * Before we leave XLOG_FROM_STREAM state, make sure that
3656 : * walreceiver is not active, so that it won't overwrite
3657 : * WAL that we restore from archive.
3658 : *
3659 : * If walreceiver is actively streaming (or attempting to
3660 : * connect), we must shut it down. However, if it's
3661 : * already in WAITING state (e.g., due to timeline
3662 : * divergence), we only need to reset the install flag to
3663 : * allow archive restoration.
3664 : */
3665 210 : if (WalRcvStreaming())
3666 32 : XLogShutdownWalRcv();
3667 : else
3668 : {
3669 : /*
3670 : * WALRCV_STOPPING state is a transient state while
3671 : * the startup process is in ShutdownWalRcv(). It
3672 : * should never appear here since we would be waiting
3673 : * for the walreceiver to reach WALRCV_STOPPED in that
3674 : * case.
3675 : */
3676 : Assert(WalRcvGetState() != WALRCV_STOPPING);
3677 178 : ResetInstallXLogFileSegmentActive();
3678 : }
3679 :
3680 : /*
3681 : * Before we sleep, re-scan for possible new timelines if
3682 : * we were requested to recover to the latest timeline.
3683 : */
3684 210 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
3685 : {
3686 210 : if (rescanLatestTimeLine(replayTLI, replayLSN))
3687 : {
/*
 * A new target timeline was adopted: go straight back to the archive without
 * sleeping and without updating last_fail_time.
 */
3688 8 : currentSource = XLOG_FROM_ARCHIVE;
3689 8 : break;
3690 : }
3691 : }
3692 :
3693 : /*
3694 : * XLOG_FROM_STREAM is the last state in our state
3695 : * machine, so we've exhausted all the options for
3696 : * obtaining the requested WAL. We're going to loop back
3697 : * and retry from the archive, but if it hasn't been long
3698 : * since last attempt, sleep wal_retrieve_retry_interval
3699 : * milliseconds to avoid busy-waiting.
3700 : */
3701 201 : now = GetCurrentTimestamp();
3702 201 : if (!TimestampDifferenceExceeds(last_fail_time, now,
3703 : wal_retrieve_retry_interval))
3704 : {
3705 : long wait_time;
3706 :
/* Sleep only for the part of the retry interval that has not yet elapsed. */
3707 218 : wait_time = wal_retrieve_retry_interval -
3708 109 : TimestampDifferenceMilliseconds(last_fail_time, now);
3709 :
3710 109 : elog(LOG, "waiting for WAL to become available at %X/%08X",
3711 : LSN_FORMAT_ARGS(RecPtr));
3712 :
3713 : /* Do background tasks that might benefit us later. */
3714 109 : KnownAssignedTransactionIdsIdleMaintenance();
3715 :
3716 109 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3717 : WL_LATCH_SET | WL_TIMEOUT |
3718 : WL_EXIT_ON_PM_DEATH,
3719 : wait_time,
3720 : WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3721 109 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
/* Re-read the clock so last_fail_time reflects the time after the sleep. */
3722 109 : now = GetCurrentTimestamp();
3723 :
3724 : /* Handle interrupt signals of startup process */
3725 109 : ProcessStartupProcInterrupts();
3726 : }
3727 183 : last_fail_time = now;
3728 183 : currentSource = XLOG_FROM_ARCHIVE;
3729 183 : break;
3730 :
3731 0 : default:
3732 0 : elog(ERROR, "unexpected WAL source %d", currentSource);
3733 : }
3734 : }
3735 37692 : else if (currentSource == XLOG_FROM_PG_WAL)
3736 : {
3737 : /*
3738 : * We just successfully read a file in pg_wal. We prefer files in
3739 : * the archive over ones in pg_wal, so try the next file again
3740 : * from the archive first.
3741 : */
3742 1075 : if (InArchiveRecovery)
3743 0 : currentSource = XLOG_FROM_ARCHIVE;
3744 : }
3745 :
3746 38139 : if (currentSource != oldSource)
3747 447 : elog(DEBUG2, "switched WAL source from %s to %s after %s",
3748 : xlogSourceNames[oldSource], xlogSourceNames[currentSource],
3749 : lastSourceFailed ? "failure" : "success");
3750 :
3751 : /*
3752 : * We've now handled possible failure. Try to read from the chosen
3753 : * source.
3754 : */
3755 38139 : lastSourceFailed = false;
3756 :
3757 38139 : switch (currentSource)
3758 : {
3759 1969 : case XLOG_FROM_ARCHIVE:
3760 : case XLOG_FROM_PG_WAL:
3761 :
3762 : /*
3763 : * WAL receiver must not be running when reading WAL from
3764 : * archive or pg_wal.
3765 : */
3766 : Assert(!WalRcvStreaming());
3767 :
3768 : /* Close any old file we might have open. */
3769 1969 : if (readFile >= 0)
3770 : {
3771 97 : close(readFile);
3772 97 : readFile = -1;
3773 : }
3774 : /* Reset curFileTLI if random fetch. */
3775 1969 : if (randAccess)
3776 1251 : curFileTLI = 0;
3777 :
3778 : /*
3779 : * Try to restore the file from archive, or read an existing
3780 : * file from pg_wal.
3781 : */
/*
 * The XLOG_FROM_ARCHIVE state is passed down as XLOG_FROM_ANY, which makes
 * XLogFileReadAnyTLI() try the archive and then pg_wal for each candidate
 * timeline; XLOG_FROM_PG_WAL is passed through unchanged.
 */
3782 1969 : readFile = XLogFileReadAnyTLI(readSegNo,
3783 1969 : currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
3784 : currentSource);
3785 1969 : if (readFile >= 0)
3786 1762 : return XLREAD_SUCCESS; /* success! */
3787 :
3788 : /*
3789 : * Nope, not found in archive or pg_wal.
3790 : */
3791 207 : lastSourceFailed = true;
3792 207 : break;
3793 :
3794 36170 : case XLOG_FROM_STREAM:
3795 : {
3796 : bool havedata;
3797 :
3798 : /*
3799 : * We should be able to move to XLOG_FROM_STREAM only in
3800 : * standby mode.
3801 : */
3802 : Assert(StandbyMode);
3803 :
3804 : /*
3805 : * First, shutdown walreceiver if its restart has been
3806 : * requested -- but no point if we're already slated for
3807 : * starting it.
3808 : */
3809 36170 : if (pendingWalRcvRestart && !startWalReceiver)
3810 : {
3811 8 : XLogShutdownWalRcv();
3812 :
3813 : /*
3814 : * Re-scan for possible new timelines if we were
3815 : * requested to recover to the latest timeline.
3816 : */
3817 8 : if (recoveryTargetTimeLineGoal ==
3818 : RECOVERY_TARGET_TIMELINE_LATEST)
3819 8 : rescanLatestTimeLine(replayTLI, replayLSN);
3820 :
3821 8 : startWalReceiver = true;
3822 : }
/* The restart request, if any, has been consumed (or superseded) by now. */
3823 36170 : pendingWalRcvRestart = false;
3824 :
3825 : /*
3826 : * Launch walreceiver if needed.
3827 : *
3828 : * If fetching_ckpt is true, RecPtr points to the initial
3829 : * checkpoint location. In that case, we use RedoStartLSN
3830 : * as the streaming start position instead of RecPtr, so
3831 : * that when we later jump backwards to start redo at
3832 : * RedoStartLSN, we will have the logs streamed already.
3833 : */
3834 36170 : if (startWalReceiver &&
3835 264 : PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3836 : {
3837 : XLogRecPtr ptr;
3838 : TimeLineID tli;
3839 :
3840 205 : if (fetching_ckpt)
3841 : {
3842 0 : ptr = RedoStartLSN;
3843 0 : tli = RedoStartTLI;
3844 : }
3845 : else
3846 : {
3847 205 : ptr = RecPtr;
3848 :
3849 : /*
3850 : * Use the record begin position to determine the
3851 : * TLI, rather than the position we're reading.
3852 : */
3853 205 : tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3854 :
3855 205 : if (curFileTLI > 0 && tli < curFileTLI)
3856 0 : elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3857 : LSN_FORMAT_ARGS(tliRecPtr),
3858 : tli, curFileTLI);
3859 : }
3860 205 : curFileTLI = tli;
3861 205 : SetInstallXLogFileSegmentActive();
3862 205 : RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
3863 : PrimarySlotName,
3864 : wal_receiver_create_temp_slot);
/*
 * Forget the cached flush position: with flushedUpto invalid, the
 * "RecPtr < flushedUpto" test below fails and the position is fetched afresh
 * via GetWalRcvFlushRecPtr().
 */
3865 205 : flushedUpto = InvalidXLogRecPtr;
3866 : }
3867 :
3868 : /*
3869 : * Check if WAL receiver is active or wait to start up.
3870 : */
3871 36170 : if (!WalRcvStreaming())
3872 : {
3873 178 : lastSourceFailed = true;
3874 178 : break;
3875 : }
3876 :
3877 : /*
3878 : * Walreceiver is active, so see if new data has arrived.
3879 : *
3880 : * We only advance XLogReceiptTime when we obtain fresh
3881 : * WAL from walreceiver and observe that we had already
3882 : * processed everything before the most recent "chunk"
3883 : * that it flushed to disk. In steady state where we are
3884 : * keeping up with the incoming data, XLogReceiptTime will
3885 : * be updated on each cycle. When we are behind,
3886 : * XLogReceiptTime will not advance, so the grace time
3887 : * allotted to conflicting queries will decrease.
3888 : */
3889 35992 : if (RecPtr < flushedUpto)
3890 1791 : havedata = true;
3891 : else
3892 : {
3893 : XLogRecPtr latestChunkStart;
3894 :
3895 34201 : flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3896 34201 : if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3897 : {
3898 17513 : havedata = true;
3899 17513 : if (latestChunkStart <= RecPtr)
3900 : {
3901 13430 : XLogReceiptTime = GetCurrentTimestamp();
3902 13430 : SetCurrentChunkStartTime(XLogReceiptTime);
3903 : }
3904 : }
3905 : else
3906 16688 : havedata = false;
3907 : }
3908 35992 : if (havedata)
3909 : {
3910 : /*
3911 : * Great, streamed far enough. Open the file if it's
3912 : * not open already. Also read the timeline history
3913 : * file if we haven't initialized timeline history
3914 : * yet; it should be streamed over and present in
3915 : * pg_wal by now. Use XLOG_FROM_STREAM so that source
3916 : * info is set correctly and XLogReceiptTime isn't
3917 : * changed.
3918 : *
3919 : * NB: We must set readTimeLineHistory based on
3920 : * recoveryTargetTLI, not receiveTLI. Normally they'll
3921 : * be the same, but if recovery_target_timeline is
3922 : * 'latest' and archiving is configured, then it's
3923 : * possible that we managed to retrieve one or more
3924 : * new timeline history files from the archive,
3925 : * updating recoveryTargetTLI.
3926 : */
3927 19304 : if (readFile < 0)
3928 : {
3929 1208 : if (!expectedTLEs)
3930 0 : expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
3931 1208 : readFile = XLogFileRead(readSegNo, receiveTLI,
3932 : XLOG_FROM_STREAM, false);
3933 : Assert(readFile >= 0);
3934 : }
3935 : else
3936 : {
3937 : /* just make sure source info is correct... */
3938 18096 : readSource = XLOG_FROM_STREAM;
3939 18096 : XLogReceiptSource = XLOG_FROM_STREAM;
3940 18096 : return XLREAD_SUCCESS;
3941 : }
/*
 * The file was only just opened, so we do not return from here. We fall out
 * of the switch and loop around; if the data is still considered available on
 * the next iteration, the "readFile >= 0" branch above returns XLREAD_SUCCESS.
 */
3942 1208 : break;
3943 : }
3944 :
3945 : /* In nonblocking mode, return rather than sleeping. */
3946 16688 : if (nonblocking)
3947 347 : return XLREAD_WOULDBLOCK;
3948 :
3949 : /*
3950 : * Data not here yet. Check for trigger, then wait for
3951 : * walreceiver to wake us up when new WAL arrives.
3952 : */
3953 16341 : if (CheckForStandbyTrigger())
3954 : {
3955 : /*
3956 : * Note that we don't return XLREAD_FAIL immediately
3957 : * here. After being triggered, we still want to
3958 : * replay all the WAL that was already streamed. It's
3959 : * in pg_wal now, so we just treat this as a failure,
3960 : * and the state machine will move on to replay the
3961 : * streamed WAL from pg_wal, and then recheck the
3962 : * trigger and exit replay.
3963 : */
3964 32 : lastSourceFailed = true;
3965 32 : break;
3966 : }
3967 :
3968 : /*
3969 : * Since we have replayed everything we have received so
3970 : * far and are about to start waiting for more WAL, let's
3971 : * tell the upstream server our replay location now so
3972 : * that pg_stat_replication doesn't show stale
3973 : * information.
3974 : */
3975 16309 : if (!streaming_reply_sent)
3976 : {
3977 13767 : WalRcvRequestApplyReply();
3978 13767 : streaming_reply_sent = true;
3979 : }
3980 :
3981 : /* Do any background tasks that might benefit us later. */
3982 16309 : KnownAssignedTransactionIdsIdleMaintenance();
3983 :
3984 : /* Update pg_stat_recovery_prefetch before sleeping. */
3985 16309 : XLogPrefetcherComputeStats(xlogprefetcher);
3986 :
3987 : /*
3988 : * Wait for more WAL to arrive, when we will be woken
3989 : * immediately by the WAL receiver.
3990 : */
/* No WL_TIMEOUT is requested here, so the -1 timeout means wait until the latch is set. */
3991 16309 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3992 : WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
3993 : -1L,
3994 : WAIT_EVENT_RECOVERY_WAL_STREAM);
3995 16309 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3996 16309 : break;
3997 : }
3998 :
3999 0 : default:
4000 0 : elog(ERROR, "unexpected WAL source %d", currentSource);
4001 : }
4002 :
4003 : /*
4004 : * Check for recovery pause here so that we can confirm more quickly
4005 : * that a requested pause has actually taken effect.
4006 : */
4007 17934 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
4008 : RECOVERY_NOT_PAUSED)
4009 7 : recoveryPausesHere(false);
4010 :
4011 : /*
4012 : * This possibly-long loop needs to handle interrupts of startup
4013 : * process.
4014 : */
4015 17934 : ProcessStartupProcInterrupts();
4016 : }
4017 :
4018 : return XLREAD_FAIL; /* not reached */
4019 : }
4020 :
4021 :
4022 : /*
4023 : * Determine what log level should be used to report a corrupt WAL record
4024 : * in the current WAL page, previously read by XLogPageRead().
4025 : *
4026 : * 'emode' is the error mode that would be used to report a file-not-found
4027 : * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4028 : * we're retrying the exact same record that we've tried previously, only
4029 : * complain the first time to keep the noise down. However, we only do when
4030 : * reading from pg_wal, because we don't expect any invalid records in archive
4031 : * or in records streamed from the primary. Files in the archive should be complete,
4032 : * and we should never hit the end of WAL because we stop and wait for more WAL
4033 : * to arrive before replaying it.
4034 : *
4035 : * NOTE: This function remembers the RecPtr value it was last called with,
4036 : * to suppress repeated messages about the same record. Only call this when
4037 : * you are about to ereport(), or you might cause a later message to be
4038 : * erroneously suppressed.
4039 : */
4040 : static int
4041 300 : emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
4042 : {
4043 : static XLogRecPtr lastComplaint = InvalidXLogRecPtr;
4044 :
4045 300 : if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4046 : {
4047 296 : if (RecPtr == lastComplaint)
4048 85 : emode = DEBUG1;
4049 : else
4050 211 : lastComplaint = RecPtr;
4051 : }
4052 300 : return emode;
4053 : }
4054 :
4055 :
4056 : /*
4057 : * Subroutine to try to fetch and validate a prior checkpoint record.
4058 : */
4059 : static XLogRecord *
4060 1088 : ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
4061 : TimeLineID replayTLI)
4062 : {
4063 : XLogRecord *record;
4064 : uint8 info;
4065 :
4066 : Assert(xlogreader != NULL);
4067 :
4068 1088 : if (!XRecOffIsValid(RecPtr))
4069 : {
4070 0 : ereport(LOG,
4071 : (errmsg("invalid checkpoint location")));
4072 0 : return NULL;
4073 : }
4074 :
4075 1088 : XLogPrefetcherBeginRead(xlogprefetcher, RecPtr);
4076 1088 : record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4077 :
4078 1088 : if (record == NULL)
4079 : {
4080 1 : ereport(LOG,
4081 : (errmsg("invalid checkpoint record")));
4082 1 : return NULL;
4083 : }
4084 1087 : if (record->xl_rmid != RM_XLOG_ID)
4085 : {
4086 0 : ereport(LOG,
4087 : (errmsg("invalid resource manager ID in checkpoint record")));
4088 0 : return NULL;
4089 : }
4090 1087 : info = record->xl_info & ~XLR_INFO_MASK;
4091 1087 : if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4092 : info != XLOG_CHECKPOINT_ONLINE)
4093 : {
4094 0 : ereport(LOG,
4095 : (errmsg("invalid xl_info in checkpoint record")));
4096 0 : return NULL;
4097 : }
4098 1087 : if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
4099 : {
4100 0 : ereport(LOG,
4101 : (errmsg("invalid length of checkpoint record")));
4102 0 : return NULL;
4103 : }
4104 1087 : return record;
4105 : }
4106 :
4107 : /*
4108 : * Scan for new timelines that might have appeared in the archive since we
4109 : * started recovery.
4110 : *
4111 : * If there are any, the function changes recovery target TLI to the latest
4112 : * one and returns 'true'.
4113 : */
4114 : static bool
4115 218 : rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
4116 : {
4117 : List *newExpectedTLEs;
4118 : bool found;
4119 : ListCell *cell;
4120 : TimeLineID newtarget;
4121 218 : TimeLineID oldtarget = recoveryTargetTLI;
4122 218 : TimeLineHistoryEntry *currentTle = NULL;
4123 :
4124 218 : newtarget = findNewestTimeLine(recoveryTargetTLI);
4125 217 : if (newtarget == recoveryTargetTLI)
4126 : {
4127 : /* No new timelines found */
4128 209 : return false;
4129 : }
4130 :
4131 : /*
4132 : * Determine the list of expected TLIs for the new TLI
4133 : */
4134 :
4135 8 : newExpectedTLEs = readTimeLineHistory(newtarget);
4136 :
4137 : /*
4138 : * If the current timeline is not part of the history of the new timeline,
4139 : * we cannot proceed to it.
4140 : */
4141 8 : found = false;
4142 16 : foreach(cell, newExpectedTLEs)
4143 : {
4144 16 : currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4145 :
4146 16 : if (currentTle->tli == recoveryTargetTLI)
4147 : {
4148 8 : found = true;
4149 8 : break;
4150 : }
4151 : }
4152 8 : if (!found)
4153 : {
4154 0 : ereport(LOG,
4155 : (errmsg("new timeline %u is not a child of database system timeline %u",
4156 : newtarget,
4157 : replayTLI)));
4158 0 : return false;
4159 : }
4160 :
4161 : /*
4162 : * The current timeline was found in the history file, but check that the
4163 : * next timeline was forked off from it *after* the current recovery
4164 : * location.
4165 : */
4166 8 : if (currentTle->end < replayLSN)
4167 : {
4168 0 : ereport(LOG,
4169 : errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
4170 : newtarget,
4171 : replayTLI,
4172 : LSN_FORMAT_ARGS(replayLSN)));
4173 0 : return false;
4174 : }
4175 :
4176 : /* The new timeline history seems valid. Switch target */
4177 8 : recoveryTargetTLI = newtarget;
4178 8 : list_free_deep(expectedTLEs);
4179 8 : expectedTLEs = newExpectedTLEs;
4180 :
4181 : /*
4182 : * As in StartupXLOG(), try to ensure we have all the history files
4183 : * between the old target and new target in pg_wal.
4184 : */
4185 8 : restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4186 :
4187 8 : ereport(LOG,
4188 : (errmsg("new target timeline is %u",
4189 : recoveryTargetTLI)));
4190 :
4191 8 : return true;
4192 : }
4193 :
4194 :
4195 : /*
4196 : * Open a logfile segment for reading (during recovery).
4197 : *
4198 : * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4199 : * Otherwise, it's assumed to be already available in pg_wal.
4200 : */
4201 : static int
4202 3729 : XLogFileRead(XLogSegNo segno, TimeLineID tli,
4203 : XLogSource source, bool notfoundOk)
4204 : {
4205 : char xlogfname[MAXFNAMELEN];
4206 : char activitymsg[MAXFNAMELEN + 16];
4207 : char path[MAXPGPATH];
4208 : int fd;
4209 :
4210 3729 : XLogFileName(xlogfname, tli, segno, wal_segment_size);
4211 :
4212 3729 : switch (source)
4213 : {
4214 906 : case XLOG_FROM_ARCHIVE:
4215 : /* Report recovery progress in PS display */
4216 906 : snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4217 : xlogfname);
4218 906 : set_ps_display(activitymsg);
4219 :
4220 906 : if (!RestoreArchivedFile(path, xlogfname,
4221 : "RECOVERYXLOG",
4222 : wal_segment_size,
4223 : InRedo))
4224 540 : return -1;
4225 366 : break;
4226 :
4227 2823 : case XLOG_FROM_PG_WAL:
4228 : case XLOG_FROM_STREAM:
4229 2823 : XLogFilePath(path, tli, segno, wal_segment_size);
4230 2823 : break;
4231 :
4232 0 : default:
4233 0 : elog(ERROR, "invalid XLogFileRead source %d", source);
4234 : }
4235 :
4236 : /*
4237 : * If the segment was fetched from archival storage, replace the existing
4238 : * xlog segment (if any) with the archival version.
4239 : */
4240 3189 : if (source == XLOG_FROM_ARCHIVE)
4241 : {
4242 : Assert(!IsInstallXLogFileSegmentActive());
4243 366 : KeepFileRestoredFromArchive(path, xlogfname);
4244 :
4245 : /*
4246 : * Set path to point at the new file in pg_wal.
4247 : */
4248 366 : snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4249 : }
4250 :
4251 3189 : fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4252 3189 : if (fd >= 0)
4253 : {
4254 : /* Success! */
4255 2970 : curFileTLI = tli;
4256 :
4257 : /* Report recovery progress in PS display */
4258 2970 : snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4259 : xlogfname);
4260 2970 : set_ps_display(activitymsg);
4261 :
4262 : /* Track source of data in assorted state variables */
4263 2970 : readSource = source;
4264 2970 : XLogReceiptSource = source;
4265 : /* In FROM_STREAM case, caller tracks receipt time, not me */
4266 2970 : if (source != XLOG_FROM_STREAM)
4267 1762 : XLogReceiptTime = GetCurrentTimestamp();
4268 :
4269 2970 : return fd;
4270 : }
4271 219 : if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4272 0 : ereport(PANIC,
4273 : (errcode_for_file_access(),
4274 : errmsg("could not open file \"%s\": %m", path)));
4275 219 : return -1;
4276 : }
4277 :
4278 : /*
4279 : * Open a logfile segment for reading (during recovery).
4280 : *
4281 : * This version searches for the segment with any TLI listed in expectedTLEs.
4282 : */
4283 : static int
4284 1969 : XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
4285 : {
4286 : char path[MAXPGPATH];
4287 : ListCell *cell;
4288 : int fd;
4289 : List *tles;
4290 :
4291 : /*
4292 : * Loop looking for a suitable timeline ID: we might need to read any of
4293 : * the timelines listed in expectedTLEs.
4294 : *
4295 : * We expect curFileTLI on entry to be the TLI of the preceding file in
4296 : * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4297 : * to go backwards; this prevents us from picking up the wrong file when a
4298 : * parent timeline extends to higher segment numbers than the child we
4299 : * want to read.
4300 : *
4301 : * If we haven't read the timeline history file yet, read it now, so that
4302 : * we know which TLIs to scan. We don't save the list in expectedTLEs,
4303 : * however, unless we actually find a valid segment. That way if there is
4304 : * neither a timeline history file nor a WAL segment in the archive, and
4305 : * streaming replication is set up, we'll read the timeline history file
4306 : * streamed from the primary when we start streaming, instead of
4307 : * recovering with a dummy history generated here.
4308 : */
4309 1969 : if (expectedTLEs)
4310 881 : tles = expectedTLEs;
4311 : else
4312 1088 : tles = readTimeLineHistory(recoveryTargetTLI);
4313 :
4314 2195 : foreach(cell, tles)
4315 : {
4316 1995 : TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
4317 1995 : TimeLineID tli = hent->tli;
4318 :
4319 1995 : if (tli < curFileTLI)
4320 7 : break; /* don't bother looking at too-old TLIs */
4321 :
4322 : /*
4323 : * Skip scanning the timeline ID that the logfile segment to read
4324 : * doesn't belong to
4325 : */
4326 1988 : if (XLogRecPtrIsValid(hent->begin))
4327 : {
4328 82 : XLogSegNo beginseg = 0;
4329 :
4330 82 : XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4331 :
4332 : /*
4333 : * The logfile segment that doesn't belong to the timeline is
4334 : * older or newer than the segment that the timeline started or
4335 : * ended at, respectively. It's sufficient to check only the
4336 : * starting segment of the timeline here. Since the timelines are
4337 : * scanned in descending order in this loop, any segments newer
4338 : * than the ending segment should belong to newer timeline and
4339 : * have already been read before. So it's not necessary to check
4340 : * the ending segment of the timeline here.
4341 : */
4342 82 : if (segno < beginseg)
4343 7 : continue;
4344 : }
4345 :
4346 1981 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
4347 : {
4348 906 : fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4349 906 : if (fd != -1)
4350 : {
4351 366 : elog(DEBUG1, "got WAL segment from archive");
4352 366 : if (!expectedTLEs)
4353 19 : expectedTLEs = tles;
4354 1762 : return fd;
4355 : }
4356 : }
4357 :
4358 1615 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
4359 : {
4360 1615 : fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4361 1615 : if (fd != -1)
4362 : {
4363 1396 : if (!expectedTLEs)
4364 1068 : expectedTLEs = tles;
4365 1396 : return fd;
4366 : }
4367 : }
4368 : }
4369 :
4370 : /* Couldn't find it. For simplicity, complain about front timeline */
4371 207 : XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
4372 207 : errno = ENOENT;
4373 207 : ereport(DEBUG2,
4374 : (errcode_for_file_access(),
4375 : errmsg("could not open file \"%s\": %m", path)));
4376 207 : return -1;
4377 : }
4378 :
4379 : /*
4380 : * Set flag to signal the walreceiver to restart. (The startup process calls
4381 : * this on noticing a relevant configuration change.)
4382 : */
4383 : void
4384 13 : StartupRequestWalReceiverRestart(void)
4385 : {
4386 13 : if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
4387 : {
4388 8 : ereport(LOG,
4389 : (errmsg("WAL receiver process shutdown requested")));
4390 :
4391 8 : pendingWalRcvRestart = true;
4392 : }
4393 13 : }
4394 :
4395 :
4396 : /*
4397 : * Has a standby promotion already been triggered?
4398 : *
4399 : * Unlike CheckForStandbyTrigger(), this works in any process
4400 : * that's connected to shared memory.
4401 : */
4402 : bool
4403 82 : PromoteIsTriggered(void)
4404 : {
4405 : /*
4406 : * We check shared state each time only until a standby promotion is
4407 : * triggered. We can't trigger a promotion again, so there's no need to
4408 : * keep checking after the shared variable has once been seen true.
4409 : */
4410 82 : if (LocalPromoteIsTriggered)
4411 53 : return true;
4412 :
4413 29 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4414 29 : LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
4415 29 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4416 :
4417 29 : return LocalPromoteIsTriggered;
4418 : }
4419 :
4420 : static void
4421 50 : SetPromoteIsTriggered(void)
4422 : {
4423 50 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4424 50 : XLogRecoveryCtl->SharedPromoteIsTriggered = true;
4425 50 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4426 :
4427 : /*
4428 : * Mark the recovery pause state as 'not paused' because the paused state
4429 : * ends and promotion continues if a promotion is triggered while recovery
4430 : * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4431 : * return 'paused' while a promotion is ongoing.
4432 : */
4433 50 : SetRecoveryPause(false);
4434 :
4435 50 : LocalPromoteIsTriggered = true;
4436 50 : }
4437 :
4438 : /*
4439 : * Check whether a promote request has arrived.
4440 : */
4441 : static bool
4442 16897 : CheckForStandbyTrigger(void)
4443 : {
4444 16897 : if (LocalPromoteIsTriggered)
4445 55 : return true;
4446 :
4447 16842 : if (IsPromoteSignaled() && CheckPromoteSignal())
4448 : {
4449 50 : ereport(LOG, (errmsg("received promote request")));
4450 50 : RemovePromoteSignalFiles();
4451 50 : ResetPromoteSignaled();
4452 50 : SetPromoteIsTriggered();
4453 50 : return true;
4454 : }
4455 :
4456 16792 : return false;
4457 : }
4458 :
4459 : /*
4460 : * Remove the files signaling a standby promotion request.
4461 : */
4462 : void
4463 1043 : RemovePromoteSignalFiles(void)
4464 : {
4465 1043 : unlink(PROMOTE_SIGNAL_FILE);
4466 1043 : }
4467 :
4468 : /*
4469 : * Check to see if a promote request has arrived.
4470 : */
4471 : bool
4472 731 : CheckPromoteSignal(void)
4473 : {
4474 : struct stat stat_buf;
4475 :
4476 731 : if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4477 100 : return true;
4478 :
4479 631 : return false;
4480 : }
4481 :
4482 : /*
4483 : * Wake up startup process to replay newly arrived WAL, or to notice that
4484 : * failover has been requested.
4485 : */
4486 : void
4487 42844 : WakeupRecovery(void)
4488 : {
4489 42844 : SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4490 42844 : }
4491 :
4492 : /*
4493 : * Schedule a walreceiver wakeup in the main recovery loop.
4494 : */
4495 : void
4496 2 : XLogRequestWalReceiverReply(void)
4497 : {
4498 2 : doRequestWalReceiverReply = true;
4499 2 : }
4500 :
4501 : /*
4502 : * Is HotStandby active yet? This is only important in special backends
4503 : * since normal backends won't ever be able to connect until this returns
4504 : * true. Postmaster knows this by way of signal, not via shared memory.
4505 : *
4506 : * Unlike testing standbyState, this works in any process that's connected to
4507 : * shared memory. (And note that standbyState alone doesn't tell the truth
4508 : * anyway.)
4509 : */
4510 : bool
4511 175 : HotStandbyActive(void)
4512 : {
4513 : /*
4514 : * We check shared state each time only until Hot Standby is active. We
4515 : * can't de-activate Hot Standby, so there's no need to keep checking
4516 : * after the shared variable has once been seen true.
4517 : */
4518 175 : if (LocalHotStandbyActive)
4519 25 : return true;
4520 : else
4521 : {
4522 : /* spinlock is essential on machines with weak memory ordering! */
4523 150 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4524 150 : LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
4525 150 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4526 :
4527 150 : return LocalHotStandbyActive;
4528 : }
4529 : }
4530 :
4531 : /*
4532 : * Like HotStandbyActive(), but to be used only in WAL replay code,
4533 : * where we don't need to ask any other process what the state is.
4534 : */
4535 : static bool
4536 0 : HotStandbyActiveInReplay(void)
4537 : {
4538 : Assert(AmStartupProcess() || !IsPostmasterEnvironment);
4539 0 : return LocalHotStandbyActive;
4540 : }
4541 :
4542 : /*
4543 : * Get latest redo apply position.
4544 : *
4545 : * Exported to allow WALReceiver to read the pointer directly.
4546 : */
4547 : XLogRecPtr
4548 110745 : GetXLogReplayRecPtr(TimeLineID *replayTLI)
4549 : {
4550 : XLogRecPtr recptr;
4551 : TimeLineID tli;
4552 :
4553 110745 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4554 110745 : recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
4555 110745 : tli = XLogRecoveryCtl->lastReplayedTLI;
4556 110745 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4557 :
4558 110745 : if (replayTLI)
4559 3199 : *replayTLI = tli;
4560 110745 : return recptr;
4561 : }
4562 :
4563 :
4564 : /*
4565 : * Get position of last applied, or the record being applied.
4566 : *
4567 : * This is different from GetXLogReplayRecPtr() in that if a WAL
4568 : * record is currently being applied, this includes that record.
4569 : */
4570 : XLogRecPtr
4571 6668 : GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
4572 : {
4573 : XLogRecPtr recptr;
4574 : TimeLineID tli;
4575 :
4576 6668 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4577 6668 : recptr = XLogRecoveryCtl->replayEndRecPtr;
4578 6668 : tli = XLogRecoveryCtl->replayEndTLI;
4579 6668 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4580 :
4581 6668 : if (replayEndTLI)
4582 6668 : *replayEndTLI = tli;
4583 6668 : return recptr;
4584 : }
4585 :
4586 : /*
4587 : * Save timestamp of latest processed commit/abort record.
4588 : *
4589 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4590 : * seen by processes other than the startup process. Note in particular
4591 : * that CreateRestartPoint is executed in the checkpointer.
4592 : */
4593 : static void
4594 23630 : SetLatestXTime(TimestampTz xtime)
4595 : {
4596 23630 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4597 23630 : XLogRecoveryCtl->recoveryLastXTime = xtime;
4598 23630 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4599 23630 : }
4600 :
4601 : /*
4602 : * Fetch timestamp of latest processed commit/abort record.
4603 : */
4604 : TimestampTz
4605 372 : GetLatestXTime(void)
4606 : {
4607 : TimestampTz xtime;
4608 :
4609 372 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4610 372 : xtime = XLogRecoveryCtl->recoveryLastXTime;
4611 372 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4612 :
4613 372 : return xtime;
4614 : }
4615 :
4616 : /*
4617 : * Save timestamp of the next chunk of WAL records to apply.
4618 : *
4619 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4620 : * seen by all backends.
4621 : */
4622 : static void
4623 13430 : SetCurrentChunkStartTime(TimestampTz xtime)
4624 : {
4625 13430 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4626 13430 : XLogRecoveryCtl->currentChunkStartTime = xtime;
4627 13430 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4628 13430 : }
4629 :
4630 : /*
4631 : * Fetch timestamp of latest processed commit/abort record.
4632 : * Startup process maintains an accurate local copy in XLogReceiptTime
4633 : */
4634 : TimestampTz
4635 288 : GetCurrentChunkReplayStartTime(void)
4636 : {
4637 : TimestampTz xtime;
4638 :
4639 288 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4640 288 : xtime = XLogRecoveryCtl->currentChunkStartTime;
4641 288 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4642 :
4643 288 : return xtime;
4644 : }
4645 :
4646 : /*
4647 : * Returns time of receipt of current chunk of XLOG data, as well as
4648 : * whether it was received from streaming replication or from archives.
4649 : */
4650 : void
4651 31 : GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4652 : {
4653 : /*
4654 : * This must be executed in the startup process, since we don't export the
4655 : * relevant state to shared memory.
4656 : */
4657 : Assert(InRecovery);
4658 :
4659 31 : *rtime = XLogReceiptTime;
4660 31 : *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4661 31 : }
4662 :
4663 : /*
4664 : * Note that text field supplied is a parameter name and does not require
4665 : * translation
4666 : */
4667 : void
4668 715 : RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4669 : {
4670 715 : if (currValue < minValue)
4671 : {
4672 0 : if (HotStandbyActiveInReplay())
4673 : {
4674 0 : bool warned_for_promote = false;
4675 :
4676 0 : ereport(WARNING,
4677 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4678 : errmsg("hot standby is not possible because of insufficient parameter settings"),
4679 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4680 : param_name,
4681 : currValue,
4682 : minValue)));
4683 :
4684 0 : SetRecoveryPause(true);
4685 :
4686 0 : ereport(LOG,
4687 : (errmsg("recovery has paused"),
4688 : errdetail("If recovery is unpaused, the server will shut down."),
4689 : errhint("You can then restart the server after making the necessary configuration changes.")));
4690 :
4691 0 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
4692 : {
4693 0 : ProcessStartupProcInterrupts();
4694 :
4695 0 : if (CheckForStandbyTrigger())
4696 : {
4697 0 : if (!warned_for_promote)
4698 0 : ereport(WARNING,
4699 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4700 : errmsg("promotion is not possible because of insufficient parameter settings"),
4701 :
4702 : /*
4703 : * Repeat the detail from above so it's easy to find
4704 : * in the log.
4705 : */
4706 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4707 : param_name,
4708 : currValue,
4709 : minValue),
4710 : errhint("Restart the server after making the necessary configuration changes.")));
4711 0 : warned_for_promote = true;
4712 : }
4713 :
4714 : /*
4715 : * If recovery pause is requested then set it paused. While
4716 : * we are in the loop, user might resume and pause again so
4717 : * set this every time.
4718 : */
4719 0 : ConfirmRecoveryPaused();
4720 :
4721 : /*
4722 : * We wait on a condition variable that will wake us as soon
4723 : * as the pause ends, but we use a timeout so we can check the
4724 : * above conditions periodically too.
4725 : */
4726 0 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
4727 : WAIT_EVENT_RECOVERY_PAUSE);
4728 : }
4729 0 : ConditionVariableCancelSleep();
4730 : }
4731 :
4732 0 : ereport(FATAL,
4733 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4734 : errmsg("recovery aborted because of insufficient parameter settings"),
4735 : /* Repeat the detail from above so it's easy to find in the log. */
4736 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4737 : param_name,
4738 : currValue,
4739 : minValue),
4740 : errhint("You can restart the server after making the necessary configuration changes.")));
4741 : }
4742 715 : }
4743 :
4744 :
4745 : /*
4746 : * GUC check_hook for primary_slot_name
4747 : */
4748 : bool
4749 1476 : check_primary_slot_name(char **newval, void **extra, GucSource source)
4750 : {
4751 : int err_code;
4752 1476 : char *err_msg = NULL;
4753 1476 : char *err_hint = NULL;
4754 :
4755 1476 : if (*newval && strcmp(*newval, "") != 0 &&
4756 186 : !ReplicationSlotValidateNameInternal(*newval, false, &err_code,
4757 : &err_msg, &err_hint))
4758 : {
4759 0 : GUC_check_errcode(err_code);
4760 0 : GUC_check_errdetail("%s", err_msg);
4761 0 : if (err_hint != NULL)
4762 0 : GUC_check_errhint("%s", err_hint);
4763 0 : return false;
4764 : }
4765 :
4766 1476 : return true;
4767 : }
4768 :
4769 : /*
4770 : * Recovery target settings: Only one of the several recovery_target* settings
4771 : * may be set. Setting a second one results in an error. The global variable
4772 : * recoveryTarget tracks which kind of recovery target was chosen. Other
4773 : * variables store the actual target value (for example a string or a xid).
4774 : * The assign functions of the parameters check whether a competing parameter
4775 : * was already set. But we want to allow setting the same parameter multiple
4776 : * times. We also want to allow unsetting a parameter and setting a different
4777 : * one, so we unset recoveryTarget when the parameter is set to an empty
4778 : * string.
4779 : *
4780 : * XXX this code is broken by design. Throwing an error from a GUC assign
4781 : * hook breaks fundamental assumptions of guc.c. So long as all the variables
4782 : * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4783 : * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4784 : * that we have odd behaviors such as unexpected GUC ordering dependencies.
4785 : */
4786 :
4787 : pg_noreturn static void
4788 1 : error_multiple_recovery_targets(void)
4789 : {
4790 1 : ereport(ERROR,
4791 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4792 : errmsg("multiple recovery targets specified"),
4793 : errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4794 : }
4795 :
4796 : /*
4797 : * GUC check_hook for recovery_target
4798 : */
4799 : bool
4800 1291 : check_recovery_target(char **newval, void **extra, GucSource source)
4801 : {
4802 1291 : if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4803 : {
4804 0 : GUC_check_errdetail("The only allowed value is \"immediate\".");
4805 0 : return false;
4806 : }
4807 1291 : return true;
4808 : }
4809 :
4810 : /*
4811 : * GUC assign_hook for recovery_target
4812 : */
4813 : void
4814 1291 : assign_recovery_target(const char *newval, void *extra)
4815 : {
4816 1291 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4817 0 : recoveryTarget != RECOVERY_TARGET_IMMEDIATE)
4818 0 : error_multiple_recovery_targets();
4819 :
4820 1291 : if (newval && strcmp(newval, "") != 0)
4821 1 : recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
4822 : else
4823 1290 : recoveryTarget = RECOVERY_TARGET_UNSET;
4824 1291 : }
4825 :
4826 : /*
4827 : * GUC check_hook for recovery_target_lsn
4828 : */
4829 : bool
4830 1297 : check_recovery_target_lsn(char **newval, void **extra, GucSource source)
4831 : {
4832 1297 : if (strcmp(*newval, "") != 0)
4833 : {
4834 : XLogRecPtr lsn;
4835 : XLogRecPtr *myextra;
4836 8 : ErrorSaveContext escontext = {T_ErrorSaveContext};
4837 :
4838 8 : lsn = pg_lsn_in_safe(*newval, (Node *) &escontext);
4839 8 : if (escontext.error_occurred)
4840 0 : return false;
4841 :
4842 8 : myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
4843 8 : if (!myextra)
4844 0 : return false;
4845 8 : *myextra = lsn;
4846 8 : *extra = myextra;
4847 : }
4848 1297 : return true;
4849 : }
4850 :
4851 : /*
4852 : * GUC assign_hook for recovery_target_lsn
4853 : */
4854 : void
4855 1297 : assign_recovery_target_lsn(const char *newval, void *extra)
4856 : {
4857 1297 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4858 0 : recoveryTarget != RECOVERY_TARGET_LSN)
4859 0 : error_multiple_recovery_targets();
4860 :
4861 1297 : if (newval && strcmp(newval, "") != 0)
4862 : {
4863 8 : recoveryTarget = RECOVERY_TARGET_LSN;
4864 8 : recoveryTargetLSN = *((XLogRecPtr *) extra);
4865 : }
4866 : else
4867 1289 : recoveryTarget = RECOVERY_TARGET_UNSET;
4868 1297 : }
4869 :
4870 : /*
4871 : * GUC check_hook for recovery_target_name
4872 : */
4873 : bool
4874 1297 : check_recovery_target_name(char **newval, void **extra, GucSource source)
4875 : {
4876 : /* Use the value of newval directly */
4877 1297 : if (strlen(*newval) >= MAXFNAMELEN)
4878 : {
4879 0 : GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4880 : "recovery_target_name", MAXFNAMELEN - 1);
4881 0 : return false;
4882 : }
4883 1297 : return true;
4884 : }
4885 :
4886 : /*
4887 : * GUC assign_hook for recovery_target_name
4888 : */
4889 : void
4890 1297 : assign_recovery_target_name(const char *newval, void *extra)
4891 : {
4892 1297 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4893 0 : recoveryTarget != RECOVERY_TARGET_NAME)
4894 0 : error_multiple_recovery_targets();
4895 :
4896 1297 : if (newval && strcmp(newval, "") != 0)
4897 : {
4898 6 : recoveryTarget = RECOVERY_TARGET_NAME;
4899 6 : recoveryTargetName = newval;
4900 : }
4901 : else
4902 1291 : recoveryTarget = RECOVERY_TARGET_UNSET;
4903 1297 : }
4904 :
4905 : /*
4906 : * GUC check_hook for recovery_target_time
4907 : *
4908 : * The interpretation of the recovery_target_time string can depend on the
4909 : * time zone setting, so we need to wait until after all GUC processing is
4910 : * done before we can do the final parsing of the string. This check function
4911 : * only does a parsing pass to catch syntax errors, but we store the string
4912 : * and parse it again when we need to use it.
4913 : */
4914 : bool
4915 1293 : check_recovery_target_time(char **newval, void **extra, GucSource source)
4916 : {
4917 1293 : if (strcmp(*newval, "") != 0)
4918 : {
4919 : /* reject some special values */
4920 3 : if (strcmp(*newval, "now") == 0 ||
4921 3 : strcmp(*newval, "today") == 0 ||
4922 3 : strcmp(*newval, "tomorrow") == 0 ||
4923 3 : strcmp(*newval, "yesterday") == 0)
4924 : {
4925 0 : return false;
4926 : }
4927 :
4928 : /*
4929 : * parse timestamp value (see also timestamptz_in())
4930 : */
4931 : {
4932 3 : char *str = *newval;
4933 : fsec_t fsec;
4934 : struct pg_tm tt,
4935 3 : *tm = &tt;
4936 : int tz;
4937 : int dtype;
4938 : int nf;
4939 : int dterr;
4940 : char *field[MAXDATEFIELDS];
4941 : int ftype[MAXDATEFIELDS];
4942 : char workbuf[MAXDATELEN + MAXDATEFIELDS];
4943 : DateTimeErrorExtra dtextra;
4944 : TimestampTz timestamp;
4945 :
4946 3 : dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4947 : field, ftype, MAXDATEFIELDS, &nf);
4948 3 : if (dterr == 0)
4949 3 : dterr = DecodeDateTime(field, ftype, nf,
4950 : &dtype, tm, &fsec, &tz, &dtextra);
4951 3 : if (dterr != 0)
4952 0 : return false;
4953 3 : if (dtype != DTK_DATE)
4954 0 : return false;
4955 :
4956 3 : if (tm2timestamp(tm, fsec, &tz, ×tamp) != 0)
4957 : {
4958 0 : GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
4959 0 : return false;
4960 : }
4961 : }
4962 : }
4963 1293 : return true;
4964 : }
4965 :
4966 : /*
4967 : * GUC assign_hook for recovery_target_time
4968 : */
4969 : void
4970 1293 : assign_recovery_target_time(const char *newval, void *extra)
4971 : {
4972 1293 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4973 1 : recoveryTarget != RECOVERY_TARGET_TIME)
4974 1 : error_multiple_recovery_targets();
4975 :
4976 1292 : if (newval && strcmp(newval, "") != 0)
4977 2 : recoveryTarget = RECOVERY_TARGET_TIME;
4978 : else
4979 1290 : recoveryTarget = RECOVERY_TARGET_UNSET;
4980 1292 : }
4981 :
4982 : /*
4983 : * GUC check_hook for recovery_target_timeline
4984 : */
4985 : bool
4986 1294 : check_recovery_target_timeline(char **newval, void **extra, GucSource source)
4987 : {
4988 : RecoveryTargetTimeLineGoal rttg;
4989 : RecoveryTargetTimeLineGoal *myextra;
4990 :
4991 1294 : if (strcmp(*newval, "current") == 0)
4992 0 : rttg = RECOVERY_TARGET_TIMELINE_CONTROLFILE;
4993 1294 : else if (strcmp(*newval, "latest") == 0)
4994 1291 : rttg = RECOVERY_TARGET_TIMELINE_LATEST;
4995 : else
4996 : {
4997 : char *endp;
4998 : uint64 timeline;
4999 :
5000 3 : rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
5001 :
5002 3 : errno = 0;
5003 3 : timeline = strtou64(*newval, &endp, 0);
5004 :
5005 3 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
5006 : {
5007 1 : GUC_check_errdetail("\"%s\" is not a valid number.",
5008 : "recovery_target_timeline");
5009 3 : return false;
5010 : }
5011 :
5012 2 : if (timeline < 1 || timeline > PG_UINT32_MAX)
5013 : {
5014 2 : GUC_check_errdetail("\"%s\" must be between %u and %u.",
5015 : "recovery_target_timeline", 1, PG_UINT32_MAX);
5016 2 : return false;
5017 : }
5018 : }
5019 :
5020 1291 : myextra = (RecoveryTargetTimeLineGoal *) guc_malloc(LOG, sizeof(RecoveryTargetTimeLineGoal));
5021 1291 : if (!myextra)
5022 0 : return false;
5023 1291 : *myextra = rttg;
5024 1291 : *extra = myextra;
5025 :
5026 1291 : return true;
5027 : }
5028 :
5029 : /*
5030 : * GUC assign_hook for recovery_target_timeline
5031 : */
5032 : void
5033 1291 : assign_recovery_target_timeline(const char *newval, void *extra)
5034 : {
5035 1291 : recoveryTargetTimeLineGoal = *((RecoveryTargetTimeLineGoal *) extra);
5036 1291 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
5037 0 : recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5038 : else
5039 1291 : recoveryTargetTLIRequested = 0;
5040 1291 : }
5041 :
5042 : /*
5043 : * GUC check_hook for recovery_target_xid
5044 : */
5045 : bool
5046 1293 : check_recovery_target_xid(char **newval, void **extra, GucSource source)
5047 : {
5048 1293 : if (strcmp(*newval, "") != 0)
5049 : {
5050 : TransactionId xid;
5051 : TransactionId *myextra;
5052 : char *endp;
5053 : char *val;
5054 :
5055 3 : errno = 0;
5056 :
5057 : /*
5058 : * Consume leading whitespace to determine if number is negative
5059 : */
5060 3 : val = *newval;
5061 :
5062 3 : while (isspace((unsigned char) *val))
5063 0 : val++;
5064 :
5065 : /*
5066 : * This cast will remove the epoch, if any
5067 : */
5068 3 : xid = (TransactionId) strtou64(val, &endp, 0);
5069 :
5070 3 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE || *val == '-')
5071 : {
5072 2 : GUC_check_errdetail("\"%s\" is not a valid number.",
5073 : "recovery_target_xid");
5074 2 : return false;
5075 : }
5076 :
5077 1 : if (xid < FirstNormalTransactionId)
5078 : {
5079 0 : GUC_check_errdetail("\"%s\" without epoch must be greater than or equal to %u.",
5080 : "recovery_target_xid",
5081 : FirstNormalTransactionId);
5082 0 : return false;
5083 : }
5084 :
5085 1 : myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId));
5086 1 : if (!myextra)
5087 0 : return false;
5088 1 : *myextra = xid;
5089 1 : *extra = myextra;
5090 : }
5091 1291 : return true;
5092 : }
5093 :
5094 : /*
5095 : * GUC assign_hook for recovery_target_xid
5096 : */
5097 : void
5098 1291 : assign_recovery_target_xid(const char *newval, void *extra)
5099 : {
5100 1291 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
5101 0 : recoveryTarget != RECOVERY_TARGET_XID)
5102 0 : error_multiple_recovery_targets();
5103 :
5104 1291 : if (newval && strcmp(newval, "") != 0)
5105 : {
5106 1 : recoveryTarget = RECOVERY_TARGET_XID;
5107 1 : recoveryTargetXid = *((TransactionId *) extra);
5108 : }
5109 : else
5110 1290 : recoveryTarget = RECOVERY_TARGET_UNSET;
5111 1291 : }
|