Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * xlogrecovery.c
4 : * Functions for WAL recovery, standby mode
5 : *
6 : * This source file contains functions controlling WAL recovery.
7 : * InitWalRecovery() initializes the system for crash or archive recovery,
8 : * or standby mode, depending on configuration options and the state of
9 : * the control file and possible backup label file. PerformWalRecovery()
10 : * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 : * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 : * and prepares information needed to initialize the WAL for writes. In
13 : * addition to these three main functions, there are a bunch of functions
14 : * for interrogating recovery state and controlling the recovery process.
15 : *
16 : *
17 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
18 : * Portions Copyright (c) 1994, Regents of the University of California
19 : *
20 : * src/backend/access/transam/xlogrecovery.c
21 : *
22 : *-------------------------------------------------------------------------
23 : */
24 :
25 : #include "postgres.h"
26 :
27 : #include <ctype.h>
28 : #include <time.h>
29 : #include <sys/stat.h>
30 : #include <sys/time.h>
31 : #include <unistd.h>
32 :
33 : #include "access/timeline.h"
34 : #include "access/transam.h"
35 : #include "access/xact.h"
36 : #include "access/xlog_internal.h"
37 : #include "access/xlogarchive.h"
38 : #include "access/xlogprefetcher.h"
39 : #include "access/xlogreader.h"
40 : #include "access/xlogrecovery.h"
41 : #include "access/xlogutils.h"
42 : #include "access/xlogwait.h"
43 : #include "backup/basebackup.h"
44 : #include "catalog/pg_control.h"
45 : #include "commands/tablespace.h"
46 : #include "common/file_utils.h"
47 : #include "miscadmin.h"
48 : #include "nodes/miscnodes.h"
49 : #include "pgstat.h"
50 : #include "postmaster/bgwriter.h"
51 : #include "postmaster/startup.h"
52 : #include "replication/slot.h"
53 : #include "replication/slotsync.h"
54 : #include "replication/walreceiver.h"
55 : #include "storage/fd.h"
56 : #include "storage/ipc.h"
57 : #include "storage/latch.h"
58 : #include "storage/pmsignal.h"
59 : #include "storage/procarray.h"
60 : #include "storage/spin.h"
61 : #include "storage/subsystems.h"
62 : #include "utils/datetime.h"
63 : #include "utils/fmgrprotos.h"
64 : #include "utils/guc_hooks.h"
65 : #include "utils/pgstat_internal.h"
66 : #include "utils/pg_lsn.h"
67 : #include "utils/ps_status.h"
68 : #include "utils/pg_rusage.h"
69 : #include "utils/wait_event.h"
70 :
71 : /* Unsupported old recovery command file names (relative to $PGDATA) */
72 : #define RECOVERY_COMMAND_FILE "recovery.conf"
73 : #define RECOVERY_COMMAND_DONE "recovery.done"
74 :
75 : /*
76 : * GUC support: pairs each option string with its RECOVERY_TARGET_ACTION_* value (presumably for the recovery_target_action setting -- confirm).
77 : */
78 : const struct config_enum_entry recovery_target_action_options[] = {
79 : {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
80 : {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
81 : {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
82 : {NULL, 0, false} /* NULL-name entry; presumably the end-of-list marker -- confirm against config_enum_entry */
83 : };
84 :
85 : /* options formerly taken from recovery.conf for archive recovery */
86 : char *recoveryRestoreCommand = NULL;
87 : char *recoveryEndCommand = NULL;
88 : char *archiveCleanupCommand = NULL;
89 : RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
90 : bool recoveryTargetInclusive = true;
91 : int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
92 : TransactionId recoveryTargetXid;
93 : char *recovery_target_time_string;
94 : TimestampTz recoveryTargetTime;
95 : const char *recoveryTargetName;
96 : XLogRecPtr recoveryTargetLSN;
97 : int recovery_min_apply_delay = 0;
98 :
99 : /* options formerly taken from recovery.conf for XLOG streaming */
100 : char *PrimaryConnInfo = NULL;
101 : char *PrimarySlotName = NULL;
102 : bool wal_receiver_create_temp_slot = false;
103 :
104 : /*
105 : * recoveryTargetTimeLineGoal: what the user requested, if any
106 : *
107 : * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
108 : *
109 : * recoveryTargetTLI: the currently understood target timeline; may change during recovery
110 : *
111 : * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
112 : * the timelines of its known parents, newest first (so recoveryTargetTLI is
113 : * always the first list member). Only these TLIs are expected to be seen in
114 : * the WAL segments we read, and indeed only these TLIs will be considered as
115 : * candidate WAL files to open at all.
116 : *
117 : * curFileTLI: the TLI appearing in the name of the current input WAL file.
118 : * (This is not necessarily the same as the timeline from which we are
119 : * replaying WAL, which StartupXLOG calls replayTLI, because we could be
120 : * scanning data that was copied from an ancestor timeline when the current
121 : * file was created.) During a sequential scan we do not allow this value
122 : * to decrease.
123 : */
124 : RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
125 : TimeLineID recoveryTargetTLIRequested = 0;
126 : TimeLineID recoveryTargetTLI = 0;
127 : static List *expectedTLEs;
128 : static TimeLineID curFileTLI;
129 :
130 : /*
131 : * When ArchiveRecoveryRequested is set, archive recovery was requested,
132 : * i.e. signal files were present. When InArchiveRecovery is set, we are
133 : * currently recovering using offline XLOG archives. These variables are only
134 : * valid in the startup process.
135 : *
136 : * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
137 : * currently performing crash recovery using only XLOG files in pg_wal, but
138 : * will switch to using offline XLOG archives as soon as we reach the end of
139 : * WAL in pg_wal.
140 : */
141 : bool ArchiveRecoveryRequested = false;
142 : bool InArchiveRecovery = false;
143 :
144 : /*
145 : * When StandbyModeRequested is set, standby mode was requested, i.e.
146 : * standby.signal file was present. When StandbyMode is set, we are currently
147 : * in standby mode. These variables are only valid in the startup process.
148 : * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
149 : */
150 : static bool StandbyModeRequested = false;
151 : bool StandbyMode = false;
152 :
153 : /* was a signal file present at startup? */
154 : static bool standby_signal_file_found = false;
155 : static bool recovery_signal_file_found = false;
156 :
157 : /*
158 : * CheckPointLoc is the position of the checkpoint record that determines
159 : * where to start the replay. It comes from the backup label file or the
160 : * control file.
161 : *
162 : * RedoStartLSN is the checkpoint's REDO location, also from the backup label
163 : * file or the control file. In standby mode, XLOG streaming usually starts
164 : * from the position where an invalid record was found. But if we fail to
165 : * read even the initial checkpoint record, we use the REDO location instead
166 : * of the checkpoint location as the start position of XLOG streaming.
167 : * Otherwise we would have to jump backwards to the REDO location after
168 : * reading the checkpoint record, because the REDO record can precede the
169 : * checkpoint record.
170 : */
171 : static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
172 : static TimeLineID CheckPointTLI = 0;
173 : static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
174 : static TimeLineID RedoStartTLI = 0;
175 :
176 : /*
177 : * Local copy of SharedHotStandbyActive variable. False actually means "not
178 : * known, need to check the shared state".
179 : */
180 : static bool LocalHotStandbyActive = false;
181 :
182 : /*
183 : * Local copy of SharedPromoteIsTriggered variable. False actually means "not
184 : * known, need to check the shared state".
185 : */
186 : static bool LocalPromoteIsTriggered = false;
187 :
188 : /* Has the recovery code requested a walreceiver wakeup? */
189 : static bool doRequestWalReceiverReply;
190 :
191 : /* XLogReader object used to parse the WAL records */
192 : static XLogReaderState *xlogreader = NULL;
193 :
194 : /* XLogPrefetcher object used to consume WAL records with read-ahead */
195 : static XLogPrefetcher *xlogprefetcher = NULL;
196 :
197 : /* Parameters passed down from ReadRecord to the XLogPageRead callback; InitWalRecovery() allocates one zeroed instance and hands it to XLogReaderAllocate(). */
198 : typedef struct XLogPageReadPrivate
199 : {
200 : int emode; /* presumably the emode argument given to ReadRecord -- confirm */
201 : bool fetching_ckpt; /* are we fetching a checkpoint record? */
202 : bool randAccess; /* NOTE(review): never set in the code visible here; WaitForWALToBecomeAvailable takes a same-named flag -- confirm meaning */
203 : TimeLineID replayTLI; /* presumably the replayTLI argument given to ReadRecord -- confirm */
204 : } XLogPageReadPrivate;
205 :
206 : /* flag to tell XLogPageRead that we have started replaying */
207 : static bool InRedo = false;
208 :
209 : /*
210 : * Codes indicating where we got a WAL file from during recovery, or where
211 : * to attempt to get one.
212 : */
213 : typedef enum
214 : {
215 : XLOG_FROM_ANY = 0, /* request to read WAL from any source */
216 : XLOG_FROM_ARCHIVE, /* restored using restore_command */
217 : XLOG_FROM_PG_WAL, /* existing file in pg_wal */
218 : XLOG_FROM_STREAM, /* streamed from primary */
219 : } XLogSource;
220 :
221 : /* human-readable names for XLogSources, for debugging output; listed in the same order as the XLogSource enumerators (presumably indexed by XLogSource value, so keep the two in sync) */
222 : static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
223 :
224 : /*
225 : * readFile is -1 or a kernel FD for the log file segment that's currently
226 : * open for reading. readSegNo identifies the segment. readOff is the offset
227 : * of the page just read, readLen indicates how much of it has been read into
228 : * readBuf, and readSource indicates where we got the currently open file from.
229 : *
230 : * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
231 : * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
232 : * worthwhile, since the XLOG is not read by general-purpose sessions.
233 : */
234 : static int readFile = -1;
235 : static XLogSegNo readSegNo = 0;
236 : static uint32 readOff = 0;
237 : static uint32 readLen = 0;
238 : static XLogSource readSource = XLOG_FROM_ANY;
239 :
240 : /*
241 : * Keeps track of which source we're currently reading from. This is
242 : * different from readSource in that this is always set, even when we don't
243 : * currently have a WAL file open. If lastSourceFailed is set, our last
244 : * attempt to read from currentSource failed, and we should try another source
245 : * next.
246 : *
247 : * pendingWalRcvRestart is set when a config change occurs that requires a
248 : * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
249 : */
250 : static XLogSource currentSource = XLOG_FROM_ANY;
251 : static bool lastSourceFailed = false;
252 : static bool pendingWalRcvRestart = false;
253 :
254 : /*
255 : * These variables track when we last obtained some WAL data to process,
256 : * and where we got it from. (XLogReceiptSource is initially the same as
257 : * readSource, but readSource gets reset to zero when we don't have data
258 : * to process right now. It is also different from currentSource, which
259 : * also changes when we try to read from a source and fail, while
260 : * XLogReceiptSource tracks where we last successfully read some WAL.)
261 : */
262 : static TimestampTz XLogReceiptTime = 0;
263 : static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
264 :
265 : /* Local copy of WalRcv->flushedUpto */
266 : static XLogRecPtr flushedUpto = InvalidXLogRecPtr;
267 : static TimeLineID receiveTLI = 0;
268 :
269 : /*
270 : * Copy of minRecoveryPoint and backupEndPoint from the control file.
271 : *
272 : * In order to reach consistency, we must replay the WAL up to
273 : * minRecoveryPoint. If backupEndRequired is true, we must also reach
274 : * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
275 : * to backupStartPoint.
276 : *
277 : * Note: In archive recovery, after consistency has been reached, the
278 : * functions in xlog.c will start updating minRecoveryPoint in the control
279 : * file. But this copy of minRecoveryPoint variable reflects the value at the
280 : * beginning of recovery, and is *not* updated after consistency is reached.
281 : */
282 : static XLogRecPtr minRecoveryPoint;
283 : static TimeLineID minRecoveryPointTLI;
284 :
285 : static XLogRecPtr backupStartPoint;
286 : static XLogRecPtr backupEndPoint;
287 : static bool backupEndRequired = false;
288 :
289 : /*
290 : * Have we reached a consistent database state? In crash recovery, we have
291 : * to replay all the WAL, so reachedConsistency is never set. During archive
292 : * recovery, the database is consistent once minRecoveryPoint is reached.
293 : *
294 : * Consistent state means that the system is internally consistent, all
295 : * the WAL has been replayed up to a certain point, and importantly, there
296 : * is no trace of later actions on disk.
297 : *
298 : * This flag is used only by the startup process and postmaster. When
299 : * minRecoveryPoint is reached, the startup process sets it to true and
300 : * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
301 : * which then sets it to true upon receiving the signal.
302 : */
303 : bool reachedConsistency = false;
304 :
305 : /* Buffers dedicated to consistency checks of size BLCKSZ */
306 : static char *replay_image_masked = NULL;
307 : static char *primary_image_masked = NULL;
308 :
309 : XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
310 :
311 : static void XLogRecoveryShmemRequest(void *arg);
312 : static void XLogRecoveryShmemInit(void *arg);
313 : /* Shared-memory callbacks for the XLogRecoveryCtl struct; both functions are defined later in this file. */
314 : const ShmemCallbacks XLogRecoveryShmemCallbacks = {
315 : .request_fn = XLogRecoveryShmemRequest, /* requests the "XLOG Recovery Ctl" struct */
316 : .init_fn = XLogRecoveryShmemInit, /* zeroes the struct, then initializes its spinlock, latch and condition variable */
317 : };
318 :
319 : /*
320 : * abortedRecPtr is the start pointer of a broken record at end of WAL when
321 : * recovery completes; missingContrecPtr is the location of the first
322 : * contrecord that went missing. See CreateOverwriteContrecordRecord for
323 : * details.
324 : */
325 : static XLogRecPtr abortedRecPtr;
326 : static XLogRecPtr missingContrecPtr;
327 :
328 : /*
329 : * if recoveryStopsBefore/After returns true, it saves information of the stop
330 : * point here
331 : */
332 : static TransactionId recoveryStopXid;
333 : static TimestampTz recoveryStopTime;
334 : static XLogRecPtr recoveryStopLSN;
335 : static char recoveryStopName[MAXFNAMELEN];
336 : static bool recoveryStopAfter;
337 :
338 : /* prototypes for local functions */
339 : static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
340 :
341 : static void EnableStandbyMode(void);
342 : static void readRecoverySignalFile(void);
343 : static void validateRecoveryParameters(void);
344 : static bool read_backup_label(XLogRecPtr *checkPointLoc,
345 : TimeLineID *backupLabelTLI,
346 : bool *backupEndRequired, bool *backupFromStandby);
347 : static bool read_tablespace_map(List **tablespaces);
348 :
349 : static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
350 : static void CheckRecoveryConsistency(void);
351 : static void rm_redo_error_callback(void *arg);
352 : #ifdef WAL_DEBUG
353 : static void xlog_outrec(StringInfo buf, XLogReaderState *record);
354 : #endif
355 : static void xlog_block_info(StringInfo buf, XLogReaderState *record);
356 : static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
357 : TimeLineID prevTLI, TimeLineID replayTLI);
358 : static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
359 : static void verifyBackupPageConsistency(XLogReaderState *record);
360 :
361 : static bool recoveryStopsBefore(XLogReaderState *record);
362 : static bool recoveryStopsAfter(XLogReaderState *record);
363 : static char *getRecoveryStopReason(void);
364 : static void recoveryPausesHere(bool endOfRecovery);
365 : static bool recoveryApplyDelay(XLogReaderState *record);
366 : static void ConfirmRecoveryPaused(void);
367 :
368 : static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher,
369 : int emode, bool fetching_ckpt,
370 : TimeLineID replayTLI);
371 :
372 : static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
373 : int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
374 : static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
375 : bool randAccess,
376 : bool fetching_ckpt,
377 : XLogRecPtr tliRecPtr,
378 : TimeLineID replayTLI,
379 : XLogRecPtr replayLSN,
380 : bool nonblocking);
381 : static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
382 : static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
383 : XLogRecPtr RecPtr, TimeLineID replayTLI);
384 : static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
385 : static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
386 : XLogSource source, bool notfoundOk);
387 : static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source);
388 :
389 : static bool CheckForStandbyTrigger(void);
390 : static void SetPromoteIsTriggered(void);
391 : static bool HotStandbyActiveInReplay(void);
392 :
393 : static void SetCurrentChunkStartTime(TimestampTz xtime);
394 : static void SetLatestXTime(TimestampTz xtime);
395 :
396 : /*
397 : * Register shared memory for WAL recovery: requests one XLogRecoveryCtlData struct, to be reached through XLogRecoveryCtl.
398 : */
399 : static void
400 1238 : XLogRecoveryShmemRequest(void *arg) /* arg is not used */
401 : {
402 1238 : ShmemRequestStruct(.name = "XLOG Recovery Ctl",
403 : .size = sizeof(XLogRecoveryCtlData),
404 : .ptr = (void **) &XLogRecoveryCtl, /* presumably filled in with the struct's address once it is allocated -- confirm against ShmemRequestStruct */
405 : );
406 1238 : }
407 : /* Initialize the XLogRecoveryCtl struct: zero it, then set up its spinlock, shared latch and condition variable. arg is not used. */
408 : static void
409 1235 : XLogRecoveryShmemInit(void *arg)
410 : {
411 1235 : memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
412 : /* the lock, latch and condition variable members each get their dedicated initializer after the zero fill */
413 1235 : SpinLockInit(&XLogRecoveryCtl->info_lck);
414 1235 : InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
415 1235 : ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
416 1235 : }
417 :
418 : /*
419 : * A thin wrapper to enable StandbyMode and do other preparatory work as
420 : * needed. InitWalRecovery() calls this when it enters archive recovery with StandbyModeRequested set.
421 : */
422 : static void
423 114 : EnableStandbyMode(void)
424 : {
425 114 : StandbyMode = true;
426 :
427 : /*
428 : * To avoid server log bloat, we don't report recovery progress in a
429 : * standby as it will always be in recovery unless promoted. We disable
430 : * startup progress timeout in standby mode to avoid calling
431 : * startup_progress_timeout_handler() unnecessarily.
432 : */
433 114 : disable_startup_progress_timeout();
434 114 : }
435 :
436 : /*
437 : * Prepare the system for WAL recovery, if needed.
438 : *
439 : * This is called by StartupXLOG() which coordinates the server startup
440 : * sequence. This function analyzes the control file and the backup label
441 : * file, if any, and figures out whether we need to perform crash recovery or
442 : * archive recovery, and how far we need to replay the WAL to reach a
443 : * consistent state.
444 : *
445 : * This doesn't yet change the on-disk state, except for creating the symlinks
446 : * from table space map file if any, and for fetching WAL files needed to find
447 : * the checkpoint record. On entry, the caller has already read the control
448 : * file into memory, and passes it as argument. This function updates it to
449 : * reflect the recovery state, and the caller is expected to write it back to
450 : * disk after initializing other subsystems, but before calling
451 : * PerformWalRecovery().
452 : *
453 : * This initializes some global variables such as ArchiveRecoveryRequested,
454 : * StandbyModeRequested and InRecovery.
455 : */
456 : void
457 1078 : InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
458 : bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
459 : {
460 : XLogPageReadPrivate *private;
461 : struct stat st;
462 : bool wasShutdown;
463 : XLogRecord *record;
464 : DBState dbstate_at_startup;
465 1078 : bool haveTblspcMap = false;
466 1078 : bool haveBackupLabel = false;
467 : CheckPoint checkPoint;
468 1078 : bool backupFromStandby = false;
469 :
470 1078 : dbstate_at_startup = ControlFile->state;
471 :
472 : /*
473 : * Initialize on the assumption we want to recover to the latest timeline
474 : * that's active according to pg_control.
475 : */
476 1078 : if (ControlFile->minRecoveryPointTLI >
477 1078 : ControlFile->checkPointCopy.ThisTimeLineID)
478 1 : recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
479 : else
480 1077 : recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
481 :
482 : /*
483 : * Check for signal files, and if so set up state for offline recovery
484 : */
485 1078 : readRecoverySignalFile();
486 1078 : validateRecoveryParameters();
487 :
488 : /*
489 : * Take ownership of the wakeup latch if we're going to sleep during
490 : * recovery, if required.
491 : */
492 1078 : if (ArchiveRecoveryRequested)
493 119 : OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
494 :
495 : /*
496 : * Set the WAL reading processor now, as it will be needed when reading
497 : * the checkpoint record required (backup_label or not).
498 : */
499 1078 : private = palloc0_object(XLogPageReadPrivate);
500 1078 : xlogreader =
501 1078 : XLogReaderAllocate(wal_segment_size, NULL,
502 1078 : XL_ROUTINE(.page_read = &XLogPageRead,
503 : .segment_open = NULL,
504 : .segment_close = wal_segment_close),
505 : private);
506 1078 : if (!xlogreader)
507 0 : ereport(ERROR,
508 : (errcode(ERRCODE_OUT_OF_MEMORY),
509 : errmsg("out of memory"),
510 : errdetail("Failed while allocating a WAL reading processor.")));
511 1078 : xlogreader->system_identifier = ControlFile->system_identifier;
512 :
513 : /*
514 : * Set the WAL decode buffer size. This limits how far ahead we can read
515 : * in the WAL.
516 : */
517 1078 : XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size);
518 :
519 : /* Create a WAL prefetcher. */
520 1078 : xlogprefetcher = XLogPrefetcherAllocate(xlogreader);
521 :
522 : /*
523 : * Allocate two page buffers dedicated to WAL consistency checks. We do
524 : * it this way, rather than just making static arrays, for two reasons:
525 : * (1) no need to waste the storage in most instantiations of the backend;
526 : * (2) a static char array isn't guaranteed to have any particular
527 : * alignment, whereas palloc() will provide MAXALIGN'd storage.
528 : */
529 1078 : replay_image_masked = (char *) palloc(BLCKSZ);
530 1078 : primary_image_masked = (char *) palloc(BLCKSZ);
531 :
532 : /*
533 : * Read the backup_label file. We want to run this part of the recovery
534 : * process after checking for signal files and after performing validation
535 : * of the recovery parameters.
536 : */
537 1078 : if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
538 : &backupFromStandby))
539 : {
540 82 : List *tablespaces = NIL;
541 :
542 : /*
543 : * Archive recovery was requested, and thanks to the backup label
544 : * file, we know how far we need to replay to reach consistency. Enter
545 : * archive recovery directly.
546 : */
547 82 : InArchiveRecovery = true;
548 82 : if (StandbyModeRequested)
549 70 : EnableStandbyMode();
550 :
551 : /*
552 : * Omitting backup_label when creating a new replica, PITR node etc.
553 : * unfortunately is a common cause of corruption. Logging that
554 : * backup_label was used makes it a bit easier to exclude that as the
555 : * cause of observed corruption.
556 : *
557 : * Do so before we try to read the checkpoint record (which can fail),
558 : * as otherwise it can be hard to understand why a checkpoint other
559 : * than ControlFile->checkPoint is used.
560 : */
561 82 : ereport(LOG,
562 : errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
563 : LSN_FORMAT_ARGS(RedoStartLSN),
564 : LSN_FORMAT_ARGS(CheckPointLoc),
565 : CheckPointTLI));
566 :
567 : /*
568 : * When a backup_label file is present, we want to roll forward from
569 : * the checkpoint it identifies, rather than using pg_control.
570 : */
571 82 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
572 : CheckPointTLI);
573 82 : if (record != NULL)
574 : {
575 82 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
576 82 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
577 82 : ereport(DEBUG1,
578 : errmsg_internal("checkpoint record is at %X/%08X",
579 : LSN_FORMAT_ARGS(CheckPointLoc)));
580 82 : InRecovery = true; /* force recovery even if SHUTDOWNED */
581 :
582 : /*
583 : * Make sure that REDO location exists. This may not be the case
584 : * if there was a crash during an online backup, which left a
585 : * backup_label around that references a WAL segment that's
586 : * already been archived.
587 : */
588 82 : if (checkPoint.redo < CheckPointLoc)
589 : {
590 82 : XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
591 82 : if (!ReadRecord(xlogprefetcher, LOG, false,
592 : checkPoint.ThisTimeLineID))
593 0 : ereport(FATAL,
594 : errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
595 : LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
596 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
597 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
598 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
599 : DataDir, DataDir, DataDir, DataDir));
600 : }
601 : }
602 : else
603 : {
604 0 : ereport(FATAL,
605 : errmsg("could not locate required checkpoint record at %X/%08X",
606 : LSN_FORMAT_ARGS(CheckPointLoc)),
607 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
608 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
609 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
610 : DataDir, DataDir, DataDir, DataDir));
611 : wasShutdown = false; /* keep compiler quiet */
612 : }
613 :
614 : /* Read the tablespace_map file if present and create symlinks. */
615 82 : if (read_tablespace_map(&tablespaces))
616 : {
617 : ListCell *lc;
618 :
619 4 : foreach(lc, tablespaces)
620 : {
621 2 : tablespaceinfo *ti = lfirst(lc);
622 : char *linkloc;
623 :
624 2 : linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
625 :
626 : /*
627 : * Remove the existing symlink if any and Create the symlink
628 : * under PGDATA.
629 : */
630 2 : remove_tablespace_symlink(linkloc);
631 :
632 2 : if (symlink(ti->path, linkloc) < 0)
633 0 : ereport(ERROR,
634 : (errcode_for_file_access(),
635 : errmsg("could not create symbolic link \"%s\": %m",
636 : linkloc)));
637 :
638 2 : pfree(ti->path);
639 2 : pfree(ti);
640 : }
641 :
642 : /* tell the caller to delete it later */
643 2 : haveTblspcMap = true;
644 : }
645 :
646 : /* tell the caller to delete it later */
647 82 : haveBackupLabel = true;
648 : }
649 : else
650 : {
651 : /* No backup_label file has been found if we are here. */
652 :
653 : /*
654 : * If tablespace_map file is present without backup_label file, there
655 : * is no use of such file. There is no harm in retaining it, but it
656 : * is better to get rid of the map file so that we don't have any
657 : * redundant file in data directory and it will avoid any sort of
658 : * confusion. It seems prudent though to just rename the file out of
659 : * the way rather than delete it completely, also we ignore any error
660 : * that occurs in rename operation as even if map file is present
661 : * without backup_label file, it is harmless.
662 : */
663 996 : if (stat(TABLESPACE_MAP, &st) == 0)
664 : {
665 1 : unlink(TABLESPACE_MAP_OLD);
666 1 : if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
667 1 : ereport(LOG,
668 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
669 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
670 : errdetail("File \"%s\" was renamed to \"%s\".",
671 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
672 : else
673 0 : ereport(LOG,
674 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
675 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
676 : errdetail("Could not rename file \"%s\" to \"%s\": %m.",
677 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
678 : }
679 :
680 : /*
681 : * It's possible that archive recovery was requested, but we don't
682 : * know how far we need to replay the WAL before we reach consistency.
683 : * This can happen for example if a base backup is taken from a
684 : * running server using an atomic filesystem snapshot, without calling
685 : * pg_backup_start/stop. Or if you just kill a running primary server
686 : * and put it into archive recovery by creating a recovery signal
687 : * file.
688 : *
689 : * Our strategy in that case is to perform crash recovery first,
690 : * replaying all the WAL present in pg_wal, and only enter archive
691 : * recovery after that.
692 : *
693 : * But usually we already know how far we need to replay the WAL (up
694 : * to minRecoveryPoint, up to backupEndPoint, or until we see an
695 : * end-of-backup record), and we can enter archive recovery directly.
696 : */
697 996 : if (ArchiveRecoveryRequested &&
698 44 : (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) ||
699 9 : ControlFile->backupEndRequired ||
700 9 : XLogRecPtrIsValid(ControlFile->backupEndPoint) ||
701 9 : ControlFile->state == DB_SHUTDOWNED))
702 : {
703 43 : InArchiveRecovery = true;
704 43 : if (StandbyModeRequested)
705 43 : EnableStandbyMode();
706 : }
707 :
708 : /*
709 : * For the same reason as when starting up with backup_label present,
710 : * emit a log message when we continue initializing from a base
711 : * backup.
712 : */
713 996 : if (XLogRecPtrIsValid(ControlFile->backupStartPoint))
714 0 : ereport(LOG,
715 : errmsg("restarting backup recovery with redo LSN %X/%08X",
716 : LSN_FORMAT_ARGS(ControlFile->backupStartPoint)));
717 :
718 : /* Get the last valid checkpoint record. */
719 996 : CheckPointLoc = ControlFile->checkPoint;
720 996 : CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
721 996 : RedoStartLSN = ControlFile->checkPointCopy.redo;
722 996 : RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
723 996 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
724 : CheckPointTLI);
725 996 : if (record != NULL)
726 : {
727 995 : ereport(DEBUG1,
728 : errmsg_internal("checkpoint record is at %X/%08X",
729 : LSN_FORMAT_ARGS(CheckPointLoc)));
730 : }
731 : else
732 : {
733 : /*
734 : * We used to attempt to go back to a secondary checkpoint record
735 : * here, but only when not in standby mode. We now just fail if we
736 : * can't read the last checkpoint because this allows us to
737 : * simplify processing around checkpoints.
738 : */
739 1 : ereport(FATAL,
740 : errmsg("could not locate a valid checkpoint record at %X/%08X",
741 : LSN_FORMAT_ARGS(CheckPointLoc)));
742 : }
743 995 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
744 995 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
745 :
746 : /* Make sure that REDO location exists. */
747 995 : if (checkPoint.redo < CheckPointLoc)
748 : {
749 45 : XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
750 45 : if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID))
751 1 : ereport(FATAL,
752 : errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
753 : LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)));
754 : }
755 : }
756 :
757 1076 : if (ArchiveRecoveryRequested)
758 : {
759 119 : if (StandbyModeRequested)
760 114 : ereport(LOG,
761 : (errmsg("entering standby mode")));
762 5 : else if (recoveryTarget == RECOVERY_TARGET_XID)
763 0 : ereport(LOG,
764 : (errmsg("starting point-in-time recovery to XID %u",
765 : recoveryTargetXid)));
766 5 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
767 0 : ereport(LOG,
768 : (errmsg("starting point-in-time recovery to %s",
769 : timestamptz_to_str(recoveryTargetTime))));
770 5 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
771 3 : ereport(LOG,
772 : (errmsg("starting point-in-time recovery to \"%s\"",
773 : recoveryTargetName)));
774 2 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
775 0 : ereport(LOG,
776 : errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
777 : LSN_FORMAT_ARGS(recoveryTargetLSN)));
778 2 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
779 0 : ereport(LOG,
780 : (errmsg("starting point-in-time recovery to earliest consistent point")));
781 : else
782 2 : ereport(LOG,
783 : (errmsg("starting archive recovery")));
784 : }
785 :
786 : /*
787 : * If the location of the checkpoint record is not on the expected
788 : * timeline in the history of the requested timeline, we cannot proceed:
789 : * the backup is not part of the history of the requested timeline.
790 : */
791 : Assert(expectedTLEs); /* was initialized by reading checkpoint
792 : * record */
793 1076 : if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
794 : CheckPointTLI)
795 : {
796 : XLogRecPtr switchpoint;
797 :
798 : /*
799 : * tliSwitchPoint will throw an error if the checkpoint's timeline is
800 : * not in expectedTLEs at all.
801 : */
802 0 : switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
803 0 : ereport(FATAL,
804 : (errmsg("requested timeline %u is not a child of this server's history",
805 : recoveryTargetTLI),
806 : /* translator: %s is a backup_label file or a pg_control file */
807 : errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
808 : haveBackupLabel ? "backup_label" : "pg_control",
809 : LSN_FORMAT_ARGS(CheckPointLoc),
810 : CheckPointTLI,
811 : LSN_FORMAT_ARGS(switchpoint))));
812 : }
813 :
814 : /*
815 : * The min recovery point should be part of the requested timeline's
816 : * history, too.
817 : */
818 1076 : if (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) &&
819 41 : tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
820 41 : ControlFile->minRecoveryPointTLI)
821 0 : ereport(FATAL,
822 : errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
823 : recoveryTargetTLI,
824 : LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
825 : ControlFile->minRecoveryPointTLI));
826 :
827 1076 : ereport(DEBUG1,
828 : errmsg_internal("redo record is at %X/%08X; shutdown %s",
829 : LSN_FORMAT_ARGS(checkPoint.redo),
830 : wasShutdown ? "true" : "false"));
831 1076 : ereport(DEBUG1,
832 : (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
833 : U64FromFullTransactionId(checkPoint.nextXid),
834 : checkPoint.nextOid)));
835 1076 : ereport(DEBUG1,
836 : (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %" PRIu64,
837 : checkPoint.nextMulti, checkPoint.nextMultiOffset)));
838 1076 : ereport(DEBUG1,
839 : (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
840 : checkPoint.oldestXid, checkPoint.oldestXidDB)));
841 1076 : ereport(DEBUG1,
842 : (errmsg_internal("oldest MultiXactId: %u, in database %u",
843 : checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
844 1076 : ereport(DEBUG1,
845 : (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
846 : checkPoint.oldestCommitTsXid,
847 : checkPoint.newestCommitTsXid)));
848 1076 : if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
849 0 : ereport(PANIC,
850 : (errmsg("invalid next transaction ID")));
851 :
852 : /* sanity check */
853 1076 : if (checkPoint.redo > CheckPointLoc)
854 0 : ereport(PANIC,
855 : (errmsg("invalid redo in checkpoint record")));
856 :
857 : /*
858 : * Check whether we need to force recovery from WAL. If it appears to
859 : * have been a clean shutdown and we did not have a recovery signal file,
860 : * then assume no recovery needed.
861 : */
862 1076 : if (checkPoint.redo < CheckPointLoc)
863 : {
864 126 : if (wasShutdown)
865 0 : ereport(PANIC,
866 : (errmsg("invalid redo record in shutdown checkpoint")));
867 126 : InRecovery = true;
868 : }
869 950 : else if (ControlFile->state != DB_SHUTDOWNED)
870 93 : InRecovery = true;
871 857 : else if (ArchiveRecoveryRequested)
872 : {
873 : /* force recovery due to presence of recovery signal file */
874 8 : InRecovery = true;
875 : }
876 :
877 : /*
878 : * If recovery is needed, update our in-memory copy of pg_control to show
879 : * that we are recovering and to show the selected checkpoint as the place
880 : * we are starting from. We also mark pg_control with any minimum recovery
881 : * stop point obtained from a backup history file.
882 : *
883 : * We don't write the changes to disk yet, though. Only do that after
884 : * initializing various subsystems.
885 : */
886 1076 : if (InRecovery)
887 : {
888 227 : if (InArchiveRecovery)
889 : {
890 125 : ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
891 : }
892 : else
893 : {
894 102 : ereport(LOG,
895 : (errmsg("database system was not properly shut down; "
896 : "automatic recovery in progress")));
897 102 : if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
898 1 : ereport(LOG,
899 : (errmsg("crash recovery starts in timeline %u "
900 : "and has target timeline %u",
901 : ControlFile->checkPointCopy.ThisTimeLineID,
902 : recoveryTargetTLI)));
903 102 : ControlFile->state = DB_IN_CRASH_RECOVERY;
904 : }
905 227 : ControlFile->checkPoint = CheckPointLoc;
906 227 : ControlFile->checkPointCopy = checkPoint;
907 227 : if (InArchiveRecovery)
908 : {
909 : /* initialize minRecoveryPoint if not set yet */
910 125 : if (ControlFile->minRecoveryPoint < checkPoint.redo)
911 : {
912 85 : ControlFile->minRecoveryPoint = checkPoint.redo;
913 85 : ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
914 : }
915 : }
916 :
917 : /*
918 : * Set backupStartPoint if we're starting recovery from a base backup.
919 : *
920 : * Also set backupEndPoint and use minRecoveryPoint as the backup end
921 : * location if we're starting recovery from a base backup which was
922 : * taken from a standby. In this case, the database system status in
923 : * pg_control must indicate that the database was already in recovery.
924 : * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
925 : * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
926 : * before reaching this point; e.g. because restore_command or
927 : * primary_conninfo were faulty.
928 : *
929 : * Any other state indicates that the backup somehow became corrupted
930 : * and we can't sensibly continue with recovery.
931 : */
932 227 : if (haveBackupLabel)
933 : {
934 82 : ControlFile->backupStartPoint = checkPoint.redo;
935 82 : ControlFile->backupEndRequired = backupEndRequired;
936 :
937 82 : if (backupFromStandby)
938 : {
939 5 : if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
940 : dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
941 0 : ereport(FATAL,
942 : (errmsg("backup_label contains data inconsistent with control file"),
943 : errhint("This means that the backup is corrupted and you will "
944 : "have to use another backup for recovery.")));
945 5 : ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
946 : }
947 : }
948 : }
949 :
950 : /* remember these, so that we know when we have reached consistency */
951 1076 : backupStartPoint = ControlFile->backupStartPoint;
952 1076 : backupEndRequired = ControlFile->backupEndRequired;
953 1076 : backupEndPoint = ControlFile->backupEndPoint;
954 1076 : if (InArchiveRecovery)
955 : {
956 125 : minRecoveryPoint = ControlFile->minRecoveryPoint;
957 125 : minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
958 : }
959 : else
960 : {
961 951 : minRecoveryPoint = InvalidXLogRecPtr;
962 951 : minRecoveryPointTLI = 0;
963 : }
964 :
965 : /*
966 : * Start recovery assuming that the final record isn't lost.
967 : */
968 1076 : abortedRecPtr = InvalidXLogRecPtr;
969 1076 : missingContrecPtr = InvalidXLogRecPtr;
970 :
971 1076 : *wasShutdown_ptr = wasShutdown;
972 1076 : *haveBackupLabel_ptr = haveBackupLabel;
973 1076 : *haveTblspcMap_ptr = haveTblspcMap;
974 1076 : }
975 :
976 : /*
977 : * See if there are any recovery signal files and if so, set state for
978 : * recovery.
979 : *
980 : * See if there is a recovery command file (recovery.conf), and if so
981 : * throw an ERROR since as of PG12 we no longer recognize that.
982 : */
983 : static void
984 1078 : readRecoverySignalFile(void)
985 : {
986 : struct stat stat_buf;
987 :
988 1078 : if (IsBootstrapProcessingMode())
989 959 : return;
990 :
991 : /*
992 : * Check for old recovery API file: recovery.conf
993 : */
994 1021 : if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
995 0 : ereport(FATAL,
996 : (errcode_for_file_access(),
997 : errmsg("using recovery command file \"%s\" is not supported",
998 : RECOVERY_COMMAND_FILE)));
999 :
1000 : /*
1001 : * Remove unused .done file, if present. Ignore if absent.
1002 : */
1003 1021 : unlink(RECOVERY_COMMAND_DONE);
1004 :
1005 : /*
1006 : * Check for recovery signal files and if found, fsync them since they
1007 : * represent server state information. We don't sweat too much about the
1008 : * possibility of fsync failure, however.
1009 : */
1010 1021 : if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1011 : {
1012 : int fd;
1013 :
1014 114 : fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1015 : S_IRUSR | S_IWUSR);
1016 114 : if (fd >= 0)
1017 : {
1018 114 : (void) pg_fsync(fd);
1019 114 : close(fd);
1020 : }
1021 114 : standby_signal_file_found = true;
1022 : }
1023 :
1024 1021 : if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1025 : {
1026 : int fd;
1027 :
1028 6 : fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1029 : S_IRUSR | S_IWUSR);
1030 6 : if (fd >= 0)
1031 : {
1032 6 : (void) pg_fsync(fd);
1033 6 : close(fd);
1034 : }
1035 6 : recovery_signal_file_found = true;
1036 : }
1037 :
1038 : /*
1039 : * If both signal files are present, standby signal file takes precedence.
1040 : * If neither is present then we won't enter archive recovery.
1041 : */
1042 1021 : StandbyModeRequested = false;
1043 1021 : ArchiveRecoveryRequested = false;
1044 1021 : if (standby_signal_file_found)
1045 : {
1046 114 : StandbyModeRequested = true;
1047 114 : ArchiveRecoveryRequested = true;
1048 : }
1049 907 : else if (recovery_signal_file_found)
1050 : {
1051 5 : StandbyModeRequested = false;
1052 5 : ArchiveRecoveryRequested = true;
1053 : }
1054 : else
1055 902 : return;
1056 :
1057 : /*
1058 : * We don't support standby mode in standalone backends; that requires
1059 : * other processes such as the WAL receiver to be alive.
1060 : */
1061 119 : if (StandbyModeRequested && !IsUnderPostmaster)
1062 0 : ereport(FATAL,
1063 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1064 : errmsg("standby mode is not supported by single-user servers")));
1065 : }
1066 :
1067 : static void
1068 1078 : validateRecoveryParameters(void)
1069 : {
1070 1078 : if (!ArchiveRecoveryRequested)
1071 959 : return;
1072 :
1073 : /*
1074 : * Check for compulsory parameters
1075 : */
1076 119 : if (StandbyModeRequested)
1077 : {
1078 114 : if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1079 11 : (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1080 2 : ereport(WARNING,
1081 : (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1082 : errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1083 : }
1084 : else
1085 : {
1086 5 : if (recoveryRestoreCommand == NULL ||
1087 5 : strcmp(recoveryRestoreCommand, "") == 0)
1088 0 : ereport(FATAL,
1089 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1090 : errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1091 : }
1092 :
1093 : /*
1094 : * Override any inconsistent requests. Note that this is a change of
1095 : * behaviour in 9.5; prior to this we simply ignored a request to pause if
1096 : * hot_standby = off, which was surprising behaviour.
1097 : */
1098 119 : if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
1099 112 : !EnableHotStandby)
1100 3 : recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
1101 :
1102 : /*
1103 : * Final parsing of recovery_target_time string; see also
1104 : * check_recovery_target_time().
1105 : */
1106 119 : if (recoveryTarget == RECOVERY_TARGET_TIME)
1107 : {
1108 0 : recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
1109 : CStringGetDatum(recovery_target_time_string),
1110 : ObjectIdGetDatum(InvalidOid),
1111 : Int32GetDatum(-1)));
1112 : }
1113 :
1114 : /*
1115 : * If user specified recovery_target_timeline, validate it or compute the
1116 : * "latest" value. We can't do this until after we've gotten the restore
1117 : * command and set InArchiveRecovery, because we need to fetch timeline
1118 : * history files from the archive.
1119 : */
1120 119 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
1121 : {
1122 0 : TimeLineID rtli = recoveryTargetTLIRequested;
1123 :
1124 : /* Timeline 1 does not have a history file, all else should */
1125 0 : if (rtli != 1 && !existsTimeLineHistory(rtli))
1126 0 : ereport(FATAL,
1127 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1128 : errmsg("recovery target timeline %u does not exist",
1129 : rtli)));
1130 0 : recoveryTargetTLI = rtli;
1131 : }
1132 119 : else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
1133 : {
1134 : /* We start the "latest" search from pg_control's timeline */
1135 119 : recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
1136 : }
1137 : else
1138 : {
1139 : /*
1140 : * else we just use the recoveryTargetTLI as already read from
1141 : * ControlFile
1142 : */
1143 : Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
1144 : }
1145 : }
1146 :
1147 : /*
1148 : * read_backup_label: check to see if a backup_label file is present
1149 : *
1150 : * If we see a backup_label during recovery, we assume that we are recovering
1151 : * from a backup dump file, and we therefore roll forward from the checkpoint
1152 : * identified by the label file, NOT what pg_control says. This avoids the
1153 : * problem that pg_control might have been archived one or more checkpoints
1154 : * later than the start of the dump, and so if we rely on it as the start
1155 : * point, we will fail to restore a consistent database state.
1156 : *
1157 : * Returns true if a backup_label was found (and fills the checkpoint
1158 : * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1159 : * returns false if not. If this backup_label came from a streamed backup,
1160 : * *backupEndRequired is set to true. If this backup_label was created during
1161 : * recovery, *backupFromStandby is set to true.
1162 : *
1163 : * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1164 : * and TLI read from the backup file.
1165 : */
1166 : static bool
1167 1078 : read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1168 : bool *backupEndRequired, bool *backupFromStandby)
1169 : {
1170 : char startxlogfilename[MAXFNAMELEN];
1171 : TimeLineID tli_from_walseg,
1172 : tli_from_file;
1173 : FILE *lfp;
1174 : char ch;
1175 : char backuptype[20];
1176 : char backupfrom[20];
1177 : char backuplabel[MAXPGPATH];
1178 : char backuptime[128];
1179 : uint32 hi,
1180 : lo;
1181 :
1182 : /* suppress possible uninitialized-variable warnings */
1183 1078 : *checkPointLoc = InvalidXLogRecPtr;
1184 1078 : *backupLabelTLI = 0;
1185 1078 : *backupEndRequired = false;
1186 1078 : *backupFromStandby = false;
1187 :
1188 : /*
1189 : * See if label file is present
1190 : */
1191 1078 : lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1192 1078 : if (!lfp)
1193 : {
1194 996 : if (errno != ENOENT)
1195 0 : ereport(FATAL,
1196 : (errcode_for_file_access(),
1197 : errmsg("could not read file \"%s\": %m",
1198 : BACKUP_LABEL_FILE)));
1199 996 : return false; /* it's not there, all is fine */
1200 : }
1201 :
1202 : /*
1203 : * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1204 : * is pretty crude, but we are not expecting any variability in the file
1205 : * format).
1206 : */
1207 82 : if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
1208 82 : &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1209 0 : ereport(FATAL,
1210 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1211 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1212 82 : RedoStartLSN = ((uint64) hi) << 32 | lo;
1213 82 : RedoStartTLI = tli_from_walseg;
1214 82 : if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
1215 82 : &hi, &lo, &ch) != 3 || ch != '\n')
1216 0 : ereport(FATAL,
1217 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1218 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1219 82 : *checkPointLoc = ((uint64) hi) << 32 | lo;
1220 82 : *backupLabelTLI = tli_from_walseg;
1221 :
1222 : /*
1223 : * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1224 : * which could mean either pg_basebackup or the pg_backup_start/stop
1225 : * method was used) or if this label came from somewhere else (the only
1226 : * other option today being from pg_rewind). If this was a streamed
1227 : * backup then we know that we need to play through until we get to the
1228 : * end of the WAL which was generated during the backup (at which point we
1229 : * will have reached consistency and backupEndRequired will be reset to be
1230 : * false).
1231 : */
1232 82 : if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1233 : {
1234 82 : if (strcmp(backuptype, "streamed") == 0)
1235 81 : *backupEndRequired = true;
1236 : }
1237 :
1238 : /*
1239 : * BACKUP FROM lets us know if this was from a primary or a standby. If
1240 : * it was from a standby, we'll double-check that the control file state
1241 : * matches that of a standby.
1242 : */
1243 82 : if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1244 : {
1245 82 : if (strcmp(backupfrom, "standby") == 0)
1246 5 : *backupFromStandby = true;
1247 : }
1248 :
1249 : /*
1250 : * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1251 : * but checking for their presence is useful for debugging and the next
1252 : * sanity checks. Cope also with the fact that the result buffers have a
1253 : * pre-allocated size, hence if the backup_label file has been generated
1254 : * with strings longer than the maximum assumed here an incorrect parsing
1255 : * happens. That's fine as only minor consistency checks are done
1256 : * afterwards.
1257 : */
1258 82 : if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1259 82 : ereport(DEBUG1,
1260 : (errmsg_internal("backup time %s in file \"%s\"",
1261 : backuptime, BACKUP_LABEL_FILE)));
1262 :
1263 82 : if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1264 81 : ereport(DEBUG1,
1265 : (errmsg_internal("backup label %s in file \"%s\"",
1266 : backuplabel, BACKUP_LABEL_FILE)));
1267 :
1268 : /*
1269 : * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1270 : * it as a sanity check if present.
1271 : */
1272 82 : if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1273 : {
1274 81 : if (tli_from_walseg != tli_from_file)
1275 0 : ereport(FATAL,
1276 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1277 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1278 : errdetail("Timeline ID parsed is %u, but expected %u.",
1279 : tli_from_file, tli_from_walseg)));
1280 :
1281 81 : ereport(DEBUG1,
1282 : (errmsg_internal("backup timeline %u in file \"%s\"",
1283 : tli_from_file, BACKUP_LABEL_FILE)));
1284 : }
1285 :
1286 82 : if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
1287 0 : ereport(FATAL,
1288 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1289 : errmsg("this is an incremental backup, not a data directory"),
1290 : errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1291 :
1292 82 : if (ferror(lfp) || FreeFile(lfp))
1293 0 : ereport(FATAL,
1294 : (errcode_for_file_access(),
1295 : errmsg("could not read file \"%s\": %m",
1296 : BACKUP_LABEL_FILE)));
1297 :
1298 82 : return true;
1299 : }
1300 :
1301 : /*
1302 : * read_tablespace_map: check to see if a tablespace_map file is present
1303 : *
1304 : * If we see a tablespace_map file during recovery, we assume that we are
1305 : * recovering from a backup dump file, and we therefore need to create symlinks
1306 : * as per the information present in tablespace_map file.
1307 : *
1308 : * Returns true if a tablespace_map file was found (and fills *tablespaces
1309 : * with a tablespaceinfo struct for each tablespace listed in the file);
1310 : * returns false if not.
1311 : */
1312 : static bool
1313 82 : read_tablespace_map(List **tablespaces)
1314 : {
1315 : tablespaceinfo *ti;
1316 : FILE *lfp;
1317 : char str[MAXPGPATH];
1318 : int ch,
1319 : i,
1320 : n;
1321 : bool was_backslash;
1322 :
1323 : /*
1324 : * See if tablespace_map file is present
1325 : */
1326 82 : lfp = AllocateFile(TABLESPACE_MAP, "r");
1327 82 : if (!lfp)
1328 : {
1329 80 : if (errno != ENOENT)
1330 0 : ereport(FATAL,
1331 : (errcode_for_file_access(),
1332 : errmsg("could not read file \"%s\": %m",
1333 : TABLESPACE_MAP)));
1334 80 : return false; /* it's not there, all is fine */
1335 : }
1336 :
1337 : /*
1338 : * Read and parse the link name and path lines from tablespace_map file
1339 : * (this code is pretty crude, but we are not expecting any variability in
1340 : * the file format). De-escape any backslashes that were inserted.
1341 : */
1342 2 : i = 0;
1343 2 : was_backslash = false;
1344 77 : while ((ch = fgetc(lfp)) != EOF)
1345 : {
1346 75 : if (!was_backslash && (ch == '\n' || ch == '\r'))
1347 2 : {
1348 : char *endp;
1349 :
1350 2 : if (i == 0)
1351 0 : continue; /* \r immediately followed by \n */
1352 :
1353 : /*
1354 : * The de-escaped line should contain an OID followed by exactly
1355 : * one space followed by a path. The path might start with
1356 : * spaces, so don't be too liberal about parsing.
1357 : */
1358 2 : str[i] = '\0';
1359 2 : n = 0;
1360 12 : while (str[n] && str[n] != ' ')
1361 10 : n++;
1362 2 : if (n < 1 || n >= i - 1)
1363 0 : ereport(FATAL,
1364 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1365 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1366 2 : str[n++] = '\0';
1367 :
1368 2 : ti = palloc0_object(tablespaceinfo);
1369 2 : errno = 0;
1370 2 : ti->oid = strtoul(str, &endp, 10);
1371 2 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1372 0 : ereport(FATAL,
1373 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1374 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1375 2 : ti->path = pstrdup(str + n);
1376 2 : *tablespaces = lappend(*tablespaces, ti);
1377 :
1378 2 : i = 0;
1379 2 : continue;
1380 : }
1381 73 : else if (!was_backslash && ch == '\\')
1382 0 : was_backslash = true;
1383 : else
1384 : {
1385 73 : if (i < sizeof(str) - 1)
1386 73 : str[i++] = ch;
1387 73 : was_backslash = false;
1388 : }
1389 : }
1390 :
1391 2 : if (i != 0 || was_backslash) /* last line not terminated? */
1392 0 : ereport(FATAL,
1393 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1394 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1395 :
1396 2 : if (ferror(lfp) || FreeFile(lfp))
1397 0 : ereport(FATAL,
1398 : (errcode_for_file_access(),
1399 : errmsg("could not read file \"%s\": %m",
1400 : TABLESPACE_MAP)));
1401 :
1402 2 : return true;
1403 : }
1404 :
1405 : /*
1406 : * Finish WAL recovery.
1407 : *
1408 : * This does not close the 'xlogreader' yet, because in some cases the caller
1409 : * still wants to re-read the last checkpoint record by calling
1410 : * ReadCheckpointRecord().
1411 : *
1412 : * Returns the position of the last valid or applied record, after which new
1413 : * WAL should be appended, information about why recovery was ended, and some
1414 : * other things. See the EndOfWalRecoveryInfo struct for details.
1415 : */
1416 : EndOfWalRecoveryInfo *
1417 1011 : FinishWalRecovery(void)
1418 : {
1419 1011 : EndOfWalRecoveryInfo *result = palloc_object(EndOfWalRecoveryInfo);
1420 : XLogRecPtr lastRec;
1421 : TimeLineID lastRecTLI;
1422 : XLogRecPtr endOfLog;
1423 :
1424 : /*
1425 : * Kill WAL receiver, if it's still running, before we continue to write
1426 : * the startup checkpoint and aborted-contrecord records. It will trump
1427 : * over these records and subsequent ones if it's still alive when we
1428 : * start writing WAL.
1429 : */
1430 1011 : XLogShutdownWalRcv();
1431 :
1432 : /*
1433 : * Shutdown the slot sync worker to drop any temporary slots acquired by
1434 : * it and to prevent it from keep trying to fetch the failover slots.
1435 : *
1436 : * We do not update the 'synced' column in 'pg_replication_slots' system
1437 : * view from true to false here, as any failed update could leave 'synced'
1438 : * column false for some slots. This could cause issues during slot sync
1439 : * after restarting the server as a standby. While updating the 'synced'
1440 : * column after switching to the new timeline is an option, it does not
1441 : * simplify the handling for the 'synced' column. Therefore, we retain the
1442 : * 'synced' column as true after promotion as it may provide useful
1443 : * information about the slot origin.
1444 : */
1445 1011 : ShutDownSlotSync();
1446 :
1447 : /*
1448 : * We are now done reading the xlog from stream. Turn off streaming
1449 : * recovery to force fetching the files (which would be required at end of
1450 : * recovery, e.g., timeline history file) from archive or pg_wal.
1451 : *
1452 : * Note that standby mode must be turned off after killing WAL receiver,
1453 : * i.e., calling XLogShutdownWalRcv().
1454 : */
1455 : Assert(!WalRcvStreaming());
1456 1011 : StandbyMode = false;
1457 :
1458 : /*
1459 : * Determine where to start writing WAL next.
1460 : *
1461 : * Re-fetch the last valid or last applied record, so we can identify the
1462 : * exact endpoint of what we consider the valid portion of WAL. There may
1463 : * be an incomplete continuation record after that, in which case
1464 : * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1465 : * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1466 : * it is intentionally missing. See CreateOverwriteContrecordRecord().
1467 : *
1468 : * An important side-effect of this is to load the last page into
1469 : * xlogreader. The caller uses it to initialize the WAL for writing.
1470 : */
1471 1011 : if (!InRecovery)
1472 : {
1473 848 : lastRec = CheckPointLoc;
1474 848 : lastRecTLI = CheckPointTLI;
1475 : }
1476 : else
1477 : {
1478 163 : lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
1479 163 : lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1480 : }
1481 1011 : XLogPrefetcherBeginRead(xlogprefetcher, lastRec);
1482 1011 : (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1483 1011 : endOfLog = xlogreader->EndRecPtr;
1484 :
1485 : /*
1486 : * Remember the TLI in the filename of the XLOG segment containing the
1487 : * end-of-log. It could be different from the timeline that endOfLog
1488 : * nominally belongs to, if there was a timeline switch in that segment,
1489 : * and we were reading the old WAL from a segment belonging to a higher
1490 : * timeline.
1491 : */
1492 1011 : result->endOfLogTLI = xlogreader->seg.ws_tli;
1493 :
1494 1011 : if (ArchiveRecoveryRequested)
1495 : {
1496 : /*
1497 : * We are no longer in archive recovery state.
1498 : *
1499 : * We are now done reading the old WAL. Turn off archive fetching if
1500 : * it was active.
1501 : */
1502 : Assert(InArchiveRecovery);
1503 55 : InArchiveRecovery = false;
1504 :
1505 : /*
1506 : * If the ending log segment is still open, close it (to avoid
1507 : * problems on Windows with trying to rename or delete an open file).
1508 : */
1509 55 : if (readFile >= 0)
1510 : {
1511 55 : close(readFile);
1512 55 : readFile = -1;
1513 : }
1514 : }
1515 :
1516 : /*
1517 : * Copy the last partial block to the caller, for initializing the WAL
1518 : * buffer for appending new WAL.
1519 : */
1520 1011 : if (endOfLog % XLOG_BLCKSZ != 0)
1521 : {
1522 : char *page;
1523 : int len;
1524 : XLogRecPtr pageBeginPtr;
1525 :
1526 989 : pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1527 : Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1528 :
1529 : /* Copy the valid part of the last block */
1530 989 : len = endOfLog % XLOG_BLCKSZ;
1531 989 : page = palloc(len);
1532 989 : memcpy(page, xlogreader->readBuf, len);
1533 :
1534 989 : result->lastPageBeginPtr = pageBeginPtr;
1535 989 : result->lastPage = page;
1536 : }
1537 : else
1538 : {
1539 : /* There is no partial block to copy. */
1540 22 : result->lastPageBeginPtr = endOfLog;
1541 22 : result->lastPage = NULL;
1542 : }
1543 :
1544 : /*
1545 : * Create a comment for the history file to explain why and where timeline
1546 : * changed.
1547 : */
1548 1011 : result->recoveryStopReason = getRecoveryStopReason();
1549 :
1550 1011 : result->lastRec = lastRec;
1551 1011 : result->lastRecTLI = lastRecTLI;
1552 1011 : result->endOfLog = endOfLog;
1553 :
1554 1011 : result->abortedRecPtr = abortedRecPtr;
1555 1011 : result->missingContrecPtr = missingContrecPtr;
1556 :
1557 1011 : result->standby_signal_file_found = standby_signal_file_found;
1558 1011 : result->recovery_signal_file_found = recovery_signal_file_found;
1559 :
1560 1011 : return result;
1561 : }
1562 :
1563 : /*
1564 : * Clean up the WAL reader and leftovers from restoring WAL from archive
1565 : */
1566 : void
1567 1011 : ShutdownWalRecovery(void)
1568 : {
1569 : char recoveryPath[MAXPGPATH];
1570 :
1571 : /* Final update of pg_stat_recovery_prefetch. */
1572 1011 : XLogPrefetcherComputeStats(xlogprefetcher);
1573 :
1574 : /* Shut down xlogreader */
1575 1011 : if (readFile >= 0)
1576 : {
1577 956 : close(readFile);
1578 956 : readFile = -1;
1579 : }
1580 1011 : pfree(xlogreader->private_data);
1581 1011 : XLogReaderFree(xlogreader);
1582 1011 : XLogPrefetcherFree(xlogprefetcher);
1583 :
1584 1011 : if (ArchiveRecoveryRequested)
1585 : {
1586 : /*
1587 : * Since there might be a partial WAL segment named RECOVERYXLOG, get
1588 : * rid of it.
1589 : */
1590 55 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1591 55 : unlink(recoveryPath); /* ignore any error */
1592 :
1593 : /* Get rid of any remaining recovered timeline-history file, too */
1594 55 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1595 55 : unlink(recoveryPath); /* ignore any error */
1596 : }
1597 :
1598 : /*
1599 : * We don't need the latch anymore. It's not strictly necessary to disown
1600 : * it, but let's do it for the sake of tidiness.
1601 : */
1602 1011 : if (ArchiveRecoveryRequested)
1603 55 : DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
1604 1011 : }
1605 :
1606 : /*
1607 : * Perform WAL recovery.
1608 : *
1609 : * If the system was shut down cleanly, this is never called.
1610 : */
1611 : void
1612 226 : PerformWalRecovery(void)
1613 : {
1614 : XLogRecord *record;
1615 226 : bool reachedRecoveryTarget = false;
1616 : TimeLineID replayTLI;
1617 :
1618 : /*
1619 : * Initialize shared variables for tracking progress of WAL replay, as if
1620 : * we had just replayed the record before the REDO location (or the
1621 : * checkpoint record itself, if it's a shutdown checkpoint).
1622 : */
1623 226 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1624 226 : if (RedoStartLSN < CheckPointLoc)
1625 : {
1626 125 : XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
1627 125 : XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
1628 125 : XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
1629 : }
1630 : else
1631 : {
1632 101 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
1633 101 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
1634 101 : XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
1635 : }
1636 226 : XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
1637 226 : XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
1638 226 : XLogRecoveryCtl->recoveryLastXTime = 0;
1639 226 : XLogRecoveryCtl->currentChunkStartTime = 0;
1640 226 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
1641 226 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1642 :
1643 : /* Also ensure XLogReceiptTime has a sane value */
1644 226 : XLogReceiptTime = GetCurrentTimestamp();
1645 :
1646 : /*
1647 : * Let postmaster know we've started redo now, so that it can launch the
1648 : * archiver if necessary.
1649 : */
1650 226 : if (IsUnderPostmaster)
1651 217 : SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
1652 :
1653 : /*
1654 : * Allow read-only connections immediately if we're consistent already.
1655 : */
1656 226 : CheckRecoveryConsistency();
1657 :
1658 : /*
1659 : * Find the first record that logically follows the checkpoint --- it
1660 : * might physically precede it, though.
1661 : */
1662 226 : if (RedoStartLSN < CheckPointLoc)
1663 : {
1664 : /* back up to find the record */
1665 125 : replayTLI = RedoStartTLI;
1666 125 : XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
1667 125 : record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1668 :
1669 : /*
1670 : * If a checkpoint record's redo pointer points back to an earlier
1671 : * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1672 : * record.
1673 : */
1674 125 : if (record->xl_rmid != RM_XLOG_ID ||
1675 125 : (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1676 0 : ereport(FATAL,
1677 : errmsg("unexpected record type found at redo point %X/%08X",
1678 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
1679 : }
1680 : else
1681 : {
1682 : /* just have to read next record after CheckPoint */
1683 : Assert(xlogreader->ReadRecPtr == CheckPointLoc);
1684 101 : replayTLI = CheckPointTLI;
1685 101 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1686 : }
1687 :
1688 226 : if (record != NULL)
1689 : {
1690 : TimestampTz xtime;
1691 : PGRUsage ru0;
1692 :
1693 217 : pg_rusage_init(&ru0);
1694 :
1695 217 : InRedo = true;
1696 :
1697 217 : RmgrStartup();
1698 :
1699 217 : ereport(LOG,
1700 : errmsg("redo starts at %X/%08X",
1701 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
1702 :
1703 : /* Prepare to report progress of the redo phase. */
1704 217 : if (!StandbyMode)
1705 108 : begin_startup_progress_phase();
1706 :
1707 : /*
1708 : * main redo apply loop
1709 : */
1710 : do
1711 : {
1712 2942755 : if (!StandbyMode)
1713 327222 : ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
1714 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
1715 :
1716 : #ifdef WAL_DEBUG
1717 : if (XLOG_DEBUG)
1718 : {
1719 : StringInfoData buf;
1720 :
1721 : initStringInfo(&buf);
1722 : appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
1723 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1724 : LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
1725 : xlog_outrec(&buf, xlogreader);
1726 : appendStringInfoString(&buf, " - ");
1727 : xlog_outdesc(&buf, xlogreader);
1728 : elog(LOG, "%s", buf.data);
1729 : pfree(buf.data);
1730 : }
1731 : #endif
1732 :
1733 : /* Handle interrupt signals of startup process */
1734 2942755 : ProcessStartupProcInterrupts();
1735 :
1736 : /*
1737 : * Pause WAL replay, if requested by a hot-standby session via
1738 : * SetRecoveryPause().
1739 : *
1740 : * Note that we intentionally don't take the info_lck spinlock
1741 : * here. We might therefore read a slightly stale value of the
1742 : * recoveryPause flag, but it can't be very stale (no worse than
1743 : * the last spinlock we did acquire). Since a pause request is a
1744 : * pretty asynchronous thing anyway, possibly responding to it one
1745 : * WAL record later than we otherwise would is a minor issue, so
1746 : * it doesn't seem worth adding another spinlock cycle to prevent
1747 : * that.
1748 : */
1749 2942755 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1750 : RECOVERY_NOT_PAUSED)
1751 0 : recoveryPausesHere(false);
1752 :
1753 : /*
1754 : * Have we reached our recovery target?
1755 : */
1756 2942755 : if (recoveryStopsBefore(xlogreader))
1757 : {
1758 2 : reachedRecoveryTarget = true;
1759 2 : break;
1760 : }
1761 :
1762 : /*
1763 : * If we've been asked to lag the primary, wait on latch until
1764 : * enough time has passed.
1765 : */
1766 2942753 : if (recoveryApplyDelay(xlogreader))
1767 : {
1768 : /*
1769 : * We test for paused recovery again here. If user sets
1770 : * delayed apply, it may be because they expect to pause
1771 : * recovery in case of problems, so we must test again here
1772 : * otherwise pausing during the delay-wait wouldn't work.
1773 : */
1774 30 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1775 : RECOVERY_NOT_PAUSED)
1776 0 : recoveryPausesHere(false);
1777 : }
1778 :
1779 : /*
1780 : * Apply the record
1781 : */
1782 2942753 : ApplyWalRecord(xlogreader, record, &replayTLI);
1783 :
1784 : /*
1785 : * If we replayed an LSN that someone was waiting for then walk
1786 : * over the shared memory array and set latches to notify the
1787 : * waiters.
1788 : */
1789 5885502 : if (waitLSNState &&
1790 2942751 : (XLogRecoveryCtl->lastReplayedEndRecPtr >=
1791 2942751 : pg_atomic_read_u64(&waitLSNState->minWaitedLSN[WAIT_LSN_TYPE_STANDBY_REPLAY])))
1792 13 : WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_REPLAY, XLogRecoveryCtl->lastReplayedEndRecPtr);
1793 :
1794 : /* Exit loop if we reached inclusive recovery target */
1795 2942751 : if (recoveryStopsAfter(xlogreader))
1796 : {
1797 5 : reachedRecoveryTarget = true;
1798 5 : break;
1799 : }
1800 :
1801 : /* Else, try to fetch the next WAL record */
1802 2942746 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1803 2942686 : } while (record != NULL);
1804 :
1805 : /*
1806 : * end of main redo apply loop
1807 : */
1808 :
1809 155 : if (reachedRecoveryTarget)
1810 : {
1811 7 : if (!reachedConsistency)
1812 0 : ereport(FATAL,
1813 : (errmsg("requested recovery stop point is before consistent recovery point")));
1814 :
1815 : /*
1816 : * This is the last point where we can restart recovery with a new
1817 : * recovery target, if we shutdown and begin again. After this,
1818 : * Resource Managers may choose to do permanent corrective actions
1819 : * at end of recovery.
1820 : */
1821 7 : switch (recoveryTargetAction)
1822 : {
1823 0 : case RECOVERY_TARGET_ACTION_SHUTDOWN:
1824 :
1825 : /*
1826 : * exit with special return code to request shutdown of
1827 : * postmaster. Log messages issued from postmaster.
1828 : */
1829 0 : proc_exit(3);
1830 :
1831 1 : case RECOVERY_TARGET_ACTION_PAUSE:
1832 1 : SetRecoveryPause(true);
1833 1 : recoveryPausesHere(true);
1834 :
1835 : /* drop into promote */
1836 : pg_fallthrough;
1837 :
1838 7 : case RECOVERY_TARGET_ACTION_PROMOTE:
1839 7 : break;
1840 : }
1841 : }
1842 :
1843 155 : RmgrCleanup();
1844 :
1845 155 : ereport(LOG,
1846 : errmsg("redo done at %X/%08X system usage: %s",
1847 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1848 : pg_rusage_show(&ru0)));
1849 155 : xtime = GetLatestXTime();
1850 155 : if (xtime)
1851 38 : ereport(LOG,
1852 : (errmsg("last completed transaction was at log time %s",
1853 : timestamptz_to_str(xtime))));
1854 :
1855 155 : InRedo = false;
1856 : }
1857 : else
1858 : {
1859 : /* there are no WAL records following the checkpoint */
1860 9 : ereport(LOG,
1861 : (errmsg("redo is not required")));
1862 : }
1863 :
1864 : /*
1865 : * This check is intentionally after the above log messages that indicate
1866 : * how far recovery went.
1867 : */
1868 164 : if (ArchiveRecoveryRequested &&
1869 56 : recoveryTarget != RECOVERY_TARGET_UNSET &&
1870 8 : !reachedRecoveryTarget)
1871 1 : ereport(FATAL,
1872 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
1873 : errmsg("recovery ended before configured recovery target was reached")));
1874 163 : }
1875 :
1876 : /*
1877 : * Subroutine of PerformWalRecovery, to apply one WAL record.
1878 : */
1879 : static void
1880 2942753 : ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
1881 : {
1882 : ErrorContextCallback errcallback;
1883 2942753 : bool switchedTLI = false;
1884 :
1885 : /* Setup error traceback support for ereport() */
1886 2942753 : errcallback.callback = rm_redo_error_callback;
1887 2942753 : errcallback.arg = xlogreader;
1888 2942753 : errcallback.previous = error_context_stack;
1889 2942753 : error_context_stack = &errcallback;
1890 :
1891 : /*
1892 : * TransamVariables->nextXid must be beyond record's xid.
1893 : */
1894 2942753 : AdvanceNextFullTransactionIdPastXid(record->xl_xid);
1895 :
1896 : /*
1897 : * Before replaying this record, check if this record causes the current
1898 : * timeline to change. The record is already considered to be part of the
1899 : * new timeline, so we update replayTLI before replaying it. That's
1900 : * important so that replayEndTLI, which is recorded as the minimum
1901 : * recovery point's TLI if recovery stops after this record, is set
1902 : * correctly.
1903 : */
1904 2942753 : if (record->xl_rmid == RM_XLOG_ID)
1905 : {
1906 114829 : TimeLineID newReplayTLI = *replayTLI;
1907 114829 : TimeLineID prevReplayTLI = *replayTLI;
1908 114829 : uint8 info = record->xl_info & ~XLR_INFO_MASK;
1909 :
1910 114829 : if (info == XLOG_CHECKPOINT_SHUTDOWN)
1911 : {
1912 : CheckPoint checkPoint;
1913 :
1914 41 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1915 41 : newReplayTLI = checkPoint.ThisTimeLineID;
1916 41 : prevReplayTLI = checkPoint.PrevTimeLineID;
1917 : }
1918 114788 : else if (info == XLOG_END_OF_RECOVERY)
1919 : {
1920 : xl_end_of_recovery xlrec;
1921 :
1922 10 : memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1923 10 : newReplayTLI = xlrec.ThisTimeLineID;
1924 10 : prevReplayTLI = xlrec.PrevTimeLineID;
1925 : }
1926 :
1927 114829 : if (newReplayTLI != *replayTLI)
1928 : {
1929 : /* Check that it's OK to switch to this TLI */
1930 11 : checkTimeLineSwitch(xlogreader->EndRecPtr,
1931 : newReplayTLI, prevReplayTLI, *replayTLI);
1932 :
1933 : /* Following WAL records should be run with new TLI */
1934 11 : *replayTLI = newReplayTLI;
1935 11 : switchedTLI = true;
1936 : }
1937 : }
1938 :
1939 : /*
1940 : * Update shared replayEndRecPtr before replaying this record, so that
1941 : * XLogFlush will update minRecoveryPoint correctly.
1942 : */
1943 2942753 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1944 2942753 : XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
1945 2942753 : XLogRecoveryCtl->replayEndTLI = *replayTLI;
1946 2942753 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1947 :
1948 : /*
1949 : * If we are attempting to enter Hot Standby mode, process XIDs we see
1950 : */
1951 2942753 : if (standbyState >= STANDBY_INITIALIZED &&
1952 2635134 : TransactionIdIsValid(record->xl_xid))
1953 2576103 : RecordKnownAssignedTransactionIds(record->xl_xid);
1954 :
1955 : /*
1956 : * Some XLOG record types that are related to recovery are processed
1957 : * directly here, rather than in xlog_redo()
1958 : */
1959 2942753 : if (record->xl_rmid == RM_XLOG_ID)
1960 114829 : xlogrecovery_redo(xlogreader, *replayTLI);
1961 :
1962 : /* Now apply the WAL record itself */
1963 2942753 : GetRmgr(record->xl_rmid).rm_redo(xlogreader);
1964 :
1965 : /*
1966 : * After redo, check whether the backup pages associated with the WAL
1967 : * record are consistent with the existing pages. This check is done only
1968 : * if consistency check is enabled for this record.
1969 : */
1970 2942751 : if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
1971 2261562 : verifyBackupPageConsistency(xlogreader);
1972 :
1973 : /* Pop the error context stack */
1974 2942751 : error_context_stack = errcallback.previous;
1975 :
1976 : /*
1977 : * Update lastReplayedEndRecPtr after this record has been successfully
1978 : * replayed.
1979 : */
1980 2942751 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1981 2942751 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
1982 2942751 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
1983 2942751 : XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
1984 2942751 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1985 :
1986 : /* ------
1987 : * Wakeup walsenders:
1988 : *
1989 : * On the standby, the WAL is flushed first (which will only wake up
1990 : * physical walsenders) and then applied, which will only wake up logical
1991 : * walsenders.
1992 : *
1993 : * Indeed, logical walsenders on standby can't decode and send data until
1994 : * it's been applied.
1995 : *
1996 : * Physical walsenders don't need to be woken up during replay unless
1997 : * cascading replication is allowed and time line change occurred (so that
1998 : * they can notice that they are on a new time line).
1999 : *
2000 : * That's why the wake up conditions are for:
2001 : *
2002 : * - physical walsenders in case of new time line and cascade
2003 : * replication is allowed
2004 : * - logical walsenders in case cascade replication is allowed (could not
2005 : * be created otherwise)
2006 : * ------
2007 : */
2008 2942751 : if (AllowCascadeReplication())
2009 2690868 : WalSndWakeup(switchedTLI, true);
2010 :
2011 : /*
2012 : * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2013 : * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2014 : * a reply to the primary.
2015 : */
2016 2942751 : if (doRequestWalReceiverReply)
2017 : {
2018 2 : doRequestWalReceiverReply = false;
2019 2 : WalRcvRequestApplyReply();
2020 : }
2021 :
2022 : /* Allow read-only connections if we're consistent now */
2023 2942751 : CheckRecoveryConsistency();
2024 :
2025 : /* Is this a timeline switch? */
2026 2942751 : if (switchedTLI)
2027 : {
2028 : /*
2029 : * Before we continue on the new timeline, clean up any (possibly
2030 : * bogus) future WAL segments on the old timeline.
2031 : */
2032 11 : RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
2033 :
2034 : /* Reset the prefetcher. */
2035 11 : XLogPrefetchReconfigure();
2036 : }
2037 2942751 : }
2038 :
2039 : /*
2040 : * Some XLOG RM record types that are directly related to WAL recovery are
2041 : * handled here rather than in the xlog_redo()
2042 : */
2043 : static void
2044 114829 : xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
2045 : {
2046 114829 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2047 114829 : XLogRecPtr lsn = record->EndRecPtr;
2048 :
2049 : Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2050 :
2051 114829 : if (info == XLOG_OVERWRITE_CONTRECORD)
2052 : {
2053 : /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2054 : xl_overwrite_contrecord xlrec;
2055 :
2056 1 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2057 1 : if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2058 0 : elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
2059 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2060 : LSN_FORMAT_ARGS(record->overwrittenRecPtr));
2061 :
2062 : /* We have safely skipped the aborted record */
2063 1 : abortedRecPtr = InvalidXLogRecPtr;
2064 1 : missingContrecPtr = InvalidXLogRecPtr;
2065 :
2066 1 : ereport(LOG,
2067 : errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
2068 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2069 : timestamptz_to_str(xlrec.overwrite_time)));
2070 :
2071 : /* Verifying the record should only happen once */
2072 1 : record->overwrittenRecPtr = InvalidXLogRecPtr;
2073 : }
2074 114828 : else if (info == XLOG_BACKUP_END)
2075 : {
2076 : XLogRecPtr startpoint;
2077 :
2078 97 : memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2079 :
2080 97 : if (backupStartPoint == startpoint)
2081 : {
2082 : /*
2083 : * We have reached the end of base backup, the point where
2084 : * pg_backup_stop() was done. The data on disk is now consistent
2085 : * (assuming we have also reached minRecoveryPoint). Set
2086 : * backupEndPoint to the current LSN, so that the next call to
2087 : * CheckRecoveryConsistency() will notice it and do the
2088 : * end-of-backup processing.
2089 : */
2090 80 : elog(DEBUG1, "end of backup record reached");
2091 :
2092 80 : backupEndPoint = lsn;
2093 : }
2094 : else
2095 17 : elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
2096 : LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
2097 : }
2098 114829 : }
2099 :
2100 : /*
2101 : * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2102 : * directories.
2103 : *
2104 : * Replay of database creation XLOG records for databases that were later
2105 : * dropped can create fake directories in pg_tblspc. By the time consistency
2106 : * is reached these directories should have been removed; here we verify
2107 : * that this did indeed happen. This is to be called at the point where
2108 : * consistent state is reached.
2109 : *
2110 : * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2111 : * useful for testing purposes, and also allows for an escape hatch in case
2112 : * things go south.
2113 : */
2114 : static void
2115 125 : CheckTablespaceDirectory(void)
2116 : {
2117 : DIR *dir;
2118 : struct dirent *de;
2119 :
2120 125 : dir = AllocateDir(PG_TBLSPC_DIR);
2121 382 : while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2122 : {
2123 : char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2124 :
2125 : /* Skip entries of non-oid names */
2126 257 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2127 250 : continue;
2128 :
2129 7 : snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2130 :
2131 7 : if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2132 4 : ereport(allow_in_place_tablespaces ? WARNING : PANIC,
2133 : (errcode(ERRCODE_DATA_CORRUPTED),
2134 : errmsg("unexpected directory entry \"%s\" found in %s",
2135 : de->d_name, PG_TBLSPC_DIR),
2136 : errdetail("All directory entries in %s/ should be symbolic links.",
2137 : PG_TBLSPC_DIR),
2138 : errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2139 : }
2140 125 : }
2141 :
2142 : /*
2143 : * Checks if recovery has reached a consistent state. When consistency is
2144 : * reached and we have a valid starting standby snapshot, tell postmaster
2145 : * that it can start accepting read-only connections.
2146 : */
2147 : static void
2148 2942978 : CheckRecoveryConsistency(void)
2149 : {
2150 : XLogRecPtr lastReplayedEndRecPtr;
2151 : TimeLineID lastReplayedTLI;
2152 :
2153 : /*
2154 : * During crash recovery, we don't reach a consistent state until we've
2155 : * replayed all the WAL.
2156 : */
2157 2942978 : if (!XLogRecPtrIsValid(minRecoveryPoint))
2158 322109 : return;
2159 :
2160 : Assert(InArchiveRecovery);
2161 :
2162 : /*
2163 : * assume that we are called in the startup process, and hence don't need
2164 : * a lock to read lastReplayedEndRecPtr
2165 : */
2166 2620869 : lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2167 2620869 : lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2168 :
2169 : /*
2170 : * Have we reached the point where our base backup was completed?
2171 : */
2172 2620869 : if (XLogRecPtrIsValid(backupEndPoint) &&
2173 113 : backupEndPoint <= lastReplayedEndRecPtr)
2174 : {
2175 82 : XLogRecPtr saveBackupStartPoint = backupStartPoint;
2176 82 : XLogRecPtr saveBackupEndPoint = backupEndPoint;
2177 :
2178 82 : elog(DEBUG1, "end of backup reached");
2179 :
2180 : /*
2181 : * We have reached the end of base backup, as indicated by pg_control.
2182 : * Update the control file accordingly.
2183 : */
2184 82 : ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2185 82 : backupStartPoint = InvalidXLogRecPtr;
2186 82 : backupEndPoint = InvalidXLogRecPtr;
2187 82 : backupEndRequired = false;
2188 :
2189 82 : ereport(LOG,
2190 : errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
2191 : LSN_FORMAT_ARGS(saveBackupStartPoint),
2192 : LSN_FORMAT_ARGS(saveBackupEndPoint)));
2193 : }
2194 :
2195 : /*
2196 : * Have we passed our safe starting point? Note that minRecoveryPoint is
2197 : * known to be incorrectly set if recovering from a backup, until the
2198 : * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2199 : * All we know prior to that is that we're not consistent yet.
2200 : */
2201 2620869 : if (!reachedConsistency && !backupEndRequired &&
2202 7850 : minRecoveryPoint <= lastReplayedEndRecPtr)
2203 : {
2204 : /*
2205 : * Check to see if the XLOG sequence contained any unresolved
2206 : * references to uninitialized pages.
2207 : */
2208 125 : XLogCheckInvalidPages();
2209 :
2210 : /*
2211 : * Check that pg_tblspc doesn't contain any real directories. Replay
2212 : * of Database/CREATE_* records may have created fictitious tablespace
2213 : * directories that should have been removed by the time consistency
2214 : * was reached.
2215 : */
2216 125 : CheckTablespaceDirectory();
2217 :
2218 125 : reachedConsistency = true;
2219 125 : SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
2220 125 : ereport(LOG,
2221 : errmsg("consistent recovery state reached at %X/%08X",
2222 : LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
2223 : }
2224 :
2225 : /*
2226 : * Have we got a valid starting snapshot that will allow queries to be
2227 : * run? If so, we can tell postmaster that the database is consistent now,
2228 : * enabling connections.
2229 : */
2230 2620869 : if (standbyState == STANDBY_SNAPSHOT_READY &&
2231 2620621 : !LocalHotStandbyActive &&
2232 116 : reachedConsistency &&
2233 : IsUnderPostmaster)
2234 : {
2235 116 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2236 116 : XLogRecoveryCtl->SharedHotStandbyActive = true;
2237 116 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2238 :
2239 116 : LocalHotStandbyActive = true;
2240 :
2241 116 : SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
2242 : }
2243 : }
2244 :
2245 : /*
2246 : * Error context callback for errors occurring during rm_redo().
2247 : */
2248 : static void
2249 154 : rm_redo_error_callback(void *arg)
2250 : {
2251 154 : XLogReaderState *record = (XLogReaderState *) arg;
2252 : StringInfoData buf;
2253 :
2254 154 : initStringInfo(&buf);
2255 154 : xlog_outdesc(&buf, record);
2256 154 : xlog_block_info(&buf, record);
2257 :
2258 : /* translator: %s is a WAL record description */
2259 154 : errcontext("WAL redo at %X/%08X for %s",
2260 154 : LSN_FORMAT_ARGS(record->ReadRecPtr),
2261 : buf.data);
2262 :
2263 154 : pfree(buf.data);
2264 154 : }
2265 :
2266 : /*
2267 : * Returns a string describing an XLogRecord, consisting of its identity
2268 : * optionally followed by a colon, a space, and a further description.
2269 : */
2270 : void
2271 154 : xlog_outdesc(StringInfo buf, XLogReaderState *record)
2272 : {
2273 154 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2274 154 : uint8 info = XLogRecGetInfo(record);
2275 : const char *id;
2276 :
2277 154 : appendStringInfoString(buf, rmgr.rm_name);
2278 154 : appendStringInfoChar(buf, '/');
2279 :
2280 154 : id = rmgr.rm_identify(info);
2281 154 : if (id == NULL)
2282 0 : appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2283 : else
2284 154 : appendStringInfo(buf, "%s: ", id);
2285 :
2286 154 : rmgr.rm_desc(buf, record);
2287 154 : }
2288 :
#ifdef WAL_DEBUG

/*
 * xlog_outrec -- append header details of a WAL record to 'buf'.
 *
 * Emits the previous-record pointer, the xid and the main data length,
 * followed by the record's block references.  Only compiled in WAL_DEBUG
 * builds.
 */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	/* link to the previous record, and the owning transaction */
	appendStringInfo(buf, "prev %X/%08X; xid %u",
					 LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
					 XLogRecGetXid(record));

	/* length of the record's main data */
	appendStringInfo(buf, "; len %u",
					 XLogRecGetDataLen(record));

	/* one entry per block reference */
	xlog_block_info(buf, record);
}
#endif							/* WAL_DEBUG */
2304 :
2305 : /*
2306 : * Returns a string giving information about all the blocks in an
2307 : * XLogRecord.
2308 : */
2309 : static void
2310 154 : xlog_block_info(StringInfo buf, XLogReaderState *record)
2311 : {
2312 : int block_id;
2313 :
2314 : /* decode block references */
2315 208 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2316 : {
2317 : RelFileLocator rlocator;
2318 : ForkNumber forknum;
2319 : BlockNumber blk;
2320 :
2321 54 : if (!XLogRecGetBlockTagExtended(record, block_id,
2322 : &rlocator, &forknum, &blk, NULL))
2323 0 : continue;
2324 :
2325 54 : if (forknum != MAIN_FORKNUM)
2326 9 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2327 : block_id,
2328 : rlocator.spcOid, rlocator.dbOid,
2329 : rlocator.relNumber,
2330 : forknum,
2331 : blk);
2332 : else
2333 45 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2334 : block_id,
2335 : rlocator.spcOid, rlocator.dbOid,
2336 : rlocator.relNumber,
2337 : blk);
2338 54 : if (XLogRecHasBlockImage(record, block_id))
2339 33 : appendStringInfoString(buf, " FPW");
2340 : }
2341 154 : }
2342 :
2343 :
2344 : /*
2345 : * Check that it's OK to switch to new timeline during recovery.
2346 : *
2347 : * 'lsn' is the address of the shutdown checkpoint record we're about to
2348 : * replay. (Currently, timeline can only change at a shutdown checkpoint).
2349 : */
2350 : static void
2351 11 : checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
2352 : TimeLineID replayTLI)
2353 : {
2354 : /* Check that the record agrees on what the current (old) timeline is */
2355 11 : if (prevTLI != replayTLI)
2356 0 : ereport(PANIC,
2357 : (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2358 : prevTLI, replayTLI)));
2359 :
2360 : /*
2361 : * The new timeline better be in the list of timelines we expect to see,
2362 : * according to the timeline history. It should also not decrease.
2363 : */
2364 11 : if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2365 0 : ereport(PANIC,
2366 : (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2367 : newTLI, replayTLI)));
2368 :
2369 : /*
2370 : * If we have not yet reached min recovery point, and we're about to
2371 : * switch to a timeline greater than the timeline of the min recovery
2372 : * point: trouble. After switching to the new timeline, we could not
2373 : * possibly visit the min recovery point on the correct timeline anymore.
2374 : * This can happen if there is a newer timeline in the archive that
2375 : * branched before the timeline the min recovery point is on, and you
2376 : * attempt to do PITR to the new timeline.
2377 : */
2378 11 : if (XLogRecPtrIsValid(minRecoveryPoint) &&
2379 10 : lsn < minRecoveryPoint &&
2380 1 : newTLI > minRecoveryPointTLI)
2381 0 : ereport(PANIC,
2382 : errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
2383 : newTLI,
2384 : LSN_FORMAT_ARGS(minRecoveryPoint),
2385 : minRecoveryPointTLI));
2386 :
2387 : /* Looks good */
2388 11 : }
2389 :
2390 :
2391 : /*
2392 : * Extract timestamp from WAL record.
2393 : *
2394 : * If the record contains a timestamp, returns true, and saves the timestamp
2395 : * in *recordXtime. If the record type has no timestamp, returns false.
2396 : * Currently, only transaction commit/abort records and restore points contain
2397 : * timestamps.
2398 : */
2399 : static bool
2400 47130 : getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
2401 : {
2402 47130 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2403 47130 : uint8 xact_info = info & XLOG_XACT_OPMASK;
2404 47130 : uint8 rmid = XLogRecGetRmid(record);
2405 :
2406 47130 : if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2407 : {
2408 2 : *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2409 2 : return true;
2410 : }
2411 47128 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2412 : xact_info == XLOG_XACT_COMMIT_PREPARED))
2413 : {
2414 43164 : *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2415 43164 : return true;
2416 : }
2417 3964 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2418 : xact_info == XLOG_XACT_ABORT_PREPARED))
2419 : {
2420 3964 : *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2421 3964 : return true;
2422 : }
2423 0 : return false;
2424 : }
2425 :
2426 : /*
2427 : * Checks whether the current buffer page and backup page stored in the
2428 : * WAL record are consistent or not. Before comparing the two pages, a
2429 : * masking can be applied to the pages to ignore certain areas like hint bits,
2430 : * unused space between pd_lower and pd_upper among other things. This
2431 : * function should be called once WAL replay has been completed for a
2432 : * given record.
2433 : */
2434 : static void
2435 2261562 : verifyBackupPageConsistency(XLogReaderState *record)
2436 : {
2437 2261562 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2438 : RelFileLocator rlocator;
2439 : ForkNumber forknum;
2440 : BlockNumber blkno;
2441 : int block_id;
2442 :
2443 : /* Records with no backup blocks have no need for consistency checks. */
2444 2261562 : if (!XLogRecHasAnyBlockRefs(record))
2445 79 : return;
2446 :
2447 : Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2448 :
2449 4695632 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2450 : {
2451 : Buffer buf;
2452 : Page page;
2453 :
2454 2434149 : if (!XLogRecGetBlockTagExtended(record, block_id,
2455 : &rlocator, &forknum, &blkno, NULL))
2456 : {
2457 : /*
2458 : * WAL record doesn't contain a block reference with the given id.
2459 : * Do nothing.
2460 : */
2461 2149 : continue;
2462 : }
2463 :
2464 : Assert(XLogRecHasBlockImage(record, block_id));
2465 :
2466 2432000 : if (XLogRecBlockImageApply(record, block_id))
2467 : {
2468 : /*
2469 : * WAL record has already applied the page, so bypass the
2470 : * consistency check as that would result in comparing the full
2471 : * page stored in the record with itself.
2472 : */
2473 29699 : continue;
2474 : }
2475 :
2476 : /*
2477 : * Read the contents from the current buffer and store it in a
2478 : * temporary page.
2479 : */
2480 2402301 : buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2481 : RBM_NORMAL_NO_LOG,
2482 : InvalidBuffer);
2483 2402301 : if (!BufferIsValid(buf))
2484 0 : continue;
2485 :
2486 2402301 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2487 2402301 : page = BufferGetPage(buf);
2488 :
2489 : /*
2490 : * Take a copy of the local page where WAL has been applied to have a
2491 : * comparison base before masking it...
2492 : */
2493 2402301 : memcpy(replay_image_masked, page, BLCKSZ);
2494 :
2495 : /* No need for this page anymore now that a copy is in. */
2496 2402301 : UnlockReleaseBuffer(buf);
2497 :
2498 : /*
2499 : * If the block LSN is already ahead of this WAL record, we can't
2500 : * expect contents to match. This can happen if recovery is
2501 : * restarted.
2502 : */
2503 2402301 : if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2504 0 : continue;
2505 :
2506 : /*
2507 : * Read the contents from the backup copy, stored in WAL record and
2508 : * store it in a temporary page. There is no need to allocate a new
2509 : * page here, a local buffer is fine to hold its contents and a mask
2510 : * can be directly applied on it.
2511 : */
2512 2402301 : if (!RestoreBlockImage(record, block_id, primary_image_masked))
2513 0 : ereport(ERROR,
2514 : (errcode(ERRCODE_INTERNAL_ERROR),
2515 : errmsg_internal("%s", record->errormsg_buf)));
2516 :
2517 : /*
2518 : * If masking function is defined, mask both the primary and replay
2519 : * images
2520 : */
2521 2402301 : if (rmgr.rm_mask != NULL)
2522 : {
2523 2402301 : rmgr.rm_mask(replay_image_masked, blkno);
2524 2402301 : rmgr.rm_mask(primary_image_masked, blkno);
2525 : }
2526 :
2527 : /* Time to compare the primary and replay images. */
2528 2402301 : if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2529 : {
2530 0 : elog(FATAL,
2531 : "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2532 : rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2533 : forknum, blkno);
2534 : }
2535 : }
2536 : }
2537 :
2538 : /*
2539 : * For point-in-time recovery, this function decides whether we want to
2540 : * stop applying the XLOG before the current record (immediate target,
2541 : * exclusive LSN or XID target, or time target).
2542 : * Returns true if we are stopping, false otherwise. If stopping, some
2543 : * information is saved in recoveryStopXid et al for use in annotating the
2544 : * new timeline's history file, and a LOG message is emitted.
2545 : */
2546 : static bool
2547 2942755 : recoveryStopsBefore(XLogReaderState *record)
2548 : {
2549 2942755 : bool stopsHere = false;
2550 : uint8 xact_info;
2551 : bool isCommit;
2552 2942755 : TimestampTz recordXtime = 0;
2553 : TransactionId recordXid;
2554 :
2555 : /*
2556 : * Ignore recovery target settings when not in archive recovery (meaning
2557 : * we are in crash recovery).
2558 : */
2559 2942755 : if (!ArchiveRecoveryRequested)
2560 307605 : return false;
2561 :
2562 : /* Check if we should stop as soon as reaching consistency */
2563 2635150 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2564 : {
2565 0 : ereport(LOG,
2566 : (errmsg("recovery stopping after reaching consistency")));
2567 :
2568 0 : recoveryStopAfter = false;
2569 0 : recoveryStopXid = InvalidTransactionId;
2570 0 : recoveryStopLSN = InvalidXLogRecPtr;
2571 0 : recoveryStopTime = 0;
2572 0 : recoveryStopName[0] = '\0';
2573 0 : return true;
2574 : }
2575 :
2576 : /* Check if an exclusive (non-inclusive) target LSN has been reached */
2577 2635150 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2578 8553 : !recoveryTargetInclusive &&
2579 481 : record->ReadRecPtr >= recoveryTargetLSN)
2580 : {
2581 2 : recoveryStopAfter = false;
2582 2 : recoveryStopXid = InvalidTransactionId;
2583 2 : recoveryStopLSN = record->ReadRecPtr;
2584 2 : recoveryStopTime = 0;
2585 2 : recoveryStopName[0] = '\0';
2586 2 : ereport(LOG,
2587 : errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
2588 : LSN_FORMAT_ARGS(recoveryStopLSN)));
2589 2 : return true;
2590 : }
2591 :
2592 : /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2593 2635148 : if (XLogRecGetRmid(record) != RM_XACT_ID)
2594 2611297 : return false;
2595 :
2596 23851 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2597 :
2598 23851 : if (xact_info == XLOG_XACT_COMMIT)
2599 : {
2600 21542 : isCommit = true;
2601 21542 : recordXid = XLogRecGetXid(record);
2602 : }
2603 2309 : else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2604 : {
2605 26 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2606 : xl_xact_parsed_commit parsed;
2607 :
2608 26 : isCommit = true;
2609 26 : ParseCommitRecord(XLogRecGetInfo(record),
2610 : xlrec,
2611 : &parsed);
2612 26 : recordXid = parsed.twophase_xid;
2613 : }
2614 2283 : else if (xact_info == XLOG_XACT_ABORT)
2615 : {
2616 1967 : isCommit = false;
2617 1967 : recordXid = XLogRecGetXid(record);
2618 : }
2619 316 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2620 : {
2621 15 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2622 : xl_xact_parsed_abort parsed;
2623 :
2624 15 : isCommit = false;
2625 15 : ParseAbortRecord(XLogRecGetInfo(record),
2626 : xlrec,
2627 : &parsed);
2628 15 : recordXid = parsed.twophase_xid;
2629 : }
2630 : else
2631 301 : return false;
2632 :
2633 23550 : if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
2634 : {
2635 : /*
2636 : * There can be only one transaction end record with this exact
2637 : * transactionid.
2638 : *
2639 : * When testing for an xid, we MUST test for equality only, since
2640 : * transactions are numbered in the order they start, not the order
2641 : * they complete. A higher numbered xid will complete before the
2642 : * target xid about 50% of the time...
2643 : */
2644 0 : stopsHere = (recordXid == recoveryTargetXid);
2645 : }
2646 :
2647 : /*
2648 : * Note: we must fetch recordXtime regardless of recoveryTarget setting
2649 : * (it is also stored in recoveryStopTime if we stop below). We don't expect
2650 : * getRecordTimestamp ever to fail for a commit or abort record; but test its result anyway.
2651 : */
2652 23550 : if (getRecordTimestamp(record, &recordXtime) &&
2653 23550 : recoveryTarget == RECOVERY_TARGET_TIME)
2654 : {
2655 : /*
2656 : * There can be many transactions that share the same commit time. If
2657 : * inclusive, stop only once a record is later than the target time; if
2658 : * exclusive, stop before the first record at or after the target time.
2659 : */
2660 0 : if (recoveryTargetInclusive)
2661 0 : stopsHere = (recordXtime > recoveryTargetTime);
2662 : else
2663 0 : stopsHere = (recordXtime >= recoveryTargetTime);
2664 : }
2665 :
2666 23550 : if (stopsHere)
2667 : {
2668 0 : recoveryStopAfter = false;
2669 0 : recoveryStopXid = recordXid;
2670 0 : recoveryStopTime = recordXtime;
2671 0 : recoveryStopLSN = InvalidXLogRecPtr;
2672 0 : recoveryStopName[0] = '\0';
2673 :
2674 0 : if (isCommit)
2675 : {
2676 0 : ereport(LOG,
2677 : (errmsg("recovery stopping before commit of transaction %u, time %s",
2678 : recoveryStopXid,
2679 : timestamptz_to_str(recoveryStopTime))));
2680 : }
2681 : else
2682 : {
2683 0 : ereport(LOG,
2684 : (errmsg("recovery stopping before abort of transaction %u, time %s",
2685 : recoveryStopXid,
2686 : timestamptz_to_str(recoveryStopTime))));
2687 : }
2688 : }
2689 :
2690 23550 : return stopsHere;
2691 : }
2692 :
2693 : /*
2694 : * Same as recoveryStopsBefore, but called after applying the record. This
2695 : * covers restore-point, inclusive LSN, inclusive XID and immediate targets.
2696 : * We also track the timestamp of the latest applied COMMIT/ABORT
2697 : * record in XLogRecoveryCtl->recoveryLastXTime.
2698 : */
2699 : static bool
2700 2942751 : recoveryStopsAfter(XLogReaderState *record)
2701 : {
2702 : uint8 info;
2703 : uint8 xact_info;
2704 : uint8 rmid;
2705 2942751 : TimestampTz recordXtime = 0;
2706 :
2707 : /*
2708 : * Ignore recovery target settings when not in archive recovery (meaning
2709 : * we are in crash recovery).
2710 : */
2711 2942751 : if (!ArchiveRecoveryRequested)
2712 307605 : return false;
2713 :
2714 2635146 : info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2715 2635146 : rmid = XLogRecGetRmid(record);
2716 :
2717 : /*
2718 : * There can be many restore points that share the same name; we stop at
2719 : * the first one.
2720 : */
2721 2635146 : if (recoveryTarget == RECOVERY_TARGET_NAME &&
2722 20 : rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2723 : {
2724 : xl_restore_point *recordRestorePointData;
2725 :
2726 3 : recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2727 :
2728 3 : if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2729 : {
2730 2 : recoveryStopAfter = true;
2731 2 : recoveryStopXid = InvalidTransactionId;
2732 2 : recoveryStopLSN = InvalidXLogRecPtr;
2733 2 : (void) getRecordTimestamp(record, &recoveryStopTime);
2734 2 : strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2735 :
2736 2 : ereport(LOG,
2737 : (errmsg("recovery stopping at restore point \"%s\", time %s",
2738 : recoveryStopName,
2739 : timestamptz_to_str(recoveryStopTime))));
2740 2 : return true;
2741 : }
2742 : }
2743 :
2744 : /* Check if an inclusive target LSN has been reached */
2745 2635144 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2746 8072 : recoveryTargetInclusive &&
2747 8072 : record->ReadRecPtr >= recoveryTargetLSN)
2748 : {
2749 3 : recoveryStopAfter = true;
2750 3 : recoveryStopXid = InvalidTransactionId;
2751 3 : recoveryStopLSN = record->ReadRecPtr;
2752 3 : recoveryStopTime = 0;
2753 3 : recoveryStopName[0] = '\0';
2754 3 : ereport(LOG,
2755 : errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
2756 : LSN_FORMAT_ARGS(recoveryStopLSN)));
2757 3 : return true;
2758 : }
2759 :
2760 2635141 : if (rmid != RM_XACT_ID)
2761 2611292 : return false;
2762 :
2763 23849 : xact_info = info & XLOG_XACT_OPMASK;
2764 :
2765 23849 : if (xact_info == XLOG_XACT_COMMIT ||
2766 2283 : xact_info == XLOG_XACT_COMMIT_PREPARED ||
2767 316 : xact_info == XLOG_XACT_ABORT ||
2768 : xact_info == XLOG_XACT_ABORT_PREPARED)
2769 : {
2770 : TransactionId recordXid;
2771 :
2772 : /* Update the last applied transaction timestamp */
2773 23548 : if (getRecordTimestamp(record, &recordXtime))
2774 23548 : SetLatestXTime(recordXtime);
2775 :
2776 : /* Extract the XID of the committed/aborted transaction */
2777 23548 : if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2778 : {
2779 26 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2780 : xl_xact_parsed_commit parsed;
2781 :
2782 26 : ParseCommitRecord(XLogRecGetInfo(record),
2783 : xlrec,
2784 : &parsed);
2785 26 : recordXid = parsed.twophase_xid;
2786 : }
2787 23522 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2788 : {
2789 15 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2790 : xl_xact_parsed_abort parsed;
2791 :
2792 15 : ParseAbortRecord(XLogRecGetInfo(record),
2793 : xlrec,
2794 : &parsed);
2795 15 : recordXid = parsed.twophase_xid;
2796 : }
2797 : else
2798 23507 : recordXid = XLogRecGetXid(record);
2799 :
2800 : /*
2801 : * There can be only one transaction end record with this exact
2802 : * transactionid.
2803 : *
2804 : * When testing for an xid, we MUST test for equality only, since
2805 : * transactions are numbered in the order they start, not the order
2806 : * they complete. A higher numbered xid will complete before the
2807 : * target xid about 50% of the time...
2808 : */
2809 23548 : if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
2810 0 : recordXid == recoveryTargetXid)
2811 : {
2812 0 : recoveryStopAfter = true;
2813 0 : recoveryStopXid = recordXid;
2814 0 : recoveryStopTime = recordXtime;
2815 0 : recoveryStopLSN = InvalidXLogRecPtr;
2816 0 : recoveryStopName[0] = '\0';
2817 :
2818 0 : if (xact_info == XLOG_XACT_COMMIT ||
2819 : xact_info == XLOG_XACT_COMMIT_PREPARED)
2820 : {
2821 0 : ereport(LOG,
2822 : (errmsg("recovery stopping after commit of transaction %u, time %s",
2823 : recoveryStopXid,
2824 : timestamptz_to_str(recoveryStopTime))));
2825 : }
2826 0 : else if (xact_info == XLOG_XACT_ABORT ||
2827 : xact_info == XLOG_XACT_ABORT_PREPARED)
2828 : {
2829 0 : ereport(LOG,
2830 : (errmsg("recovery stopping after abort of transaction %u, time %s",
2831 : recoveryStopXid,
2832 : timestamptz_to_str(recoveryStopTime))));
2833 : }
2834 0 : return true;
2835 : }
2836 : }
2837 :
2838 : /* Check if we should stop as soon as reaching consistency (only reached for RM_XACT_ID records, given the early return above) */
2839 23849 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2840 : {
2841 0 : ereport(LOG,
2842 : (errmsg("recovery stopping after reaching consistency")));
2843 :
2844 0 : recoveryStopAfter = true;
2845 0 : recoveryStopXid = InvalidTransactionId;
2846 0 : recoveryStopTime = 0;
2847 0 : recoveryStopLSN = InvalidXLogRecPtr;
2848 0 : recoveryStopName[0] = '\0';
2849 0 : return true;
2850 : }
2851 :
2852 23849 : return false;
2853 : }
2855 : /*
2856 : * Create a comment for the history file to explain why and where timeline changed;
2857 : * returns a pstrdup'd copy. NOTE(review): only the time and LSN formats end in a newline - confirm this is intended.
2858 : */
2859 : static char *
2860 1011 : getRecoveryStopReason(void)
2861 : {
2862 : char reason[200];
2863 :
2864 1011 : if (recoveryTarget == RECOVERY_TARGET_XID)
2865 0 : snprintf(reason, sizeof(reason),
2866 : "%s transaction %u",
2867 0 : recoveryStopAfter ? "after" : "before",
2868 : recoveryStopXid);
2869 1011 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
2870 0 : snprintf(reason, sizeof(reason),
2871 : "%s %s\n",
2872 0 : recoveryStopAfter ? "after" : "before",
2873 : timestamptz_to_str(recoveryStopTime));
2874 1011 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
2875 6 : snprintf(reason, sizeof(reason),
2876 : "%s LSN %X/%08X\n",
2877 6 : recoveryStopAfter ? "after" : "before",
2878 6 : LSN_FORMAT_ARGS(recoveryStopLSN));
2879 1005 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
2880 3 : snprintf(reason, sizeof(reason),
2881 : "at restore point \"%s\"",
2882 : recoveryStopName);
2883 1002 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
2884 0 : snprintf(reason, sizeof(reason), "reached consistency");
2885 : else
2886 1002 : snprintf(reason, sizeof(reason), "no recovery target specified");
2887 :
2888 1011 : return pstrdup(reason);
2889 : }
2890 :
2891 : /*
2892 : * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2893 : * Returns without waiting if hot standby is not active or promotion has been triggered.
2894 : * endOfRecovery is true if the recovery target is reached and
2895 : * the paused state starts at the end of recovery because of
2896 : * recovery_target_action=pause, and false otherwise; here it only selects the LOG text.
2897 : */
2898 : static void
2899 4 : recoveryPausesHere(bool endOfRecovery)
2900 : {
2901 : /* Don't pause unless users can connect! */
2902 4 : if (!LocalHotStandbyActive)
2903 0 : return;
2904 :
2905 : /* Don't pause after standby promotion has been triggered */
2906 4 : if (LocalPromoteIsTriggered)
2907 0 : return;
2908 :
2909 4 : if (endOfRecovery)
2910 1 : ereport(LOG,
2911 : (errmsg("pausing at the end of recovery"),
2912 : errhint("Execute pg_wal_replay_resume() to promote.")));
2913 : else
2914 3 : ereport(LOG,
2915 : (errmsg("recovery has paused"),
2916 : errhint("Execute pg_wal_replay_resume() to continue.")));
2917 :
2918 : /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2919 12 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
2920 : {
2921 10 : ProcessStartupProcInterrupts();
2922 10 : if (CheckForStandbyTrigger())
2923 2 : return;
2924 :
2925 : /*
2926 : * If recovery pause is requested then set it paused. While we are in
2927 : * the loop, the user might resume and pause again, so set this every time.
2928 : */
2929 8 : ConfirmRecoveryPaused();
2930 :
2931 : /*
2932 : * We wait on a condition variable that will wake us as soon as the
2933 : * pause ends, but we use a timeout so we can check the above exit
2934 : * condition periodically too.
2935 : */
2936 8 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
2937 : WAIT_EVENT_RECOVERY_PAUSE);
2938 : }
2939 2 : ConditionVariableCancelSleep();
2940 : }
2941 :
2942 : /*
2943 : * When recovery_min_apply_delay is set, we wait long enough to make sure
2944 : * certain record types are applied at least that interval behind the primary.
2945 : *
2946 : * Returns true if we waited (only COMMIT and COMMIT PREPARED records are delayed).
2947 : *
2948 : * Note that the delay is calculated between the WAL record log time and
2949 : * the current time on standby. We would prefer to keep track of when this
2950 : * standby received each WAL record, which would allow a more consistent
2951 : * approach and one not affected by time synchronisation issues, but that
2952 : * is significantly more effort and complexity for little actual gain in
2953 : * usability.
2954 : */
2955 : static bool
2956 2942753 : recoveryApplyDelay(XLogReaderState *record)
2957 : {
2958 : uint8 xact_info;
2959 : TimestampTz xtime;
2960 : TimestampTz delayUntil;
2961 : long msecs;
2962 :
2963 : /* nothing to do if no delay configured */
2964 2942753 : if (recovery_min_apply_delay <= 0)
2965 2942600 : return false;
2966 :
2967 : /* no delay is applied on a database not yet consistent */
2968 153 : if (!reachedConsistency)
2969 4 : return false;
2970 :
2971 : /* nothing to do in crash recovery (archive recovery not requested) */
2972 149 : if (!ArchiveRecoveryRequested)
2973 0 : return false;
2974 :
2975 : /*
2976 : * Is it a COMMIT record?
2977 : *
2978 : * We deliberately choose not to delay aborts since they have no effect on
2979 : * MVCC. We already allow replay of records that don't have a timestamp,
2980 : * so there is already opportunity for issues caused by early conflicts on
2981 : * standbys.
2982 : */
2983 149 : if (XLogRecGetRmid(record) != RM_XACT_ID)
2984 119 : return false;
2985 :
2986 30 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2987 :
2988 30 : if (xact_info != XLOG_XACT_COMMIT &&
2989 : xact_info != XLOG_XACT_COMMIT_PREPARED)
2990 0 : return false;
2991 :
2992 30 : if (!getRecordTimestamp(record, &xtime))
2993 0 : return false;
2994 :
2995 30 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
2996 :
2997 : /*
2998 : * Exit without arming the latch if it's already past time to apply this
2999 : * record.
3000 : */
3001 30 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
3002 30 : if (msecs <= 0)
3003 0 : return false;
3004 :
3005 : while (true)
3006 : {
3007 103 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3008 :
3009 : /* This might change recovery_min_apply_delay. */
3010 103 : ProcessStartupProcInterrupts();
3011 :
3012 103 : if (CheckForStandbyTrigger())
3013 0 : break;
3014 :
3015 : /*
3016 : * Recalculate delayUntil as recovery_min_apply_delay could have
3017 : * changed while waiting in this loop.
3018 : */
3019 103 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3020 :
3021 : /*
3022 : * Wait for the difference between GetCurrentTimestamp() and delayUntil.
3023 : */
3024 103 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
3025 : delayUntil);
3026 :
3027 103 : if (msecs <= 0)
3028 30 : break;
3029 :
3030 73 : elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3031 :
3032 73 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3033 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
3034 : msecs,
3035 : WAIT_EVENT_RECOVERY_APPLY_DELAY);
3036 : }
3037 30 : return true;
3038 : }
3039 :
3040 : /*
3041 : * Get the current state of the recovery pause request, read while holding the info_lck spinlock.
3042 : */
3043 : RecoveryPauseState
3044 18 : GetRecoveryPauseState(void)
3045 : {
3046 : RecoveryPauseState state;
3047 :
3048 18 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3049 18 : state = XLogRecoveryCtl->recoveryPauseState;
3050 18 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3051 :
3052 18 : return state;
3053 : }
3054 :
3055 : /*
3056 : * Set the recovery pause state.
3057 : *
3058 : * If recovery pause is requested, move the state from 'not paused' to
3059 : * 'pause requested' (an already requested or confirmed pause is left alone).
3060 : * Otherwise, set it to 'not paused' to resume recovery and wake up waiters on
3061 : * recoveryNotPausedCV. The pause itself is confirmed by ConfirmRecoveryPaused().
3062 : */
3063 : void
3064 56 : SetRecoveryPause(bool recoveryPause)
3065 : {
3066 56 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3067 :
3068 56 : if (!recoveryPause)
3069 51 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
3070 5 : else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
3071 5 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
3072 :
3073 56 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3074 :
3075 56 : if (!recoveryPause)
3076 51 : ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
3077 56 : }
3078 :
3079 : /*
3080 : * Confirm the recovery pause by moving the recovery pause state from
3081 : * RECOVERY_PAUSE_REQUESTED to RECOVERY_PAUSED; any other state is left unchanged.
3082 : */
3083 : static void
3084 8 : ConfirmRecoveryPaused(void)
3085 : {
3086 : /* If recovery pause is requested then set it paused */
3087 8 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3088 8 : if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
3089 4 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
3090 8 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3091 8 : }
3092 :
3093 :
3094 : /*
3095 : * Attempt to read the next XLOG record.
3096 : *
3097 : * Before first call, the reader needs to be positioned to the first record
3098 : * by calling XLogPrefetcherBeginRead().
3099 : *
3100 : * If no valid record is available, returns NULL, or fails if emode is PANIC
3101 : * (emode must be either PANIC or LOG). In standby mode, retries until a valid
3102 : * record is available or CheckForStandbyTrigger() reports a trigger.
3103 : */
3104 : static XLogRecord *
3105 2945188 : ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
3106 : bool fetching_ckpt, TimeLineID replayTLI)
3107 : {
3108 : XLogRecord *record;
3109 2945188 : XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher);
3110 2945188 : XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3111 :
3112 : Assert(AmStartupProcess() || !IsUnderPostmaster);
3113 :
3114 : /* Pass through parameters to XLogPageRead */
3115 2945188 : private->fetching_ckpt = fetching_ckpt;
3116 2945188 : private->emode = emode;
3117 2945188 : private->randAccess = !XLogRecPtrIsValid(xlogreader->ReadRecPtr);
3118 2945188 : private->replayTLI = replayTLI;
3119 :
3120 : /* This is the first attempt to read this page. */
3121 2945188 : lastSourceFailed = false;
3122 :
3123 : for (;;)
3124 142 : {
3125 : char *errormsg;
3126 :
3127 2945330 : record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3128 2945270 : if (record == NULL)
3129 : {
3130 : /*
3131 : * When we find that WAL ends in an incomplete record, keep track
3132 : * of that record. After recovery is done, we'll write a record
3133 : * to indicate to downstream WAL readers that the incomplete portion
3134 : * is to be ignored.
3135 : *
3136 : * However, when ArchiveRecoveryRequested = true, we're going to
3137 : * switch to a new timeline at the end of recovery. We will only
3138 : * copy WAL over to the new timeline up to the end of the last
3139 : * complete record, so if we did this, we would later create an
3140 : * overwrite contrecord in the wrong place, breaking everything.
3141 : */
3142 301 : if (!ArchiveRecoveryRequested &&
3143 110 : XLogRecPtrIsValid(xlogreader->abortedRecPtr))
3144 : {
3145 11 : abortedRecPtr = xlogreader->abortedRecPtr;
3146 11 : missingContrecPtr = xlogreader->missingContrecPtr;
3147 : }
3148 :
3149 301 : if (readFile >= 0)
3150 : {
3151 274 : close(readFile);
3152 274 : readFile = -1;
3153 : }
3154 :
3155 : /*
3156 : * We only end up here without a message when XLogPageRead()
3157 : * failed - in that case we already logged something. In
3158 : * StandbyMode that only happens if we have been triggered, so we
3159 : * shouldn't loop anymore in that case.
3160 : */
3161 301 : if (errormsg)
3162 274 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3163 : (errmsg_internal("%s", errormsg) /* already translated */ ));
3164 : }
3165 :
3166 : /*
3167 : * Check page TLI is one of the expected values; if not, report it and treat the record as invalid.
3168 : */
3169 2944969 : else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3170 : {
3171 : char fname[MAXFNAMELEN];
3172 : XLogSegNo segno;
3173 : int32 offset;
3174 :
3175 0 : XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
3176 0 : offset = XLogSegmentOffset(xlogreader->latestPagePtr,
3177 : wal_segment_size);
3178 0 : XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3179 : wal_segment_size);
3180 0 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3181 : errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
3182 : xlogreader->latestPageTLI,
3183 : fname,
3184 : LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
3185 : offset));
3186 0 : record = NULL;
3187 : }
3188 :
3189 2945270 : if (record)
3190 : {
3191 : /* Great, got a record */
3192 2945128 : return record;
3193 : }
3194 : else
3195 : {
3196 : /* No valid record available from this source */
3197 301 : lastSourceFailed = true;
3198 :
3199 : /*
3200 : * If archive recovery was requested, but we were still doing
3201 : * crash recovery, switch to archive recovery and retry using the
3202 : * offline archive. We have now replayed all the valid WAL in
3203 : * pg_wal, so we are presumably now consistent.
3204 : *
3205 : * We require that there's at least some valid WAL present in
3206 : * pg_wal, however (!fetching_ckpt). We could recover using the
3207 : * WAL from the archive, even if pg_wal is completely empty, but
3208 : * we'd have no idea how far we'd have to replay to reach
3209 : * consistency. So err on the safe side and give up.
3210 : */
3211 301 : if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3212 1 : !fetching_ckpt)
3213 : {
3214 1 : ereport(DEBUG1,
3215 : (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3216 1 : InArchiveRecovery = true;
3217 1 : if (StandbyModeRequested)
3218 1 : EnableStandbyMode();
3219 :
3220 1 : SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
3221 1 : minRecoveryPoint = xlogreader->EndRecPtr;
3222 1 : minRecoveryPointTLI = replayTLI;
3223 :
3224 1 : CheckRecoveryConsistency();
3225 :
3226 : /*
3227 : * Before we retry, reset lastSourceFailed and currentSource
3228 : * so that we will check the archive next.
3229 : */
3230 1 : lastSourceFailed = false;
3231 1 : currentSource = XLOG_FROM_ANY;
3232 :
3233 142 : continue;
3234 : }
3235 :
3236 : /* In standby mode, loop back to retry. Otherwise, give up. */
3237 300 : if (StandbyMode && !CheckForStandbyTrigger())
3238 141 : continue;
3239 : else
3240 159 : return NULL;
3241 : }
3242 : }
3243 : }
3244 :
3245 : /*
3246 : * Read the XLOG page containing targetPagePtr into readBuf (if not read
3247 : * already). Returns number of bytes read, if the page is read successfully,
3248 : * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3249 : * but only if they have not been previously reported.
3250 : *
3251 : * See XLogReaderRoutine.page_read for more details.
3252 : *
3253 : * While prefetching, xlogreader->nonblocking may be set. In that case,
3254 : * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3255 : *
3256 : * This is responsible for restoring files from archive as needed, as well
3257 : * as for waiting for the requested WAL record to arrive in standby mode.
3258 : *
3259 : * xlogreader->private_data->emode specifies the log level used for reporting
3260 : * "file not found" or "end of WAL" situations in archive recovery, or in
3261 : * standby mode when promotion is triggered. If set to WARNING or below,
3262 : * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3263 : * levels the ereport() won't return.
3264 : *
3265 : * In standby mode, if after a successful return of XLogPageRead() the
3266 : * caller finds the record it's interested in to be broken, it should
3267 : * ereport the error with the level determined by
3268 : * emode_for_corrupt_record(), and then set lastSourceFailed
3269 : * and call XLogPageRead() again with the same arguments. This lets
3270 : * XLogPageRead() try fetching the record from another source, or
3271 : * sleep and retry.
3272 : */
3273 : static int
3274 1512530 : XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
3275 : XLogRecPtr targetRecPtr, char *readBuf)
3276 : {
3277 1512530 : XLogPageReadPrivate *private =
3278 : (XLogPageReadPrivate *) xlogreader->private_data;
3279 1512530 : int emode = private->emode;
3280 : uint32 targetPageOff;
3281 : XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3282 : int r;
3283 : instr_time io_start;
3284 :
3285 : Assert(AmStartupProcess() || !IsUnderPostmaster);
3286 :
3287 1512530 : XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3288 1512530 : targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3289 :
3290 : /*
3291 : * See if we need to switch to a new segment because the requested record
3292 : * is not in the currently open one.
3293 : */
3294 1512530 : if (readFile >= 0 &&
3295 1510712 : !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3296 : {
3297 : /*
3298 : * Request a restartpoint if we've replayed too much xlog since the
3299 : * last one.
3300 : */
3301 1484 : if (ArchiveRecoveryRequested && IsUnderPostmaster)
3302 : {
3303 1468 : if (XLogCheckpointNeeded(readSegNo))
3304 : {
3305 1348 : (void) GetRedoRecPtr();
3306 1348 : if (XLogCheckpointNeeded(readSegNo))
3307 1342 : RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
3308 : }
3309 : }
3310 :
3311 1484 : close(readFile);
3312 1484 : readFile = -1;
3313 1484 : readSource = XLOG_FROM_ANY;
3314 : }
3315 :
3316 1512530 : XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3317 :
3318 1512533 : retry:
3319 : /* See if we need to retrieve more data */
3320 1512533 : if (readFile < 0 ||
3321 1509228 : (readSource == XLOG_FROM_STREAM &&
3322 1495452 : flushedUpto < targetPagePtr + reqLen))
3323 : {
3324 33485 : if (readFile >= 0 &&
3325 30180 : xlogreader->nonblocking &&
3326 14944 : readSource == XLOG_FROM_STREAM &&
3327 14944 : flushedUpto < targetPagePtr + reqLen)
3328 14944 : return XLREAD_WOULDBLOCK;
3329 :
3330 18481 : switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3331 18541 : private->randAccess,
3332 18541 : private->fetching_ckpt,
3333 : targetRecPtr,
3334 : private->replayTLI,
3335 : xlogreader->EndRecPtr,
3336 18541 : xlogreader->nonblocking))
3337 : {
3338 417 : case XLREAD_WOULDBLOCK:
3339 417 : return XLREAD_WOULDBLOCK;
3340 52 : case XLREAD_FAIL:
3341 52 : if (readFile >= 0)
3342 0 : close(readFile);
3343 52 : readFile = -1;
3344 52 : readLen = 0;
3345 52 : readSource = XLOG_FROM_ANY;
3346 52 : return XLREAD_FAIL;
3347 18012 : case XLREAD_SUCCESS:
3348 18012 : break;
3349 : }
3350 : }
3351 :
3352 : /*
3353 : * At this point, we have the right segment open and if we're streaming we
3354 : * know the requested record is in it.
3355 : */
3356 : Assert(readFile != -1);
3357 :
3358 : /*
3359 : * If the current segment is being streamed from the primary, calculate
3360 : * how much of the current page we have received already. We know the
3361 : * requested record has been received, but this is for the benefit of
3362 : * future calls, to allow quick exit at the top of this function.
3363 : */
3364 1497060 : if (readSource == XLOG_FROM_STREAM)
3365 : {
3366 1481560 : if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3367 1470472 : readLen = XLOG_BLCKSZ;
3368 : else
3369 11088 : readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
3370 : targetPageOff;
3371 : }
3372 : else
3373 15500 : readLen = XLOG_BLCKSZ;
3374 :
3375 : /* Read the requested page */
3376 1497060 : readOff = targetPageOff;
3377 :
3378 : /* Measure I/O timing when reading segment */
3379 1497060 : io_start = pgstat_prepare_io_time(track_wal_io_timing);
3380 :
3381 1497060 : pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3382 1497060 : r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (pgoff_t) readOff);
3383 1497060 : if (r != XLOG_BLCKSZ)
3384 : {
3385 : char fname[MAXFNAMELEN];
3386 0 : int save_errno = errno;
3387 :
3388 0 : pgstat_report_wait_end();
3389 :
3390 0 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3391 : io_start, 1, r);
3392 :
3393 0 : XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
3394 0 : if (r < 0)
3395 : {
3396 0 : errno = save_errno;
3397 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3398 : (errcode_for_file_access(),
3399 : errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
3400 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3401 : readOff)));
3402 : }
3403 : else
3404 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3405 : (errcode(ERRCODE_DATA_CORRUPTED),
3406 : errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
3407 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3408 : readOff, r, (Size) XLOG_BLCKSZ)));
3409 0 : goto next_record_is_invalid;
3410 : }
3411 1497060 : pgstat_report_wait_end();
3412 :
3413 1497060 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3414 : io_start, 1, r);
3415 :
3416 : Assert(targetSegNo == readSegNo);
3417 : Assert(targetPageOff == readOff);
3418 : Assert(reqLen <= readLen);
3419 :
3420 1497060 : xlogreader->seg.ws_tli = curFileTLI;
3421 :
3422 : /*
3423 : * Check the page header immediately, so that we can retry immediately if
3424 : * it's not valid. This may seem unnecessary, because ReadPageInternal()
3425 : * validates the page header anyway, and would propagate the failure up to
3426 : * ReadRecord(), which would retry. However, there's a corner case with
3427 : * continuation records, if a record is split across two pages such that
3428 : * we would need to read the two pages from different sources across two
3429 : * WAL segments.
3430 : *
3431 : * The first page is only available locally, in pg_wal, because it's
3432 : * already been recycled on the primary. The second page, however, is not
3433 : * present in pg_wal, and we should stream it from the primary. There is a
3434 : * recycled WAL segment present in pg_wal, with garbage contents, however.
3435 : * We would read the first page from the local WAL segment, but when
3436 : * reading the second page, we would read the bogus, recycled, WAL
3437 : * segment. If we didn't catch that case here, we would never recover,
3438 : * because ReadRecord() would retry reading the whole record from the
3439 : * beginning.
3440 : *
3441 : * Of course, this only catches errors in the page header, which is what
3442 : * happens in the case of a recycled WAL segment. Other kinds of errors or
3443 : * corruption still have the same problem. But this at least fixes the
3444 : * common case, which can happen as part of normal operation.
3445 : *
3446 : * Validating the page header is cheap enough that doing it twice
3447 : * shouldn't be a big deal from a performance point of view.
3448 : *
3449 : * When not in standby mode, an invalid page header should cause recovery
3450 : * to end, not retry reading the page, so we don't need to validate the
3451 : * page header here for the retry. Instead, ReadPageInternal() is
3452 : * responsible for the validation.
3453 : */
3454 1497060 : if (StandbyMode &&
3455 1485365 : (targetPagePtr % wal_segment_size) == 0 &&
3456 1443 : !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3457 : {
3458 : /*
3459 : * Emit this error right now then retry this page immediately. Use
3460 : * errmsg_internal() because the message was already translated.
3461 : */
3462 4 : if (xlogreader->errormsg_buf[0])
3463 4 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3464 : (errmsg_internal("%s", xlogreader->errormsg_buf)));
3465 :
3466 : /* reset any error XLogReaderValidatePageHeader() might have set */
3467 4 : XLogReaderResetError(xlogreader);
3468 4 : goto next_record_is_invalid;
3469 : }
3470 :
3471 1497056 : return readLen;
3472 :
3473 4 : next_record_is_invalid:
3474 :
3475 : /*
3476 : * If we're reading ahead, give up fast. Retries and error reporting will
3477 : * be handled by a later read when recovery catches up to this point.
3478 : */
3479 4 : if (xlogreader->nonblocking)
3480 1 : return XLREAD_WOULDBLOCK;
3481 :
3482 3 : lastSourceFailed = true;
3483 :
3484 3 : if (readFile >= 0)
3485 3 : close(readFile);
3486 3 : readFile = -1;
3487 3 : readLen = 0;
3488 3 : readSource = XLOG_FROM_ANY;
3489 :
3490 : /* In standby-mode, keep trying */
3491 3 : if (StandbyMode)
3492 3 : goto retry;
3493 : else
3494 0 : return XLREAD_FAIL;
3495 : }
3496 :
3497 : /*
3498 : * Open the WAL segment containing WAL location 'RecPtr'.
3499 : *
3500 : * The segment can be fetched via restore_command, or via walreceiver having
3501 : * streamed the record, or it can already be present in pg_wal. Checking
3502 : * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3503 : * too, in case someone copies a new segment directly to pg_wal. That is not
3504 : * documented or recommended, though.
3505 : *
3506 : * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3507 : * prepare to read WAL starting from RedoStartLSN after this.
3508 : *
3509 : * 'RecPtr' might not point to the beginning of the record we're interested
3510 : * in, it might also point to the page or segment header. In that case,
3511 : * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3512 : * used to decide which timeline to stream the requested WAL from.
3513 : *
3514 : * 'replayLSN' is the current replay LSN, so that if we scan for new
3515 : * timelines, we can reject a switch to a timeline that branched off before
3516 : * this point.
3517 : *
3518 : * If the record is not immediately available, the function returns XLREAD_FAIL
3519 : * if we're not in standby mode. In standby mode, the function waits for it to
3520 : * become available.
3521 : *
3522 : * When the requested record becomes available, the function opens the file
3523 : * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3524 : * of standby mode is triggered by the user, and there is no more WAL
3525 : * available, returns XLREAD_FAIL.
3526 : *
3527 : * If nonblocking is true, then give up immediately if we can't satisfy the
3528 : * request, returning XLREAD_WOULDBLOCK instead of waiting.
3529 : */
3530 : static XLogPageReadResult
3531 18541 : WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3532 : bool fetching_ckpt, XLogRecPtr tliRecPtr,
3533 : TimeLineID replayTLI, XLogRecPtr replayLSN,
3534 : bool nonblocking)
3535 : {
3536 : static TimestampTz last_fail_time = 0;
3537 : TimestampTz now;
3538 18541 : bool streaming_reply_sent = false;
3539 :
3540 : /*-------
3541 : * Standby mode is implemented by a state machine:
3542 : *
3543 : * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3544 : * pg_wal (XLOG_FROM_PG_WAL)
3545 : * 2. Check for promotion trigger request
3546 : * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3547 : * 4. Rescan timelines
3548 : * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3549 : *
3550 : * Failure to read from the current source advances the state machine to
3551 : * the next state.
3552 : *
3553 : * 'currentSource' indicates the current state. There are no currentSource
3554 : * values for "check trigger", "rescan timelines", and "sleep" states,
3555 : * those actions are taken when reading from the previous source fails, as
3556 : * part of advancing to the next state.
3557 : *
3558 : * If standby mode is turned off while reading WAL from stream, we move
3559 : * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3560 : * the files (which would be required at end of recovery, e.g., timeline
3561 : * history file) from archive or pg_wal. We don't need to kill WAL receiver
3562 : * here because it's already stopped when standby mode is turned off at
3563 : * the end of recovery.
3564 : *-------
3565 : */
3566 18541 : if (!InArchiveRecovery)
3567 1075 : currentSource = XLOG_FROM_PG_WAL;
3568 17466 : else if (currentSource == XLOG_FROM_ANY ||
3569 17340 : (!StandbyMode && currentSource == XLOG_FROM_STREAM))
3570 : {
3571 126 : lastSourceFailed = false;
3572 126 : currentSource = XLOG_FROM_ARCHIVE;
3573 : }
3574 :
3575 : for (;;)
3576 16087 : {
3577 34628 : XLogSource oldSource = currentSource;
3578 34628 : bool startWalReceiver = false;
3579 :
3580 : /*
3581 : * First check if we failed to read from the current source, and
3582 : * advance the state machine if so. The failure to read might've
3583 : * happened outside this function, e.g when a CRC check fails on a
3584 : * record, or within this loop.
3585 : */
3586 34628 : if (lastSourceFailed)
3587 : {
3588 : /*
3589 : * Don't allow any retry loops to occur during nonblocking
3590 : * readahead. Let the caller process everything that has been
3591 : * decoded already first.
3592 : */
3593 544 : if (nonblocking)
3594 84 : return XLREAD_WOULDBLOCK;
3595 :
3596 460 : switch (currentSource)
3597 : {
3598 278 : case XLOG_FROM_ARCHIVE:
3599 : case XLOG_FROM_PG_WAL:
3600 :
3601 : /*
3602 : * Check to see if promotion is requested. Note that we do
3603 : * this only after failure, so when you promote, we still
3604 : * finish replaying as much as we can from archive and
3605 : * pg_wal before failover.
3606 : */
3607 278 : if (StandbyMode && CheckForStandbyTrigger())
3608 : {
3609 22 : XLogShutdownWalRcv();
3610 22 : return XLREAD_FAIL;
3611 : }
3612 :
3613 : /*
3614 : * Not in standby mode, and we've now tried the archive
3615 : * and pg_wal.
3616 : */
3617 256 : if (!StandbyMode)
3618 30 : return XLREAD_FAIL;
3619 :
3620 : /*
3621 : * Move to XLOG_FROM_STREAM state, and set to start a
3622 : * walreceiver if necessary.
3623 : */
3624 226 : currentSource = XLOG_FROM_STREAM;
3625 226 : startWalReceiver = true;
3626 226 : break;
3627 :
3628 182 : case XLOG_FROM_STREAM:
3629 :
3630 : /*
3631 : * Failure while streaming. Most likely, we got here
3632 : * because streaming replication was terminated, or
3633 : * promotion was triggered. But we also get here if we
3634 : * find an invalid record in the WAL streamed from the
3635 : * primary, in which case something is seriously wrong.
3636 : * There's little chance that the problem will just go
3637 : * away, but PANIC is not good for availability either,
3638 : * especially in hot standby mode. So, we treat that the
3639 : * same as disconnection, and retry from archive/pg_wal
3640 : * again. The WAL in the archive should be identical to
3641 : * what was streamed, so it's unlikely that it helps, but
3642 : * one can hope...
3643 : */
3644 :
3645 : /*
3646 : * We should be able to move to XLOG_FROM_STREAM only in
3647 : * standby mode.
3648 : */
3649 : Assert(StandbyMode);
3650 :
3651 : /*
3652 : * Before we leave XLOG_FROM_STREAM state, make sure that
3653 : * walreceiver is not active, so that it won't overwrite
3654 : * WAL that we restore from archive.
3655 : *
3656 : * If walreceiver is actively streaming (or attempting to
3657 : * connect), we must shut it down. However, if it's
3658 : * already in WAITING state (e.g., due to timeline
3659 : * divergence), we only need to reset the install flag to
3660 : * allow archive restoration.
3661 : */
3662 182 : if (WalRcvStreaming())
3663 34 : XLogShutdownWalRcv();
3664 : else
3665 : {
3666 : /*
3667 : * WALRCV_STOPPING state is a transient state while
3668 : * the startup process is in ShutdownWalRcv(). It
3669 : * should never appear here since we would be waiting
3670 : * for the walreceiver to reach WALRCV_STOPPED in that
3671 : * case.
3672 : */
3673 : Assert(WalRcvGetState() != WALRCV_STOPPING);
3674 148 : ResetInstallXLogFileSegmentActive();
3675 : }
3676 :
3677 : /*
3678 : * Before we sleep, re-scan for possible new timelines if
3679 : * we were requested to recover to the latest timeline.
3680 : */
3681 182 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
3682 : {
3683 182 : if (rescanLatestTimeLine(replayTLI, replayLSN))
3684 : {
3685 7 : currentSource = XLOG_FROM_ARCHIVE;
3686 7 : break;
3687 : }
3688 : }
3689 :
3690 : /*
3691 : * XLOG_FROM_STREAM is the last state in our state
3692 : * machine, so we've exhausted all the options for
3693 : * obtaining the requested WAL. We're going to loop back
3694 : * and retry from the archive, but if it hasn't been long
3695 : * since last attempt, sleep wal_retrieve_retry_interval
3696 : * milliseconds to avoid busy-waiting.
3697 : */
3698 173 : now = GetCurrentTimestamp();
3699 173 : if (!TimestampDifferenceExceeds(last_fail_time, now,
3700 : wal_retrieve_retry_interval))
3701 : {
3702 : long wait_time;
3703 :
3704 182 : wait_time = wal_retrieve_retry_interval -
3705 91 : TimestampDifferenceMilliseconds(last_fail_time, now);
3706 :
3707 91 : elog(LOG, "waiting for WAL to become available at %X/%08X",
3708 : LSN_FORMAT_ARGS(RecPtr));
3709 :
3710 : /* Do background tasks that might benefit us later. */
3711 91 : KnownAssignedTransactionIdsIdleMaintenance();
3712 :
3713 91 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3714 : WL_LATCH_SET | WL_TIMEOUT |
3715 : WL_EXIT_ON_PM_DEATH,
3716 : wait_time,
3717 : WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3718 91 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3719 91 : now = GetCurrentTimestamp();
3720 :
3721 : /* Handle interrupt signals of startup process */
3722 91 : ProcessStartupProcInterrupts();
3723 : }
3724 157 : last_fail_time = now;
3725 157 : currentSource = XLOG_FROM_ARCHIVE;
3726 157 : break;
3727 :
3728 0 : default:
3729 0 : elog(ERROR, "unexpected WAL source %d", currentSource);
3730 : }
3731 : }
3732 34084 : else if (currentSource == XLOG_FROM_PG_WAL)
3733 : {
3734 : /*
3735 : * We just successfully read a file in pg_wal. We prefer files in
3736 : * the archive over ones in pg_wal, so try the next file again
3737 : * from the archive first.
3738 : */
3739 1069 : if (InArchiveRecovery)
3740 0 : currentSource = XLOG_FROM_ARCHIVE;
3741 : }
3742 :
3743 34474 : if (currentSource != oldSource)
3744 390 : elog(DEBUG2, "switched WAL source from %s to %s after %s",
3745 : xlogSourceNames[oldSource], xlogSourceNames[currentSource],
3746 : lastSourceFailed ? "failure" : "success");
3747 :
3748 : /*
3749 : * We've now handled possible failure. Try to read from the chosen
3750 : * source.
3751 : */
3752 34474 : lastSourceFailed = false;
3753 :
3754 34474 : switch (currentSource)
3755 : {
3756 1917 : case XLOG_FROM_ARCHIVE:
3757 : case XLOG_FROM_PG_WAL:
3758 :
3759 : /*
3760 : * WAL receiver must not be running when reading WAL from
3761 : * archive or pg_wal.
3762 : */
3763 : Assert(!WalRcvStreaming());
3764 :
3765 : /* Close any old file we might have open. */
3766 1917 : if (readFile >= 0)
3767 : {
3768 98 : close(readFile);
3769 98 : readFile = -1;
3770 : }
3771 : /* Reset curFileTLI if random fetch. */
3772 1917 : if (randAccess)
3773 1239 : curFileTLI = 0;
3774 :
3775 : /*
3776 : * Try to restore the file from archive, or read an existing
3777 : * file from pg_wal.
3778 : */
3779 1917 : readFile = XLogFileReadAnyTLI(readSegNo,
3780 1917 : currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
3781 : currentSource);
3782 1917 : if (readFile >= 0)
3783 1724 : return XLREAD_SUCCESS; /* success! */
3784 :
3785 : /*
3786 : * Nope, not found in archive or pg_wal.
3787 : */
3788 193 : lastSourceFailed = true;
3789 193 : break;
3790 :
3791 32557 : case XLOG_FROM_STREAM:
3792 : {
3793 : bool havedata;
3794 :
3795 : /*
3796 : * We should be able to move to XLOG_FROM_STREAM only in
3797 : * standby mode.
3798 : */
3799 : Assert(StandbyMode);
3800 :
3801 : /*
3802 : * First, shutdown walreceiver if its restart has been
3803 : * requested -- but no point if we're already slated for
3804 : * starting it.
3805 : */
3806 32557 : if (pendingWalRcvRestart && !startWalReceiver)
3807 : {
3808 7 : XLogShutdownWalRcv();
3809 :
3810 : /*
3811 : * Re-scan for possible new timelines if we were
3812 : * requested to recover to the latest timeline.
3813 : */
3814 7 : if (recoveryTargetTimeLineGoal ==
3815 : RECOVERY_TARGET_TIMELINE_LATEST)
3816 7 : rescanLatestTimeLine(replayTLI, replayLSN);
3817 :
3818 7 : startWalReceiver = true;
3819 : }
3820 32557 : pendingWalRcvRestart = false;
3821 :
3822 : /*
3823 : * Launch walreceiver if needed.
3824 : *
3825 : * If fetching_ckpt is true, RecPtr points to the initial
3826 : * checkpoint location. In that case, we use RedoStartLSN
3827 : * as the streaming start position instead of RecPtr, so
3828 : * that when we later jump backwards to start redo at
3829 : * RedoStartLSN, we will have the logs streamed already.
3830 : */
3831 32557 : if (startWalReceiver &&
3832 233 : PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3833 : {
3834 : XLogRecPtr ptr;
3835 : TimeLineID tli;
3836 :
3837 196 : if (fetching_ckpt)
3838 : {
3839 0 : ptr = RedoStartLSN;
3840 0 : tli = RedoStartTLI;
3841 : }
3842 : else
3843 : {
3844 196 : ptr = RecPtr;
3845 :
3846 : /*
3847 : * Use the record begin position to determine the
3848 : * TLI, rather than the position we're reading.
3849 : */
3850 196 : tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3851 :
3852 196 : if (curFileTLI > 0 && tli < curFileTLI)
3853 0 : elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3854 : LSN_FORMAT_ARGS(tliRecPtr),
3855 : tli, curFileTLI);
3856 : }
3857 196 : curFileTLI = tli;
3858 196 : SetInstallXLogFileSegmentActive();
3859 196 : RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
3860 : PrimarySlotName,
3861 : wal_receiver_create_temp_slot);
3862 196 : flushedUpto = InvalidXLogRecPtr;
3863 : }
3864 :
3865 : /*
3866 : * Check if WAL receiver is active or wait to start up.
3867 : */
3868 32557 : if (!WalRcvStreaming())
3869 : {
3870 148 : lastSourceFailed = true;
3871 148 : break;
3872 : }
3873 :
3874 : /*
3875 : * Walreceiver is active, so see if new data has arrived.
3876 : *
3877 : * We only advance XLogReceiptTime when we obtain fresh
3878 : * WAL from walreceiver and observe that we had already
3879 : * processed everything before the most recent "chunk"
3880 : * that it flushed to disk. In steady state where we are
3881 : * keeping up with the incoming data, XLogReceiptTime will
3882 : * be updated on each cycle. When we are behind,
3883 : * XLogReceiptTime will not advance, so the grace time
3884 : * allotted to conflicting queries will decrease.
3885 : */
3886 32409 : if (RecPtr < flushedUpto)
3887 1752 : havedata = true;
3888 : else
3889 : {
3890 : XLogRecPtr latestChunkStart;
3891 :
3892 30657 : flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3893 30657 : if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3894 : {
3895 15721 : havedata = true;
3896 15721 : if (latestChunkStart <= RecPtr)
3897 : {
3898 12110 : XLogReceiptTime = GetCurrentTimestamp();
3899 12110 : SetCurrentChunkStartTime(XLogReceiptTime);
3900 : }
3901 : }
3902 : else
3903 14936 : havedata = false;
3904 : }
3905 32409 : if (havedata)
3906 : {
3907 : /*
3908 : * Great, streamed far enough. Open the file if it's
3909 : * not open already. Also read the timeline history
3910 : * file if we haven't initialized timeline history
3911 : * yet; it should be streamed over and present in
3912 : * pg_wal by now. Use XLOG_FROM_STREAM so that source
3913 : * info is set correctly and XLogReceiptTime isn't
3914 : * changed.
3915 : *
3916 : * NB: We must set readTimeLineHistory based on
3917 : * recoveryTargetTLI, not receiveTLI. Normally they'll
3918 : * be the same, but if recovery_target_timeline is
3919 : * 'latest' and archiving is configured, then it's
3920 : * possible that we managed to retrieve one or more
3921 : * new timeline history files from the archive,
3922 : * updating recoveryTargetTLI.
3923 : */
3924 17473 : if (readFile < 0)
3925 : {
3926 1185 : if (!expectedTLEs)
3927 0 : expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
3928 1185 : readFile = XLogFileRead(readSegNo, receiveTLI,
3929 : XLOG_FROM_STREAM, false);
3930 : Assert(readFile >= 0);
3931 : }
3932 : else
3933 : {
3934 : /* just make sure source info is correct... */
3935 16288 : readSource = XLOG_FROM_STREAM;
3936 16288 : XLogReceiptSource = XLOG_FROM_STREAM;
3937 16288 : return XLREAD_SUCCESS;
3938 : }
3939 1185 : break;
3940 : }
3941 :
3942 : /* In nonblocking mode, return rather than sleeping. */
3943 14936 : if (nonblocking)
3944 333 : return XLREAD_WOULDBLOCK;
3945 :
3946 : /*
3947 : * Data not here yet. Check for trigger, then wait for
3948 : * walreceiver to wake us up when new WAL arrives.
3949 : */
3950 14603 : if (CheckForStandbyTrigger())
3951 : {
3952 : /*
3953 : * Note that we don't return XLREAD_FAIL immediately
3954 : * here. After being triggered, we still want to
3955 : * replay all the WAL that was already streamed. It's
3956 : * in pg_wal now, so we just treat this as a failure,
3957 : * and the state machine will move on to replay the
3958 : * streamed WAL from pg_wal, and then recheck the
3959 : * trigger and exit replay.
3960 : */
3961 34 : lastSourceFailed = true;
3962 34 : break;
3963 : }
3964 :
3965 : /*
3966 : * Since we have replayed everything we have received so
3967 : * far and are about to start waiting for more WAL, let's
3968 : * tell the upstream server our replay location now so
3969 : * that pg_stat_replication doesn't show stale
3970 : * information.
3971 : */
3972 14569 : if (!streaming_reply_sent)
3973 : {
3974 12522 : WalRcvRequestApplyReply();
3975 12522 : streaming_reply_sent = true;
3976 : }
3977 :
3978 : /* Do any background tasks that might benefit us later. */
3979 14569 : KnownAssignedTransactionIdsIdleMaintenance();
3980 :
3981 : /* Update pg_stat_recovery_prefetch before sleeping. */
3982 14569 : XLogPrefetcherComputeStats(xlogprefetcher);
3983 :
3984 : /*
3985 : * Wait for more WAL to arrive, when we will be woken
3986 : * immediately by the WAL receiver.
3987 : */
3988 14569 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3989 : WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
3990 : -1L,
3991 : WAIT_EVENT_RECOVERY_WAL_STREAM);
3992 14569 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3993 14569 : break;
3994 : }
3995 :
3996 0 : default:
3997 0 : elog(ERROR, "unexpected WAL source %d", currentSource);
3998 : }
3999 :
4000 : /*
4001 : * Check for recovery pause here so that we can confirm more quickly
4002 : * that a requested pause has actually taken effect.
4003 : */
4004 16129 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
4005 : RECOVERY_NOT_PAUSED)
4006 3 : recoveryPausesHere(false);
4007 :
4008 : /*
4009 : * This possibly-long loop needs to handle interrupts of startup
4010 : * process.
4011 : */
4012 16129 : ProcessStartupProcInterrupts();
4013 : }
4014 :
4015 : return XLREAD_FAIL; /* not reached */
4016 : }
4017 :
4018 :
4019 : /*
4020 : * Determine what log level should be used to report a corrupt WAL record
4021 : * in the current WAL page, previously read by XLogPageRead().
4022 : *
4023 : * 'emode' is the error mode that would be used to report a file-not-found
4024 : * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4025 : * we're retrying the exact same record that we've tried previously, only
4026 : * complain the first time to keep the noise down. However, we only do when
4027 : * reading from pg_wal, because we don't expect any invalid records in archive
4028 : * or in records streamed from the primary. Files in the archive should be complete,
4029 : * and we should never hit the end of WAL because we stop and wait for more WAL
4030 : * to arrive before replaying it.
4031 : *
4032 : * NOTE: This function remembers the RecPtr value it was last called with,
4033 : * to suppress repeated messages about the same record. Only call this when
4034 : * you are about to ereport(), or you might cause a later message to be
4035 : * erroneously suppressed.
4036 : */
4037 : static int
4038 278 : emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
4039 : {
4040 : static XLogRecPtr lastComplaint = InvalidXLogRecPtr;
4041 :
4042 278 : if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4043 : {
4044 275 : if (RecPtr == lastComplaint)
4045 70 : emode = DEBUG1;
4046 : else
4047 205 : lastComplaint = RecPtr;
4048 : }
4049 278 : return emode;
4050 : }
4051 :
4052 :
4053 : /*
4054 : * Subroutine to try to fetch and validate a prior checkpoint record.
4055 : */
4056 : static XLogRecord *
4057 1078 : ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
4058 : TimeLineID replayTLI)
4059 : {
4060 : XLogRecord *record;
4061 : uint8 info;
4062 :
4063 : Assert(xlogreader != NULL);
4064 :
4065 1078 : if (!XRecOffIsValid(RecPtr))
4066 : {
4067 0 : ereport(LOG,
4068 : (errmsg("invalid checkpoint location")));
4069 0 : return NULL;
4070 : }
4071 :
4072 1078 : XLogPrefetcherBeginRead(xlogprefetcher, RecPtr);
4073 1078 : record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4074 :
4075 1078 : if (record == NULL)
4076 : {
4077 1 : ereport(LOG,
4078 : (errmsg("invalid checkpoint record")));
4079 1 : return NULL;
4080 : }
4081 1077 : if (record->xl_rmid != RM_XLOG_ID)
4082 : {
4083 0 : ereport(LOG,
4084 : (errmsg("invalid resource manager ID in checkpoint record")));
4085 0 : return NULL;
4086 : }
4087 1077 : info = record->xl_info & ~XLR_INFO_MASK;
4088 1077 : if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4089 : info != XLOG_CHECKPOINT_ONLINE)
4090 : {
4091 0 : ereport(LOG,
4092 : (errmsg("invalid xl_info in checkpoint record")));
4093 0 : return NULL;
4094 : }
4095 1077 : if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
4096 : {
4097 0 : ereport(LOG,
4098 : (errmsg("invalid length of checkpoint record")));
4099 0 : return NULL;
4100 : }
4101 1077 : return record;
4102 : }
4103 :
4104 : /*
4105 : * Scan for new timelines that might have appeared in the archive since we
4106 : * started recovery.
4107 : *
4108 : * If there are any, the function changes recovery target TLI to the latest
4109 : * one and returns 'true'.
4110 : */
4111 : static bool
4112 189 : rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
4113 : {
4114 : List *newExpectedTLEs;
4115 : bool found;
4116 : ListCell *cell;
4117 : TimeLineID newtarget;
4118 189 : TimeLineID oldtarget = recoveryTargetTLI;
4119 189 : TimeLineHistoryEntry *currentTle = NULL;
4120 :
4121 189 : newtarget = findNewestTimeLine(recoveryTargetTLI);
4122 187 : if (newtarget == recoveryTargetTLI)
4123 : {
4124 : /* No new timelines found */
4125 180 : return false;
4126 : }
4127 :
4128 : /*
4129 : * Determine the list of expected TLIs for the new TLI
4130 : */
4131 :
4132 7 : newExpectedTLEs = readTimeLineHistory(newtarget);
4133 :
4134 : /*
4135 : * If the current timeline is not part of the history of the new timeline,
4136 : * we cannot proceed to it.
4137 : */
4138 7 : found = false;
4139 14 : foreach(cell, newExpectedTLEs)
4140 : {
4141 14 : currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4142 :
4143 14 : if (currentTle->tli == recoveryTargetTLI)
4144 : {
4145 7 : found = true;
4146 7 : break;
4147 : }
4148 : }
4149 7 : if (!found)
4150 : {
4151 0 : ereport(LOG,
4152 : (errmsg("new timeline %u is not a child of database system timeline %u",
4153 : newtarget,
4154 : replayTLI)));
4155 0 : return false;
4156 : }
4157 :
4158 : /*
4159 : * The current timeline was found in the history file, but check that the
4160 : * next timeline was forked off from it *after* the current recovery
4161 : * location.
4162 : */
4163 7 : if (currentTle->end < replayLSN)
4164 : {
4165 0 : ereport(LOG,
4166 : errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
4167 : newtarget,
4168 : replayTLI,
4169 : LSN_FORMAT_ARGS(replayLSN)));
4170 0 : return false;
4171 : }
4172 :
4173 : /* The new timeline history seems valid. Switch target */
4174 7 : recoveryTargetTLI = newtarget;
4175 7 : list_free_deep(expectedTLEs);
4176 7 : expectedTLEs = newExpectedTLEs;
4177 :
4178 : /*
4179 : * As in StartupXLOG(), try to ensure we have all the history files
4180 : * between the old target and new target in pg_wal.
4181 : */
4182 7 : restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4183 :
4184 7 : ereport(LOG,
4185 : (errmsg("new target timeline is %u",
4186 : recoveryTargetTLI)));
4187 :
4188 7 : return true;
4189 : }
4190 :
4191 :
4192 : /*
4193 : * Open a logfile segment for reading (during recovery).
4194 : *
4195 : * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4196 : * Otherwise, it's assumed to be already available in pg_wal.
4197 : */
4198 : static int
4199 3610 : XLogFileRead(XLogSegNo segno, TimeLineID tli,
4200 : XLogSource source, bool notfoundOk)
4201 : {
4202 : char xlogfname[MAXFNAMELEN];
4203 : char activitymsg[MAXFNAMELEN + 16];
4204 : char path[MAXPGPATH];
4205 : int fd;
4206 :
4207 3610 : XLogFileName(xlogfname, tli, segno, wal_segment_size);
4208 :
4209 3610 : switch (source)
4210 : {
4211 859 : case XLOG_FROM_ARCHIVE:
4212 : /* Report recovery progress in PS display */
4213 859 : snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4214 : xlogfname);
4215 859 : set_ps_display(activitymsg);
4216 :
4217 859 : if (!RestoreArchivedFile(path, xlogfname,
4218 : "RECOVERYXLOG",
4219 : wal_segment_size,
4220 : InRedo))
4221 497 : return -1;
4222 362 : break;
4223 :
4224 2751 : case XLOG_FROM_PG_WAL:
4225 : case XLOG_FROM_STREAM:
4226 2751 : XLogFilePath(path, tli, segno, wal_segment_size);
4227 2751 : break;
4228 :
4229 0 : default:
4230 0 : elog(ERROR, "invalid XLogFileRead source %d", source);
4231 : }
4232 :
4233 : /*
4234 : * If the segment was fetched from archival storage, replace the existing
4235 : * xlog segment (if any) with the archival version.
4236 : */
4237 3113 : if (source == XLOG_FROM_ARCHIVE)
4238 : {
4239 : Assert(!IsInstallXLogFileSegmentActive());
4240 362 : KeepFileRestoredFromArchive(path, xlogfname);
4241 :
4242 : /*
4243 : * Set path to point at the new file in pg_wal.
4244 : */
4245 362 : snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4246 : }
4247 :
4248 3113 : fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4249 3113 : if (fd >= 0)
4250 : {
4251 : /* Success! */
4252 2909 : curFileTLI = tli;
4253 :
4254 : /* Report recovery progress in PS display */
4255 2909 : snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4256 : xlogfname);
4257 2909 : set_ps_display(activitymsg);
4258 :
4259 : /* Track source of data in assorted state variables */
4260 2909 : readSource = source;
4261 2909 : XLogReceiptSource = source;
4262 : /* In FROM_STREAM case, caller tracks receipt time, not me */
4263 2909 : if (source != XLOG_FROM_STREAM)
4264 1724 : XLogReceiptTime = GetCurrentTimestamp();
4265 :
4266 2909 : return fd;
4267 : }
4268 204 : if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4269 0 : ereport(PANIC,
4270 : (errcode_for_file_access(),
4271 : errmsg("could not open file \"%s\": %m", path)));
4272 204 : return -1;
4273 : }
4274 :
4275 : /*
4276 : * Open a logfile segment for reading (during recovery).
4277 : *
4278 : * This version searches for the segment with any TLI listed in expectedTLEs.
4279 : */
4280 : static int
4281 1917 : XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
4282 : {
4283 : char path[MAXPGPATH];
4284 : ListCell *cell;
4285 : int fd;
4286 : List *tles;
4287 :
4288 : /*
4289 : * Loop looking for a suitable timeline ID: we might need to read any of
4290 : * the timelines listed in expectedTLEs.
4291 : *
4292 : * We expect curFileTLI on entry to be the TLI of the preceding file in
4293 : * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4294 : * to go backwards; this prevents us from picking up the wrong file when a
4295 : * parent timeline extends to higher segment numbers than the child we
4296 : * want to read.
4297 : *
4298 : * If we haven't read the timeline history file yet, read it now, so that
4299 : * we know which TLIs to scan. We don't save the list in expectedTLEs,
4300 : * however, unless we actually find a valid segment. That way if there is
4301 : * neither a timeline history file nor a WAL segment in the archive, and
4302 : * streaming replication is set up, we'll read the timeline history file
4303 : * streamed from the primary when we start streaming, instead of
4304 : * recovering with a dummy history generated here.
4305 : */
4306 1917 : if (expectedTLEs)
4307 839 : tles = expectedTLEs;
4308 : else
4309 1078 : tles = readTimeLineHistory(recoveryTargetTLI);
4310 :
4311 2127 : foreach(cell, tles)
4312 : {
4313 1941 : TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
4314 1941 : TimeLineID tli = hent->tli;
4315 :
4316 1941 : if (tli < curFileTLI)
4317 7 : break; /* don't bother looking at too-old TLIs */
4318 :
4319 : /*
4320 : * Skip scanning the timeline ID that the logfile segment to read
4321 : * doesn't belong to
4322 : */
4323 1934 : if (XLogRecPtrIsValid(hent->begin))
4324 : {
4325 79 : XLogSegNo beginseg = 0;
4326 :
4327 79 : XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4328 :
4329 : /*
4330 : * The logfile segment that doesn't belong to the timeline is
4331 : * older or newer than the segment that the timeline started or
4332 : * ended at, respectively. It's sufficient to check only the
4333 : * starting segment of the timeline here. Since the timelines are
4334 : * scanned in descending order in this loop, any segments newer
4335 : * than the ending segment should belong to newer timeline and
4336 : * have already been read before. So it's not necessary to check
4337 : * the ending segment of the timeline here.
4338 : */
4339 79 : if (segno < beginseg)
4340 6 : continue;
4341 : }
4342 :
4343 1928 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
4344 : {
4345 859 : fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4346 859 : if (fd != -1)
4347 : {
4348 362 : elog(DEBUG1, "got WAL segment from archive");
4349 362 : if (!expectedTLEs)
4350 18 : expectedTLEs = tles;
4351 1724 : return fd;
4352 : }
4353 : }
4354 :
4355 1566 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
4356 : {
4357 1566 : fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4358 1566 : if (fd != -1)
4359 : {
4360 1362 : if (!expectedTLEs)
4361 1059 : expectedTLEs = tles;
4362 1362 : return fd;
4363 : }
4364 : }
4365 : }
4366 :
4367 : /* Couldn't find it. For simplicity, complain about front timeline */
4368 193 : XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
4369 193 : errno = ENOENT;
4370 193 : ereport(DEBUG2,
4371 : (errcode_for_file_access(),
4372 : errmsg("could not open file \"%s\": %m", path)));
4373 193 : return -1;
4374 : }
4375 :
4376 : /*
4377 : * Set flag to signal the walreceiver to restart. (The startup process calls
4378 : * this on noticing a relevant configuration change.)
4379 : */
4380 : void
4381 11 : StartupRequestWalReceiverRestart(void)
4382 : {
4383 11 : if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
4384 : {
4385 7 : ereport(LOG,
4386 : (errmsg("WAL receiver process shutdown requested")));
4387 :
4388 7 : pendingWalRcvRestart = true;
4389 : }
4390 11 : }
4391 :
4392 :
4393 : /*
4394 : * Has a standby promotion already been triggered?
4395 : *
4396 : * Unlike CheckForStandbyTrigger(), this works in any process
4397 : * that's connected to shared memory.
4398 : */
4399 : bool
4400 72 : PromoteIsTriggered(void)
4401 : {
4402 : /*
4403 : * We check shared state each time only until a standby promotion is
4404 : * triggered. We can't trigger a promotion again, so there's no need to
4405 : * keep checking after the shared variable has once been seen true.
4406 : */
4407 72 : if (LocalPromoteIsTriggered)
4408 51 : return true;
4409 :
4410 21 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4411 21 : LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
4412 21 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4413 :
4414 21 : return LocalPromoteIsTriggered;
4415 : }
4416 :
4417 : static void
4418 48 : SetPromoteIsTriggered(void)
4419 : {
4420 48 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4421 48 : XLogRecoveryCtl->SharedPromoteIsTriggered = true;
4422 48 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4423 :
4424 : /*
4425 : * Mark the recovery pause state as 'not paused' because the paused state
4426 : * ends and promotion continues if a promotion is triggered while recovery
4427 : * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4428 : * return 'paused' while a promotion is ongoing.
4429 : */
4430 48 : SetRecoveryPause(false);
4431 :
4432 48 : LocalPromoteIsTriggered = true;
4433 48 : }
4434 :
4435 : /*
4436 : * Check whether a promote request has arrived.
4437 : */
4438 : static bool
4439 15152 : CheckForStandbyTrigger(void)
4440 : {
4441 15152 : if (LocalPromoteIsTriggered)
4442 57 : return true;
4443 :
4444 15095 : if (IsPromoteSignaled() && CheckPromoteSignal())
4445 : {
4446 48 : ereport(LOG, (errmsg("received promote request")));
4447 48 : RemovePromoteSignalFiles();
4448 48 : ResetPromoteSignaled();
4449 48 : SetPromoteIsTriggered();
4450 48 : return true;
4451 : }
4452 :
4453 15047 : return false;
4454 : }
4455 :
4456 : /*
4457 : * Remove the files signaling a standby promotion request.
4458 : */
4459 : void
4460 1031 : RemovePromoteSignalFiles(void)
4461 : {
4462 1031 : unlink(PROMOTE_SIGNAL_FILE);
4463 1031 : }
4464 :
4465 : /*
4466 : * Check to see if a promote request has arrived.
4467 : */
4468 : bool
4469 723 : CheckPromoteSignal(void)
4470 : {
4471 : struct stat stat_buf;
4472 :
4473 723 : if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4474 96 : return true;
4475 :
4476 627 : return false;
4477 : }
4478 :
4479 : /*
4480 : * Wake up startup process to replay newly arrived WAL, or to notice that
4481 : * failover has been requested.
4482 : */
4483 : void
4484 40227 : WakeupRecovery(void)
4485 : {
4486 40227 : SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4487 40227 : }
4488 :
4489 : /*
4490 : * Schedule a walreceiver wakeup in the main recovery loop.
4491 : */
4492 : void
4493 2 : XLogRequestWalReceiverReply(void)
4494 : {
4495 2 : doRequestWalReceiverReply = true;
4496 2 : }
4497 :
4498 : /*
4499 : * Is HotStandby active yet? This is only important in special backends
4500 : * since normal backends won't ever be able to connect until this returns
4501 : * true. Postmaster knows this by way of signal, not via shared memory.
4502 : *
4503 : * Unlike testing standbyState, this works in any process that's connected to
4504 : * shared memory. (And note that standbyState alone doesn't tell the truth
4505 : * anyway.)
4506 : */
4507 : bool
4508 172 : HotStandbyActive(void)
4509 : {
4510 : /*
4511 : * We check shared state each time only until Hot Standby is active. We
4512 : * can't de-activate Hot Standby, so there's no need to keep checking
4513 : * after the shared variable has once been seen true.
4514 : */
4515 172 : if (LocalHotStandbyActive)
4516 25 : return true;
4517 : else
4518 : {
4519 : /* spinlock is essential on machines with weak memory ordering! */
4520 147 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4521 147 : LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
4522 147 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4523 :
4524 147 : return LocalHotStandbyActive;
4525 : }
4526 : }
4527 :
4528 : /*
4529 : * Like HotStandbyActive(), but to be used only in WAL replay code,
4530 : * where we don't need to ask any other process what the state is.
4531 : */
4532 : static bool
4533 0 : HotStandbyActiveInReplay(void)
4534 : {
4535 : Assert(AmStartupProcess() || !IsPostmasterEnvironment);
4536 0 : return LocalHotStandbyActive;
4537 : }
4538 :
4539 : /*
4540 : * Get latest redo apply position.
4541 : *
4542 : * Exported to allow WALReceiver to read the pointer directly.
4543 : */
4544 : XLogRecPtr
4545 106489 : GetXLogReplayRecPtr(TimeLineID *replayTLI)
4546 : {
4547 : XLogRecPtr recptr;
4548 : TimeLineID tli;
4549 :
4550 106489 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4551 106489 : recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
4552 106489 : tli = XLogRecoveryCtl->lastReplayedTLI;
4553 106489 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4554 :
4555 106489 : if (replayTLI)
4556 3730 : *replayTLI = tli;
4557 106489 : return recptr;
4558 : }
4559 :
4560 :
4561 : /*
4562 : * Get position of last applied, or the record being applied.
4563 : *
4564 : * This is different from GetXLogReplayRecPtr() in that if a WAL
4565 : * record is currently being applied, this includes that record.
4566 : */
4567 : XLogRecPtr
4568 6648 : GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
4569 : {
4570 : XLogRecPtr recptr;
4571 : TimeLineID tli;
4572 :
4573 6648 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4574 6648 : recptr = XLogRecoveryCtl->replayEndRecPtr;
4575 6648 : tli = XLogRecoveryCtl->replayEndTLI;
4576 6648 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4577 :
4578 6648 : if (replayEndTLI)
4579 6648 : *replayEndTLI = tli;
4580 6648 : return recptr;
4581 : }
4582 :
4583 : /*
4584 : * Save timestamp of latest processed commit/abort record.
4585 : *
4586 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4587 : * seen by processes other than the startup process. Note in particular
4588 : * that CreateRestartPoint is executed in the checkpointer.
4589 : */
4590 : static void
4591 23548 : SetLatestXTime(TimestampTz xtime)
4592 : {
4593 23548 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4594 23548 : XLogRecoveryCtl->recoveryLastXTime = xtime;
4595 23548 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4596 23548 : }
4597 :
4598 : /*
4599 : * Fetch timestamp of latest processed commit/abort record.
4600 : */
4601 : TimestampTz
4602 365 : GetLatestXTime(void)
4603 : {
4604 : TimestampTz xtime;
4605 :
4606 365 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4607 365 : xtime = XLogRecoveryCtl->recoveryLastXTime;
4608 365 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4609 :
4610 365 : return xtime;
4611 : }
4612 :
4613 : /*
4614 : * Save timestamp of the next chunk of WAL records to apply.
4615 : *
4616 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4617 : * seen by all backends.
4618 : */
4619 : static void
4620 12110 : SetCurrentChunkStartTime(TimestampTz xtime)
4621 : {
4622 12110 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4623 12110 : XLogRecoveryCtl->currentChunkStartTime = xtime;
4624 12110 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4625 12110 : }
4626 :
4627 : /*
4628 : * Fetch timestamp of latest processed commit/abort record.
4629 : * Startup process maintains an accurate local copy in XLogReceiptTime
4630 : */
4631 : TimestampTz
4632 273 : GetCurrentChunkReplayStartTime(void)
4633 : {
4634 : TimestampTz xtime;
4635 :
4636 273 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4637 273 : xtime = XLogRecoveryCtl->currentChunkStartTime;
4638 273 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4639 :
4640 273 : return xtime;
4641 : }
4642 :
4643 : /*
4644 : * Returns time of receipt of current chunk of XLOG data, as well as
4645 : * whether it was received from streaming replication or from archives.
4646 : */
4647 : void
4648 28 : GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4649 : {
4650 : /*
4651 : * This must be executed in the startup process, since we don't export the
4652 : * relevant state to shared memory.
4653 : */
4654 : Assert(InRecovery);
4655 :
4656 28 : *rtime = XLogReceiptTime;
4657 28 : *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4658 28 : }
4659 :
4660 : /*
4661 : * Note that text field supplied is a parameter name and does not require
4662 : * translation
4663 : */
4664 : void
4665 690 : RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4666 : {
4667 690 : if (currValue < minValue)
4668 : {
4669 0 : if (HotStandbyActiveInReplay())
4670 : {
4671 0 : bool warned_for_promote = false;
4672 :
4673 0 : ereport(WARNING,
4674 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4675 : errmsg("hot standby is not possible because of insufficient parameter settings"),
4676 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4677 : param_name,
4678 : currValue,
4679 : minValue)));
4680 :
4681 0 : SetRecoveryPause(true);
4682 :
4683 0 : ereport(LOG,
4684 : (errmsg("recovery has paused"),
4685 : errdetail("If recovery is unpaused, the server will shut down."),
4686 : errhint("You can then restart the server after making the necessary configuration changes.")));
4687 :
4688 0 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
4689 : {
4690 0 : ProcessStartupProcInterrupts();
4691 :
4692 0 : if (CheckForStandbyTrigger())
4693 : {
4694 0 : if (!warned_for_promote)
4695 0 : ereport(WARNING,
4696 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4697 : errmsg("promotion is not possible because of insufficient parameter settings"),
4698 :
4699 : /*
4700 : * Repeat the detail from above so it's easy to find
4701 : * in the log.
4702 : */
4703 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4704 : param_name,
4705 : currValue,
4706 : minValue),
4707 : errhint("Restart the server after making the necessary configuration changes.")));
4708 0 : warned_for_promote = true;
4709 : }
4710 :
4711 : /*
4712 : * If recovery pause is requested then set it paused. While
4713 : * we are in the loop, user might resume and pause again so
4714 : * set this every time.
4715 : */
4716 0 : ConfirmRecoveryPaused();
4717 :
4718 : /*
4719 : * We wait on a condition variable that will wake us as soon
4720 : * as the pause ends, but we use a timeout so we can check the
4721 : * above conditions periodically too.
4722 : */
4723 0 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
4724 : WAIT_EVENT_RECOVERY_PAUSE);
4725 : }
4726 0 : ConditionVariableCancelSleep();
4727 : }
4728 :
4729 0 : ereport(FATAL,
4730 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4731 : errmsg("recovery aborted because of insufficient parameter settings"),
4732 : /* Repeat the detail from above so it's easy to find in the log. */
4733 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4734 : param_name,
4735 : currValue,
4736 : minValue),
4737 : errhint("You can restart the server after making the necessary configuration changes.")));
4738 : }
4739 690 : }
4740 :
4741 :
4742 : /*
4743 : * GUC check_hook for primary_slot_name
4744 : */
4745 : bool
4746 1468 : check_primary_slot_name(char **newval, void **extra, GucSource source)
4747 : {
4748 : int err_code;
4749 1468 : char *err_msg = NULL;
4750 1468 : char *err_hint = NULL;
4751 :
4752 1468 : if (*newval && strcmp(*newval, "") != 0 &&
4753 188 : !ReplicationSlotValidateNameInternal(*newval, false, &err_code,
4754 : &err_msg, &err_hint))
4755 : {
4756 0 : GUC_check_errcode(err_code);
4757 0 : GUC_check_errdetail("%s", err_msg);
4758 0 : if (err_hint != NULL)
4759 0 : GUC_check_errhint("%s", err_hint);
4760 0 : return false;
4761 : }
4762 :
4763 1468 : return true;
4764 : }
4765 :
4766 : /*
4767 : * Recovery target settings: Only one of the several recovery_target* settings
4768 : * may be set. Setting a second one results in an error. The global variable
4769 : * recoveryTarget tracks which kind of recovery target was chosen. Other
4770 : * variables store the actual target value (for example a string or a xid).
4771 : * The assign functions of the parameters check whether a competing parameter
4772 : * was already set. But we want to allow setting the same parameter multiple
4773 : * times. We also want to allow unsetting a parameter and setting a different
4774 : * one, so we unset recoveryTarget when the parameter is set to an empty
4775 : * string.
4776 : *
4777 : * XXX this code is broken by design. Throwing an error from a GUC assign
4778 : * hook breaks fundamental assumptions of guc.c. So long as all the variables
4779 : * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4780 : * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4781 : * that we have odd behaviors such as unexpected GUC ordering dependencies.
4782 : */
4783 :
4784 : pg_noreturn static void
4785 1 : error_multiple_recovery_targets(void)
4786 : {
4787 1 : ereport(ERROR,
4788 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4789 : errmsg("multiple recovery targets specified"),
4790 : errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4791 : }
4792 :
4793 : /*
4794 : * GUC check_hook for recovery_target
4795 : */
4796 : bool
4797 1281 : check_recovery_target(char **newval, void **extra, GucSource source)
4798 : {
4799 1281 : if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4800 : {
4801 0 : GUC_check_errdetail("The only allowed value is \"immediate\".");
4802 0 : return false;
4803 : }
4804 1281 : return true;
4805 : }
4806 :
4807 : /*
4808 : * GUC assign_hook for recovery_target
4809 : */
4810 : void
4811 1281 : assign_recovery_target(const char *newval, void *extra)
4812 : {
4813 1281 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4814 0 : recoveryTarget != RECOVERY_TARGET_IMMEDIATE)
4815 0 : error_multiple_recovery_targets();
4816 :
4817 1281 : if (newval && strcmp(newval, "") != 0)
4818 1 : recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
4819 : else
4820 1280 : recoveryTarget = RECOVERY_TARGET_UNSET;
4821 1281 : }
4822 :
4823 : /*
4824 : * GUC check_hook for recovery_target_lsn
4825 : */
4826 : bool
4827 1287 : check_recovery_target_lsn(char **newval, void **extra, GucSource source)
4828 : {
4829 1287 : if (strcmp(*newval, "") != 0)
4830 : {
4831 : XLogRecPtr lsn;
4832 : XLogRecPtr *myextra;
4833 8 : ErrorSaveContext escontext = {T_ErrorSaveContext};
4834 :
4835 8 : lsn = pg_lsn_in_safe(*newval, (Node *) &escontext);
4836 8 : if (escontext.error_occurred)
4837 0 : return false;
4838 :
4839 8 : myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
4840 8 : if (!myextra)
4841 0 : return false;
4842 8 : *myextra = lsn;
4843 8 : *extra = myextra;
4844 : }
4845 1287 : return true;
4846 : }
4847 :
4848 : /*
4849 : * GUC assign_hook for recovery_target_lsn
4850 : */
4851 : void
4852 1287 : assign_recovery_target_lsn(const char *newval, void *extra)
4853 : {
4854 1287 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4855 0 : recoveryTarget != RECOVERY_TARGET_LSN)
4856 0 : error_multiple_recovery_targets();
4857 :
4858 1287 : if (newval && strcmp(newval, "") != 0)
4859 : {
4860 8 : recoveryTarget = RECOVERY_TARGET_LSN;
4861 8 : recoveryTargetLSN = *((XLogRecPtr *) extra);
4862 : }
4863 : else
4864 1279 : recoveryTarget = RECOVERY_TARGET_UNSET;
4865 1287 : }
4866 :
4867 : /*
4868 : * GUC check_hook for recovery_target_name
4869 : */
4870 : bool
4871 1287 : check_recovery_target_name(char **newval, void **extra, GucSource source)
4872 : {
4873 : /* Use the value of newval directly */
4874 1287 : if (strlen(*newval) >= MAXFNAMELEN)
4875 : {
4876 0 : GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4877 : "recovery_target_name", MAXFNAMELEN - 1);
4878 0 : return false;
4879 : }
4880 1287 : return true;
4881 : }
4882 :
4883 : /*
4884 : * GUC assign_hook for recovery_target_name
4885 : */
4886 : void
4887 1287 : assign_recovery_target_name(const char *newval, void *extra)
4888 : {
4889 1287 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4890 0 : recoveryTarget != RECOVERY_TARGET_NAME)
4891 0 : error_multiple_recovery_targets();
4892 :
4893 1287 : if (newval && strcmp(newval, "") != 0)
4894 : {
4895 6 : recoveryTarget = RECOVERY_TARGET_NAME;
4896 6 : recoveryTargetName = newval;
4897 : }
4898 : else
4899 1281 : recoveryTarget = RECOVERY_TARGET_UNSET;
4900 1287 : }
4901 :
4902 : /*
4903 : * GUC check_hook for recovery_target_time
4904 : *
4905 : * The interpretation of the recovery_target_time string can depend on the
4906 : * time zone setting, so we need to wait until after all GUC processing is
4907 : * done before we can do the final parsing of the string. This check function
4908 : * only does a parsing pass to catch syntax errors, but we store the string
4909 : * and parse it again when we need to use it.
4910 : */
4911 : bool
4912 1283 : check_recovery_target_time(char **newval, void **extra, GucSource source)
4913 : {
4914 1283 : if (strcmp(*newval, "") != 0)
4915 : {
4916 : /* reject some special values */
4917 3 : if (strcmp(*newval, "now") == 0 ||
4918 3 : strcmp(*newval, "today") == 0 ||
4919 3 : strcmp(*newval, "tomorrow") == 0 ||
4920 3 : strcmp(*newval, "yesterday") == 0)
4921 : {
4922 0 : return false;
4923 : }
4924 :
4925 : /*
4926 : * parse timestamp value (see also timestamptz_in())
4927 : */
4928 : {
4929 3 : char *str = *newval;
4930 : fsec_t fsec;
4931 : struct pg_tm tt,
4932 3 : *tm = &tt;
4933 : int tz;
4934 : int dtype;
4935 : int nf;
4936 : int dterr;
4937 : char *field[MAXDATEFIELDS];
4938 : int ftype[MAXDATEFIELDS];
4939 : char workbuf[MAXDATELEN + MAXDATEFIELDS];
4940 : DateTimeErrorExtra dtextra;
4941 : TimestampTz timestamp;
4942 :
4943 3 : dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4944 : field, ftype, MAXDATEFIELDS, &nf);
4945 3 : if (dterr == 0)
4946 3 : dterr = DecodeDateTime(field, ftype, nf,
4947 : &dtype, tm, &fsec, &tz, &dtextra);
4948 3 : if (dterr != 0)
4949 0 : return false;
4950 3 : if (dtype != DTK_DATE)
4951 0 : return false;
4952 :
4953 3 : if (tm2timestamp(tm, fsec, &tz, ×tamp) != 0)
4954 : {
4955 0 : GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
4956 0 : return false;
4957 : }
4958 : }
4959 : }
4960 1283 : return true;
4961 : }
4962 :
4963 : /*
4964 : * GUC assign_hook for recovery_target_time
4965 : */
4966 : void
4967 1283 : assign_recovery_target_time(const char *newval, void *extra)
4968 : {
4969 1283 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4970 1 : recoveryTarget != RECOVERY_TARGET_TIME)
4971 1 : error_multiple_recovery_targets();
4972 :
4973 1282 : if (newval && strcmp(newval, "") != 0)
4974 2 : recoveryTarget = RECOVERY_TARGET_TIME;
4975 : else
4976 1280 : recoveryTarget = RECOVERY_TARGET_UNSET;
4977 1282 : }
4978 :
4979 : /*
4980 : * GUC check_hook for recovery_target_timeline
4981 : */
4982 : bool
4983 1284 : check_recovery_target_timeline(char **newval, void **extra, GucSource source)
4984 : {
4985 : RecoveryTargetTimeLineGoal rttg;
4986 : RecoveryTargetTimeLineGoal *myextra;
4987 :
4988 1284 : if (strcmp(*newval, "current") == 0)
4989 0 : rttg = RECOVERY_TARGET_TIMELINE_CONTROLFILE;
4990 1284 : else if (strcmp(*newval, "latest") == 0)
4991 1281 : rttg = RECOVERY_TARGET_TIMELINE_LATEST;
4992 : else
4993 : {
4994 : char *endp;
4995 : uint64 timeline;
4996 :
4997 3 : rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
4998 :
4999 3 : errno = 0;
5000 3 : timeline = strtou64(*newval, &endp, 0);
5001 :
5002 3 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
5003 : {
5004 1 : GUC_check_errdetail("\"%s\" is not a valid number.",
5005 : "recovery_target_timeline");
5006 3 : return false;
5007 : }
5008 :
5009 2 : if (timeline < 1 || timeline > PG_UINT32_MAX)
5010 : {
5011 2 : GUC_check_errdetail("\"%s\" must be between %u and %u.",
5012 : "recovery_target_timeline", 1, PG_UINT32_MAX);
5013 2 : return false;
5014 : }
5015 : }
5016 :
5017 1281 : myextra = (RecoveryTargetTimeLineGoal *) guc_malloc(LOG, sizeof(RecoveryTargetTimeLineGoal));
5018 1281 : if (!myextra)
5019 0 : return false;
5020 1281 : *myextra = rttg;
5021 1281 : *extra = myextra;
5022 :
5023 1281 : return true;
5024 : }
5025 :
5026 : /*
5027 : * GUC assign_hook for recovery_target_timeline
5028 : */
5029 : void
5030 1281 : assign_recovery_target_timeline(const char *newval, void *extra)
5031 : {
5032 1281 : recoveryTargetTimeLineGoal = *((RecoveryTargetTimeLineGoal *) extra);
5033 1281 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
5034 0 : recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5035 : else
5036 1281 : recoveryTargetTLIRequested = 0;
5037 1281 : }
5038 :
5039 : /*
5040 : * GUC check_hook for recovery_target_xid
5041 : */
5042 : bool
5043 1283 : check_recovery_target_xid(char **newval, void **extra, GucSource source)
5044 : {
5045 1283 : if (strcmp(*newval, "") != 0)
5046 : {
5047 : TransactionId xid;
5048 : TransactionId *myextra;
5049 : char *endp;
5050 : char *val;
5051 :
5052 3 : errno = 0;
5053 :
5054 : /*
5055 : * Consume leading whitespace to determine if number is negative
5056 : */
5057 3 : val = *newval;
5058 :
5059 3 : while (isspace((unsigned char) *val))
5060 0 : val++;
5061 :
5062 : /*
5063 : * This cast will remove the epoch, if any
5064 : */
5065 3 : xid = (TransactionId) strtou64(val, &endp, 0);
5066 :
5067 3 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE || *val == '-')
5068 : {
5069 2 : GUC_check_errdetail("\"%s\" is not a valid number.",
5070 : "recovery_target_xid");
5071 2 : return false;
5072 : }
5073 :
5074 1 : if (xid < FirstNormalTransactionId)
5075 : {
5076 0 : GUC_check_errdetail("\"%s\" without epoch must be greater than or equal to %u.",
5077 : "recovery_target_xid",
5078 : FirstNormalTransactionId);
5079 0 : return false;
5080 : }
5081 :
5082 1 : myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId));
5083 1 : if (!myextra)
5084 0 : return false;
5085 1 : *myextra = xid;
5086 1 : *extra = myextra;
5087 : }
5088 1281 : return true;
5089 : }
5090 :
5091 : /*
5092 : * GUC assign_hook for recovery_target_xid
5093 : */
5094 : void
5095 1281 : assign_recovery_target_xid(const char *newval, void *extra)
5096 : {
5097 1281 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
5098 0 : recoveryTarget != RECOVERY_TARGET_XID)
5099 0 : error_multiple_recovery_targets();
5100 :
5101 1281 : if (newval && strcmp(newval, "") != 0)
5102 : {
5103 1 : recoveryTarget = RECOVERY_TARGET_XID;
5104 1 : recoveryTargetXid = *((TransactionId *) extra);
5105 : }
5106 : else
5107 1280 : recoveryTarget = RECOVERY_TARGET_UNSET;
5108 1281 : }
|