Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * xlogrecovery.c
4 : * Functions for WAL recovery, standby mode
5 : *
6 : * This source file contains functions controlling WAL recovery.
7 : * InitWalRecovery() initializes the system for crash or archive recovery,
8 : * or standby mode, depending on configuration options and the state of
9 : * the control file and possible backup label file. PerformWalRecovery()
10 : * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 : * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 : * and prepares information needed to initialize the WAL for writes. In
13 : * addition to these three main functions, there are a bunch of functions
14 : * for interrogating recovery state and controlling the recovery process.
15 : *
16 : *
17 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
18 : * Portions Copyright (c) 1994, Regents of the University of California
19 : *
20 : * src/backend/access/transam/xlogrecovery.c
21 : *
22 : *-------------------------------------------------------------------------
23 : */
24 :
25 : #include "postgres.h"
26 :
27 : #include <ctype.h>
28 : #include <time.h>
29 : #include <sys/stat.h>
30 : #include <sys/time.h>
31 : #include <unistd.h>
32 :
33 : #include "access/timeline.h"
34 : #include "access/transam.h"
35 : #include "access/xact.h"
36 : #include "access/xlog_internal.h"
37 : #include "access/xlogarchive.h"
38 : #include "access/xlogprefetcher.h"
39 : #include "access/xlogreader.h"
40 : #include "access/xlogrecovery.h"
41 : #include "access/xlogutils.h"
42 : #include "access/xlogwait.h"
43 : #include "backup/basebackup.h"
44 : #include "catalog/pg_control.h"
45 : #include "commands/tablespace.h"
46 : #include "common/file_utils.h"
47 : #include "miscadmin.h"
48 : #include "nodes/miscnodes.h"
49 : #include "pgstat.h"
50 : #include "postmaster/bgwriter.h"
51 : #include "postmaster/startup.h"
52 : #include "replication/slot.h"
53 : #include "replication/slotsync.h"
54 : #include "replication/walreceiver.h"
55 : #include "storage/fd.h"
56 : #include "storage/ipc.h"
57 : #include "storage/latch.h"
58 : #include "storage/pmsignal.h"
59 : #include "storage/procarray.h"
60 : #include "storage/spin.h"
61 : #include "utils/datetime.h"
62 : #include "utils/fmgrprotos.h"
63 : #include "utils/guc_hooks.h"
64 : #include "utils/pgstat_internal.h"
65 : #include "utils/pg_lsn.h"
66 : #include "utils/ps_status.h"
67 : #include "utils/pg_rusage.h"
68 : #include "utils/wait_event.h"
69 :
70 : /* Unsupported old recovery command file names (relative to $PGDATA) */
71 : #define RECOVERY_COMMAND_FILE "recovery.conf"
72 : #define RECOVERY_COMMAND_DONE "recovery.done"
73 :
74 : /*
75 : * GUC support: table mapping the option names "pause", "promote" and "shutdown" to their RECOVERY_TARGET_ACTION_* values.
76 : */
77 : const struct config_enum_entry recovery_target_action_options[] = {
78 : {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
79 : {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
80 : {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
81 : {NULL, 0, false} /* NULL-name entry; presumably the end-of-list marker -- confirm against config_enum_entry users */
82 : };
83 :
84 : /* options formerly taken from recovery.conf for archive recovery */
85 : char *recoveryRestoreCommand = NULL;
86 : char *recoveryEndCommand = NULL;
87 : char *archiveCleanupCommand = NULL;
88 : RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
89 : bool recoveryTargetInclusive = true;
90 : int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
91 : TransactionId recoveryTargetXid;
92 : char *recovery_target_time_string;
93 : TimestampTz recoveryTargetTime;
94 : const char *recoveryTargetName;
95 : XLogRecPtr recoveryTargetLSN;
96 : int recovery_min_apply_delay = 0;
97 :
98 : /* options formerly taken from recovery.conf for XLOG streaming */
99 : char *PrimaryConnInfo = NULL;
100 : char *PrimarySlotName = NULL;
101 : bool wal_receiver_create_temp_slot = false;
102 :
103 : /*
104 : * recoveryTargetTimeLineGoal: what the user requested, if any
105 : *
106 : * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
107 : *
108 : * recoveryTargetTLI: the currently understood target timeline; this can change during recovery
109 : *
110 : * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
111 : * the timelines of its known parents, newest first (so recoveryTargetTLI is
112 : * always the first list member). Only these TLIs are expected to be seen in
113 : * the WAL segments we read, and indeed only these TLIs will be considered as
114 : * candidate WAL files to open at all.
115 : *
116 : * curFileTLI: the TLI appearing in the name of the current input WAL file.
117 : * (This is not necessarily the same as the timeline from which we are
118 : * replaying WAL, which StartupXLOG calls replayTLI, because we could be
119 : * scanning data that was copied from an ancestor timeline when the current
120 : * file was created.) During a sequential scan we do not allow this value
121 : * to decrease.
122 : */
123 : RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
124 : TimeLineID recoveryTargetTLIRequested = 0;
125 : TimeLineID recoveryTargetTLI = 0;
126 : static List *expectedTLEs;
127 : static TimeLineID curFileTLI;
128 :
129 : /*
130 : * When ArchiveRecoveryRequested is set, archive recovery was requested,
131 : * i.e. signal files were present. When InArchiveRecovery is set, we are
132 : * currently recovering using offline XLOG archives. These variables are only
133 : * valid in the startup process.
134 : *
135 : * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
136 : * currently performing crash recovery using only XLOG files in pg_wal, but
137 : * will switch to using offline XLOG archives as soon as we reach the end of
138 : * WAL in pg_wal.
139 : */
140 : bool ArchiveRecoveryRequested = false;
141 : bool InArchiveRecovery = false;
142 :
143 : /*
144 : * When StandbyModeRequested is set, standby mode was requested, i.e.
145 : * standby.signal file was present. When StandbyMode is set, we are currently
146 : * in standby mode. These variables are only valid in the startup process.
147 : * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
148 : */
149 : static bool StandbyModeRequested = false;
150 : bool StandbyMode = false;
151 :
152 : /* was a signal file present at startup? */
153 : static bool standby_signal_file_found = false;
154 : static bool recovery_signal_file_found = false;
155 :
156 : /*
157 : * CheckPointLoc is the position of the checkpoint record that determines
158 : * where to start the replay. It comes from the backup label file or the
159 : * control file.
160 : *
161 : * RedoStartLSN is the checkpoint's REDO location, also from the backup label
162 : * file or the control file. In standby mode, XLOG streaming usually starts
163 : * from the position where an invalid record was found. But if we fail to
164 : * read even the initial checkpoint record, we use the REDO location instead
165 : * of the checkpoint location as the start position of XLOG streaming.
166 : * Otherwise we would have to jump backwards to the REDO location after
167 : * reading the checkpoint record, because the REDO record can precede the
168 : * checkpoint record.
169 : */
170 : static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
171 : static TimeLineID CheckPointTLI = 0;
172 : static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
173 : static TimeLineID RedoStartTLI = 0;
174 :
175 : /*
176 : * Local copy of SharedHotStandbyActive variable. False actually means "not
177 : * known, need to check the shared state".
178 : */
179 : static bool LocalHotStandbyActive = false;
180 :
181 : /*
182 : * Local copy of SharedPromoteIsTriggered variable. False actually means "not
183 : * known, need to check the shared state".
184 : */
185 : static bool LocalPromoteIsTriggered = false;
186 :
187 : /* Has the recovery code requested a walreceiver wakeup? */
188 : static bool doRequestWalReceiverReply;
189 :
190 : /* XLogReader object used to parse the WAL records */
191 : static XLogReaderState *xlogreader = NULL;
192 :
193 : /* XLogPrefetcher object used to consume WAL records with read-ahead */
194 : static XLogPrefetcher *xlogprefetcher = NULL;
195 :
196 : /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
197 : typedef struct XLogPageReadPrivate
198 : {
199 : int emode; /* same name as ReadRecord()'s emode argument; presumably the log level for read problems -- TODO confirm */
200 : bool fetching_ckpt; /* are we fetching a checkpoint record? */
201 : bool randAccess; /* same name as WaitForWALToBecomeAvailable()'s randAccess argument; exact meaning not visible here */
202 : TimeLineID replayTLI; /* same name as ReadRecord()'s replayTLI argument */
203 : } XLogPageReadPrivate;
204 :
205 : /* flag to tell XLogPageRead that we have started replaying */
206 : static bool InRedo = false;
207 :
208 : /*
209 : * Codes indicating where we got a WAL file from during recovery, or where
210 : * to attempt to get one. XLOG_FROM_ANY is explicitly zero; the remaining values follow in declaration order.
211 : */
212 : typedef enum
213 : {
214 : XLOG_FROM_ANY = 0, /* request to read WAL from any source */
215 : XLOG_FROM_ARCHIVE, /* restored using restore_command */
216 : XLOG_FROM_PG_WAL, /* existing file in pg_wal */
217 : XLOG_FROM_STREAM, /* streamed from primary */
218 : } XLogSource;
219 :
220 : /* human-readable names for XLogSources, for debugging output; listed in the same order as the enum above (presumably indexed by XLogSource value) */
221 : static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
222 :
223 : /*
224 : * readFile is -1 or a kernel FD for the log file segment that's currently
225 : * open for reading. readSegNo identifies the segment. readOff is the offset
226 : * of the page just read, readLen indicates how much of it has been read into
227 : * readBuf, and readSource indicates where we got the currently open file from.
228 : *
229 : * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
230 : * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
231 : * worthwhile, since the XLOG is not read by general-purpose sessions.
232 : */
233 : static int readFile = -1;
234 : static XLogSegNo readSegNo = 0;
235 : static uint32 readOff = 0;
236 : static uint32 readLen = 0;
237 : static XLogSource readSource = XLOG_FROM_ANY;
238 :
239 : /*
240 : * Keeps track of which source we're currently reading from. This is
241 : * different from readSource in that this is always set, even when we don't
242 : * currently have a WAL file open. If lastSourceFailed is set, our last
243 : * attempt to read from currentSource failed, and we should try another source
244 : * next.
245 : *
246 : * pendingWalRcvRestart is set when a config change occurs that requires a
247 : * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
248 : */
249 : static XLogSource currentSource = XLOG_FROM_ANY;
250 : static bool lastSourceFailed = false;
251 : static bool pendingWalRcvRestart = false;
252 :
253 : /*
254 : * These variables track when we last obtained some WAL data to process,
255 : * and where we got it from. (XLogReceiptSource is initially the same as
256 : * readSource, but readSource gets reset to zero when we don't have data
257 : * to process right now. It is also different from currentSource, which
258 : * also changes when we try to read from a source and fail, while
259 : * XLogReceiptSource tracks where we last successfully read some WAL.)
260 : */
261 : static TimestampTz XLogReceiptTime = 0;
262 : static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
263 :
264 : /* Local copy of WalRcv->flushedUpto */
265 : static XLogRecPtr flushedUpto = InvalidXLogRecPtr;
266 : static TimeLineID receiveTLI = 0;
267 :
268 : /*
269 : * Copy of minRecoveryPoint and backupEndPoint from the control file.
270 : *
271 : * In order to reach consistency, we must replay the WAL up to
272 : * minRecoveryPoint. If backupEndRequired is true, we must also reach
273 : * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
274 : * to backupStartPoint.
275 : *
276 : * Note: In archive recovery, after consistency has been reached, the
277 : * functions in xlog.c will start updating minRecoveryPoint in the control
278 : * file. But this copy of minRecoveryPoint variable reflects the value at the
279 : * beginning of recovery, and is *not* updated after consistency is reached.
280 : */
281 : static XLogRecPtr minRecoveryPoint;
282 : static TimeLineID minRecoveryPointTLI;
283 :
284 : static XLogRecPtr backupStartPoint;
285 : static XLogRecPtr backupEndPoint;
286 : static bool backupEndRequired = false;
287 :
288 : /*
289 : * Have we reached a consistent database state? In crash recovery, we have
290 : * to replay all the WAL, so reachedConsistency is never set. During archive
291 : * recovery, the database is consistent once minRecoveryPoint is reached.
292 : *
293 : * Consistent state means that the system is internally consistent, all
294 : * the WAL has been replayed up to a certain point, and importantly, there
295 : * is no trace of later actions on disk.
296 : *
297 : * This flag is used only by the startup process and postmaster. When
298 : * minRecoveryPoint is reached, the startup process sets it to true and
299 : * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
300 : * which then sets it to true upon receiving the signal.
301 : */
302 : bool reachedConsistency = false;
303 :
304 : /* Buffers dedicated to consistency checks of size BLCKSZ */
305 : static char *replay_image_masked = NULL;
306 : static char *primary_image_masked = NULL;
307 :
308 : XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
309 :
310 : /*
311 : * abortedRecPtr is the start pointer of a broken record at end of WAL when
312 : * recovery completes; missingContrecPtr is the location of the first
313 : * contrecord that went missing. See CreateOverwriteContrecordRecord for
314 : * details.
315 : */
316 : static XLogRecPtr abortedRecPtr;
317 : static XLogRecPtr missingContrecPtr;
318 :
319 : /*
320 : * if recoveryStopsBefore/After returns true, it saves information of the stop
321 : * point here
322 : */
323 : static TransactionId recoveryStopXid;
324 : static TimestampTz recoveryStopTime;
325 : static XLogRecPtr recoveryStopLSN;
326 : static char recoveryStopName[MAXFNAMELEN];
327 : static bool recoveryStopAfter;
328 :
329 : /* prototypes for local functions */
330 : static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
331 :
332 : static void EnableStandbyMode(void);
333 : static void readRecoverySignalFile(void);
334 : static void validateRecoveryParameters(void);
335 : static bool read_backup_label(XLogRecPtr *checkPointLoc,
336 : TimeLineID *backupLabelTLI,
337 : bool *backupEndRequired, bool *backupFromStandby);
338 : static bool read_tablespace_map(List **tablespaces);
339 :
340 : static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
341 : static void CheckRecoveryConsistency(void);
342 : static void rm_redo_error_callback(void *arg);
343 : #ifdef WAL_DEBUG
344 : static void xlog_outrec(StringInfo buf, XLogReaderState *record);
345 : #endif
346 : static void xlog_block_info(StringInfo buf, XLogReaderState *record);
347 : static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
348 : TimeLineID prevTLI, TimeLineID replayTLI);
349 : static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
350 : static void verifyBackupPageConsistency(XLogReaderState *record);
351 :
352 : static bool recoveryStopsBefore(XLogReaderState *record);
353 : static bool recoveryStopsAfter(XLogReaderState *record);
354 : static char *getRecoveryStopReason(void);
355 : static void recoveryPausesHere(bool endOfRecovery);
356 : static bool recoveryApplyDelay(XLogReaderState *record);
357 : static void ConfirmRecoveryPaused(void);
358 :
359 : static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher,
360 : int emode, bool fetching_ckpt,
361 : TimeLineID replayTLI);
362 :
363 : static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
364 : int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
365 : static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
366 : bool randAccess,
367 : bool fetching_ckpt,
368 : XLogRecPtr tliRecPtr,
369 : TimeLineID replayTLI,
370 : XLogRecPtr replayLSN,
371 : bool nonblocking);
372 : static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
373 : static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
374 : XLogRecPtr RecPtr, TimeLineID replayTLI);
375 : static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
376 : static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
377 : XLogSource source, bool notfoundOk);
378 : static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source);
379 :
380 : static bool CheckForStandbyTrigger(void);
381 : static void SetPromoteIsTriggered(void);
382 : static bool HotStandbyActiveInReplay(void);
383 :
384 : static void SetCurrentChunkStartTime(TimestampTz xtime);
385 : static void SetLatestXTime(TimestampTz xtime);
386 :
387 : /*
388 : * Initialization of shared memory for WAL recovery. XLogRecoveryShmemSize() returns the number of bytes requested: currently sizeof(XLogRecoveryCtlData) and nothing else.
389 : */
390 : Size
391 3387 : XLogRecoveryShmemSize(void)
392 : {
393 : Size size;
394 :
395 : /* XLogRecoveryCtl: the only structure accounted for here */
396 3387 : size = sizeof(XLogRecoveryCtlData);
397 :
398 3387 : return size;
399 : }
400 :
401 : void
402 1180 : XLogRecoveryShmemInit(void) /* Point XLogRecoveryCtl at the "XLOG Recovery Ctl" shared struct and, unless it was already found, zero it and initialize its members. */
403 : {
404 : bool found; /* output flag filled in by ShmemInitStruct() */
405 :
406 1180 : XLogRecoveryCtl = (XLogRecoveryCtlData *)
407 1180 : ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
408 1180 : if (found)
409 0 : return; /* struct reported as already present: leave its contents untouched */
410 1180 : memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
411 : /* After zeroing, set up the spinlock, latch and condition variable stored in the struct. */
412 1180 : SpinLockInit(&XLogRecoveryCtl->info_lck);
413 1180 : InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
414 1180 : ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
415 : }
416 :
417 : /*
418 : * A thin wrapper to enable StandbyMode and do other preparatory work as
419 : * needed. Currently that means setting the StandbyMode flag and disabling the startup progress timeout.
420 : */
421 : static void
422 112 : EnableStandbyMode(void)
423 : {
424 112 : StandbyMode = true;
425 :
426 : /*
427 : * To avoid server log bloat, we don't report recovery progress in a
428 : * standby as it will always be in recovery unless promoted. We disable
429 : * startup progress timeout in standby mode to avoid calling
430 : * startup_progress_timeout_handler() unnecessarily.
431 : */
432 112 : disable_startup_progress_timeout();
433 112 : }
434 :
435 : /*
436 : * Prepare the system for WAL recovery, if needed.
437 : *
438 : * This is called by StartupXLOG() which coordinates the server startup
439 : * sequence. This function analyzes the control file and the backup label
440 : * file, if any, and figures out whether we need to perform crash recovery or
441 : * archive recovery, and how far we need to replay the WAL to reach a
442 : * consistent state.
443 : *
444 : * This doesn't yet change the on-disk state, except for creating the symlinks
445 : * from table space map file if any, and for fetching WAL files needed to find
446 : * the checkpoint record. On entry, the caller has already read the control
447 : * file into memory, and passes it as argument. This function updates it to
448 : * reflect the recovery state, and the caller is expected to write it back to
449 : * disk after initializing other subsystems, but before calling
450 : * PerformWalRecovery().
451 : *
452 : * This initializes some global variables, such as ArchiveRecoveryRequested,
453 : * StandbyModeRequested and InRecovery.
454 : */
455 : void
456 1035 : InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
457 : bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
458 : {
459 : XLogPageReadPrivate *private;
460 : struct stat st;
461 : bool wasShutdown;
462 : XLogRecord *record;
463 : DBState dbstate_at_startup;
464 1035 : bool haveTblspcMap = false;
465 1035 : bool haveBackupLabel = false;
466 : CheckPoint checkPoint;
467 1035 : bool backupFromStandby = false;
468 :
469 1035 : dbstate_at_startup = ControlFile->state;
470 :
471 : /*
472 : * Initialize on the assumption we want to recover to the latest timeline
473 : * that's active according to pg_control.
474 : */
475 1035 : if (ControlFile->minRecoveryPointTLI >
476 1035 : ControlFile->checkPointCopy.ThisTimeLineID)
477 1 : recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
478 : else
479 1034 : recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
480 :
481 : /*
482 : * Check for signal files, and if so set up state for offline recovery
483 : */
484 1035 : readRecoverySignalFile();
485 1035 : validateRecoveryParameters();
486 :
487 : /*
488 : * Take ownership of the wakeup latch if archive recovery was requested,
489 : * since we may need to sleep on it during recovery.
490 : */
491 1035 : if (ArchiveRecoveryRequested)
492 117 : OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
493 :
494 : /*
495 : * Set the WAL reading processor now, as it will be needed when reading
496 : * the checkpoint record required (backup_label or not).
497 : */
498 1035 : private = palloc0_object(XLogPageReadPrivate);
499 1035 : xlogreader =
500 1035 : XLogReaderAllocate(wal_segment_size, NULL,
501 1035 : XL_ROUTINE(.page_read = &XLogPageRead,
502 : .segment_open = NULL,
503 : .segment_close = wal_segment_close),
504 : private);
505 1035 : if (!xlogreader)
506 0 : ereport(ERROR,
507 : (errcode(ERRCODE_OUT_OF_MEMORY),
508 : errmsg("out of memory"),
509 : errdetail("Failed while allocating a WAL reading processor.")));
510 1035 : xlogreader->system_identifier = ControlFile->system_identifier;
511 :
512 : /*
513 : * Set the WAL decode buffer size. This limits how far ahead we can read
514 : * in the WAL.
515 : */
516 1035 : XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size);
517 :
518 : /* Create a WAL prefetcher. */
519 1035 : xlogprefetcher = XLogPrefetcherAllocate(xlogreader);
520 :
521 : /*
522 : * Allocate two page buffers dedicated to WAL consistency checks. We do
523 : * it this way, rather than just making static arrays, for two reasons:
524 : * (1) no need to waste the storage in most instantiations of the backend;
525 : * (2) a static char array isn't guaranteed to have any particular
526 : * alignment, whereas palloc() will provide MAXALIGN'd storage.
527 : */
528 1035 : replay_image_masked = (char *) palloc(BLCKSZ);
529 1035 : primary_image_masked = (char *) palloc(BLCKSZ);
530 :
531 : /*
532 : * Read the backup_label file. We want to run this part of the recovery
533 : * process after checking for signal files and after performing validation
534 : * of the recovery parameters.
535 : */
536 1035 : if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
537 : &backupFromStandby))
538 : {
539 80 : List *tablespaces = NIL;
540 :
541 : /*
542 : * Archive recovery was requested, and thanks to the backup label
543 : * file, we know how far we need to replay to reach consistency. Enter
544 : * archive recovery directly.
545 : */
546 80 : InArchiveRecovery = true;
547 80 : if (StandbyModeRequested)
548 68 : EnableStandbyMode();
549 :
550 : /*
551 : * Omitting backup_label when creating a new replica, PITR node etc.
552 : * unfortunately is a common cause of corruption. Logging that
553 : * backup_label was used makes it a bit easier to exclude that as the
554 : * cause of observed corruption.
555 : *
556 : * Do so before we try to read the checkpoint record (which can fail),
557 : * as otherwise it can be hard to understand why a checkpoint other
558 : * than ControlFile->checkPoint is used.
559 : */
560 80 : ereport(LOG,
561 : errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
562 : LSN_FORMAT_ARGS(RedoStartLSN),
563 : LSN_FORMAT_ARGS(CheckPointLoc),
564 : CheckPointTLI));
565 :
566 : /*
567 : * When a backup_label file is present, we want to roll forward from
568 : * the checkpoint it identifies, rather than using pg_control.
569 : */
570 80 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
571 : CheckPointTLI);
572 80 : if (record != NULL)
573 : {
574 80 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
575 80 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
576 80 : ereport(DEBUG1,
577 : errmsg_internal("checkpoint record is at %X/%08X",
578 : LSN_FORMAT_ARGS(CheckPointLoc)));
579 80 : InRecovery = true; /* force recovery even if SHUTDOWNED */
580 :
581 : /*
582 : * Make sure that REDO location exists. This may not be the case
583 : * if there was a crash during an online backup, which left a
584 : * backup_label around that references a WAL segment that's
585 : * already been archived.
586 : */
587 80 : if (checkPoint.redo < CheckPointLoc)
588 : {
589 80 : XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
590 80 : if (!ReadRecord(xlogprefetcher, LOG, false,
591 : checkPoint.ThisTimeLineID))
592 0 : ereport(FATAL,
593 : errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
594 : LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
595 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
596 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
597 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
598 : DataDir, DataDir, DataDir, DataDir));
599 : }
600 : }
601 : else
602 : {
603 0 : ereport(FATAL,
604 : errmsg("could not locate required checkpoint record at %X/%08X",
605 : LSN_FORMAT_ARGS(CheckPointLoc)),
606 : errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
607 : "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
608 : "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
609 : DataDir, DataDir, DataDir, DataDir));
610 : wasShutdown = false; /* keep compiler quiet */
611 : }
612 :
613 : /* Read the tablespace_map file if present and create symlinks. */
614 80 : if (read_tablespace_map(&tablespaces))
615 : {
616 : ListCell *lc;
617 :
618 4 : foreach(lc, tablespaces)
619 : {
620 2 : tablespaceinfo *ti = lfirst(lc);
621 : char *linkloc;
622 :
623 2 : linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
624 :
625 : /*
626 : * Remove the existing symlink, if any, and create the new symlink
627 : * under PGDATA.
628 : */
629 2 : remove_tablespace_symlink(linkloc);
630 :
631 2 : if (symlink(ti->path, linkloc) < 0)
632 0 : ereport(ERROR,
633 : (errcode_for_file_access(),
634 : errmsg("could not create symbolic link \"%s\": %m",
635 : linkloc)));
636 :
637 2 : pfree(ti->path);
638 2 : pfree(ti);
639 : }
640 :
641 : /* tell the caller to delete it later */
642 2 : haveTblspcMap = true;
643 : }
644 :
645 : /* tell the caller to delete it later */
646 80 : haveBackupLabel = true;
647 : }
648 : else
649 : {
650 : /* No backup_label file has been found if we are here. */
651 :
652 : /*
653 : * If tablespace_map file is present without backup_label file, there
654 : * is no use of such file. There is no harm in retaining it, but it
655 : * is better to get rid of the map file so that we don't have any
656 : * redundant file in data directory and it will avoid any sort of
657 : * confusion. It seems prudent though to just rename the file out of
658 : * the way rather than delete it completely, also we ignore any error
659 : * that occurs in rename operation as even if map file is present
660 : * without backup_label file, it is harmless.
661 : */
662 955 : if (stat(TABLESPACE_MAP, &st) == 0)
663 : {
664 1 : unlink(TABLESPACE_MAP_OLD);
665 1 : if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
666 1 : ereport(LOG,
667 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
668 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
669 : errdetail("File \"%s\" was renamed to \"%s\".",
670 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
671 : else
672 0 : ereport(LOG,
673 : (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
674 : TABLESPACE_MAP, BACKUP_LABEL_FILE),
675 : errdetail("Could not rename file \"%s\" to \"%s\": %m.",
676 : TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
677 : }
678 :
679 : /*
680 : * It's possible that archive recovery was requested, but we don't
681 : * know how far we need to replay the WAL before we reach consistency.
682 : * This can happen for example if a base backup is taken from a
683 : * running server using an atomic filesystem snapshot, without calling
684 : * pg_backup_start/stop. Or if you just kill a running primary server
685 : * and put it into archive recovery by creating a recovery signal
686 : * file.
687 : *
688 : * Our strategy in that case is to perform crash recovery first,
689 : * replaying all the WAL present in pg_wal, and only enter archive
690 : * recovery after that.
691 : *
692 : * But usually we already know how far we need to replay the WAL (up
693 : * to minRecoveryPoint, up to backupEndPoint, or until we see an
694 : * end-of-backup record), and we can enter archive recovery directly.
695 : */
696 955 : if (ArchiveRecoveryRequested &&
697 44 : (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) ||
698 9 : ControlFile->backupEndRequired ||
699 9 : XLogRecPtrIsValid(ControlFile->backupEndPoint) ||
700 9 : ControlFile->state == DB_SHUTDOWNED))
701 : {
702 43 : InArchiveRecovery = true;
703 43 : if (StandbyModeRequested)
704 43 : EnableStandbyMode();
705 : }
706 :
707 : /*
708 : * For the same reason as when starting up with backup_label present,
709 : * emit a log message when we continue initializing from a base
710 : * backup.
711 : */
712 955 : if (XLogRecPtrIsValid(ControlFile->backupStartPoint))
713 0 : ereport(LOG,
714 : errmsg("restarting backup recovery with redo LSN %X/%08X",
715 : LSN_FORMAT_ARGS(ControlFile->backupStartPoint)));
716 :
717 : /* Get the last valid checkpoint record. */
718 955 : CheckPointLoc = ControlFile->checkPoint;
719 955 : CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
720 955 : RedoStartLSN = ControlFile->checkPointCopy.redo;
721 955 : RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
722 955 : record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
723 : CheckPointTLI);
724 955 : if (record != NULL)
725 : {
726 954 : ereport(DEBUG1,
727 : errmsg_internal("checkpoint record is at %X/%08X",
728 : LSN_FORMAT_ARGS(CheckPointLoc)));
729 : }
730 : else
731 : {
732 : /*
733 : * We used to attempt to go back to a secondary checkpoint record
734 : * here, but only when not in standby mode. We now just fail if we
735 : * can't read the last checkpoint because this allows us to
736 : * simplify processing around checkpoints.
737 : */
738 1 : ereport(FATAL,
739 : errmsg("could not locate a valid checkpoint record at %X/%08X",
740 : LSN_FORMAT_ARGS(CheckPointLoc)));
741 : }
742 954 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
743 954 : wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
744 :
745 : /* Make sure that REDO location exists. */
746 954 : if (checkPoint.redo < CheckPointLoc)
747 : {
748 44 : XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
749 44 : if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID))
750 1 : ereport(FATAL,
751 : errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
752 : LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)));
753 : }
754 : }
755 :
756 1033 : if (ArchiveRecoveryRequested)
757 : {
758 117 : if (StandbyModeRequested)
759 112 : ereport(LOG,
760 : (errmsg("entering standby mode")));
761 5 : else if (recoveryTarget == RECOVERY_TARGET_XID)
762 0 : ereport(LOG,
763 : (errmsg("starting point-in-time recovery to XID %u",
764 : recoveryTargetXid)));
765 5 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
766 0 : ereport(LOG,
767 : (errmsg("starting point-in-time recovery to %s",
768 : timestamptz_to_str(recoveryTargetTime))));
769 5 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
770 3 : ereport(LOG,
771 : (errmsg("starting point-in-time recovery to \"%s\"",
772 : recoveryTargetName)));
773 2 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
774 0 : ereport(LOG,
775 : errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
776 : LSN_FORMAT_ARGS(recoveryTargetLSN)));
777 2 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
778 0 : ereport(LOG,
779 : (errmsg("starting point-in-time recovery to earliest consistent point")));
780 : else
781 2 : ereport(LOG,
782 : (errmsg("starting archive recovery")));
783 : }
784 :
785 : /*
786 : * If the location of the checkpoint record is not on the expected
787 : * timeline in the history of the requested timeline, we cannot proceed:
788 : * the backup is not part of the history of the requested timeline.
789 : */
790 : Assert(expectedTLEs); /* was initialized by reading checkpoint
791 : * record */
792 1033 : if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
793 : CheckPointTLI)
794 : {
795 : XLogRecPtr switchpoint;
796 :
797 : /*
798 : * tliSwitchPoint will throw an error if the checkpoint's timeline is
799 : * not in expectedTLEs at all.
800 : */
801 0 : switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
802 0 : ereport(FATAL,
803 : (errmsg("requested timeline %u is not a child of this server's history",
804 : recoveryTargetTLI),
805 : /* translator: %s is a backup_label file or a pg_control file */
806 : errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
807 : haveBackupLabel ? "backup_label" : "pg_control",
808 : LSN_FORMAT_ARGS(CheckPointLoc),
809 : CheckPointTLI,
810 : LSN_FORMAT_ARGS(switchpoint))));
811 : }
812 :
813 : /*
814 : * The min recovery point should be part of the requested timeline's
815 : * history, too.
816 : */
817 1033 : if (XLogRecPtrIsValid(ControlFile->minRecoveryPoint) &&
818 41 : tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
819 41 : ControlFile->minRecoveryPointTLI)
820 0 : ereport(FATAL,
821 : errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
822 : recoveryTargetTLI,
823 : LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
824 : ControlFile->minRecoveryPointTLI));
825 :
826 1033 : ereport(DEBUG1,
827 : errmsg_internal("redo record is at %X/%08X; shutdown %s",
828 : LSN_FORMAT_ARGS(checkPoint.redo),
829 : wasShutdown ? "true" : "false"));
830 1033 : ereport(DEBUG1,
831 : (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
832 : U64FromFullTransactionId(checkPoint.nextXid),
833 : checkPoint.nextOid)));
834 1033 : ereport(DEBUG1,
835 : (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %" PRIu64,
836 : checkPoint.nextMulti, checkPoint.nextMultiOffset)));
837 1033 : ereport(DEBUG1,
838 : (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
839 : checkPoint.oldestXid, checkPoint.oldestXidDB)));
840 1033 : ereport(DEBUG1,
841 : (errmsg_internal("oldest MultiXactId: %u, in database %u",
842 : checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
843 1033 : ereport(DEBUG1,
844 : (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
845 : checkPoint.oldestCommitTsXid,
846 : checkPoint.newestCommitTsXid)));
847 1033 : if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
848 0 : ereport(PANIC,
849 : (errmsg("invalid next transaction ID")));
850 :
851 : /* sanity check */
852 1033 : if (checkPoint.redo > CheckPointLoc)
853 0 : ereport(PANIC,
854 : (errmsg("invalid redo in checkpoint record")));
855 :
856 : /*
857 : * Check whether we need to force recovery from WAL. If it appears to
858 : * have been a clean shutdown and we did not have a recovery signal file,
859 : * then assume no recovery needed.
860 : */
861 1033 : if (checkPoint.redo < CheckPointLoc)
862 : {
863 123 : if (wasShutdown)
864 0 : ereport(PANIC,
865 : (errmsg("invalid redo record in shutdown checkpoint")));
866 123 : InRecovery = true;
867 : }
868 910 : else if (ControlFile->state != DB_SHUTDOWNED)
869 94 : InRecovery = true;
870 816 : else if (ArchiveRecoveryRequested)
871 : {
872 : /* force recovery due to presence of recovery signal file */
873 8 : InRecovery = true;
874 : }
875 :
876 : /*
877 : * If recovery is needed, update our in-memory copy of pg_control to show
878 : * that we are recovering and to show the selected checkpoint as the place
879 : * we are starting from. We also mark pg_control with any minimum recovery
880 : * stop point obtained from a backup history file.
881 : *
882 : * We don't write the changes to disk yet, though. Only do that after
883 : * initializing various subsystems.
884 : */
885 1033 : if (InRecovery)
886 : {
887 225 : if (InArchiveRecovery)
888 : {
889 123 : ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
890 : }
891 : else
892 : {
893 102 : ereport(LOG,
894 : (errmsg("database system was not properly shut down; "
895 : "automatic recovery in progress")));
896 102 : if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
897 1 : ereport(LOG,
898 : (errmsg("crash recovery starts in timeline %u "
899 : "and has target timeline %u",
900 : ControlFile->checkPointCopy.ThisTimeLineID,
901 : recoveryTargetTLI)));
902 102 : ControlFile->state = DB_IN_CRASH_RECOVERY;
903 : }
904 225 : ControlFile->checkPoint = CheckPointLoc;
905 225 : ControlFile->checkPointCopy = checkPoint;
906 225 : if (InArchiveRecovery)
907 : {
908 : /* initialize minRecoveryPoint if not set yet */
909 123 : if (ControlFile->minRecoveryPoint < checkPoint.redo)
910 : {
911 83 : ControlFile->minRecoveryPoint = checkPoint.redo;
912 83 : ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
913 : }
914 : }
915 :
916 : /*
917 : * Set backupStartPoint if we're starting recovery from a base backup.
918 : *
919 : * Also set backupEndPoint and use minRecoveryPoint as the backup end
920 : * location if we're starting recovery from a base backup which was
921 : * taken from a standby. In this case, the database system status in
922 : * pg_control must indicate that the database was already in recovery.
923 : * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
924 : * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
925 : * before reaching this point; e.g. because restore_command or
926 : * primary_conninfo were faulty.
927 : *
928 : * Any other state indicates that the backup somehow became corrupted
929 : * and we can't sensibly continue with recovery.
930 : */
931 225 : if (haveBackupLabel)
932 : {
933 80 : ControlFile->backupStartPoint = checkPoint.redo;
934 80 : ControlFile->backupEndRequired = backupEndRequired;
935 :
936 80 : if (backupFromStandby)
937 : {
938 5 : if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
939 : dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
940 0 : ereport(FATAL,
941 : (errmsg("backup_label contains data inconsistent with control file"),
942 : errhint("This means that the backup is corrupted and you will "
943 : "have to use another backup for recovery.")));
944 5 : ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
945 : }
946 : }
947 : }
948 :
949 : /* remember these, so that we know when we have reached consistency */
950 1033 : backupStartPoint = ControlFile->backupStartPoint;
951 1033 : backupEndRequired = ControlFile->backupEndRequired;
952 1033 : backupEndPoint = ControlFile->backupEndPoint;
953 1033 : if (InArchiveRecovery)
954 : {
955 123 : minRecoveryPoint = ControlFile->minRecoveryPoint;
956 123 : minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
957 : }
958 : else
959 : {
960 910 : minRecoveryPoint = InvalidXLogRecPtr;
961 910 : minRecoveryPointTLI = 0;
962 : }
963 :
964 : /*
965 : * Start recovery assuming that the final record isn't lost.
966 : */
967 1033 : abortedRecPtr = InvalidXLogRecPtr;
968 1033 : missingContrecPtr = InvalidXLogRecPtr;
969 :
970 1033 : *wasShutdown_ptr = wasShutdown;
971 1033 : *haveBackupLabel_ptr = haveBackupLabel;
972 1033 : *haveTblspcMap_ptr = haveTblspcMap;
973 1033 : }
974 :
975 : /*
976 : * See if there are any recovery signal files and if so, set state for
977 : * recovery.
978 : *
979 : * See if there is a recovery command file (recovery.conf), and if so
980 : * throw an ERROR since as of PG12 we no longer recognize that.
981 : */
982 : static void
983 1035 : readRecoverySignalFile(void)
984 : {
985 : struct stat stat_buf;
986 :
987 1035 : if (IsBootstrapProcessingMode())
988 918 : return;
989 :
990 : /*
991 : * Check for old recovery API file: recovery.conf
992 : */
993 984 : if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
994 0 : ereport(FATAL,
995 : (errcode_for_file_access(),
996 : errmsg("using recovery command file \"%s\" is not supported",
997 : RECOVERY_COMMAND_FILE)));
998 :
999 : /*
1000 : * Remove unused .done file, if present. Ignore if absent.
1001 : */
1002 984 : unlink(RECOVERY_COMMAND_DONE);
1003 :
1004 : /*
1005 : * Check for recovery signal files and if found, fsync them since they
1006 : * represent server state information. We don't sweat too much about the
1007 : * possibility of fsync failure, however.
1008 : */
1009 984 : if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1010 : {
1011 : int fd;
1012 :
1013 112 : fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1014 : S_IRUSR | S_IWUSR);
1015 112 : if (fd >= 0)
1016 : {
1017 112 : (void) pg_fsync(fd);
1018 112 : close(fd);
1019 : }
1020 112 : standby_signal_file_found = true;
1021 : }
1022 :
1023 984 : if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1024 : {
1025 : int fd;
1026 :
1027 6 : fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1028 : S_IRUSR | S_IWUSR);
1029 6 : if (fd >= 0)
1030 : {
1031 6 : (void) pg_fsync(fd);
1032 6 : close(fd);
1033 : }
1034 6 : recovery_signal_file_found = true;
1035 : }
1036 :
1037 : /*
1038 : * If both signal files are present, standby signal file takes precedence.
1039 : * If neither is present then we won't enter archive recovery.
1040 : */
1041 984 : StandbyModeRequested = false;
1042 984 : ArchiveRecoveryRequested = false;
1043 984 : if (standby_signal_file_found)
1044 : {
1045 112 : StandbyModeRequested = true;
1046 112 : ArchiveRecoveryRequested = true;
1047 : }
1048 872 : else if (recovery_signal_file_found)
1049 : {
1050 5 : StandbyModeRequested = false;
1051 5 : ArchiveRecoveryRequested = true;
1052 : }
1053 : else
1054 867 : return;
1055 :
1056 : /*
1057 : * We don't support standby mode in standalone backends; that requires
1058 : * other processes such as the WAL receiver to be alive.
1059 : */
1060 117 : if (StandbyModeRequested && !IsUnderPostmaster)
1061 0 : ereport(FATAL,
1062 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1063 : errmsg("standby mode is not supported by single-user servers")));
1064 : }
1065 :
1066 : static void
1067 1035 : validateRecoveryParameters(void)
1068 : {
1069 1035 : if (!ArchiveRecoveryRequested)
1070 918 : return;
1071 :
1072 : /*
1073 : * Check for compulsory parameters
1074 : */
1075 117 : if (StandbyModeRequested)
1076 : {
1077 112 : if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1078 11 : (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1079 2 : ereport(WARNING,
1080 : (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1081 : errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1082 : }
1083 : else
1084 : {
1085 5 : if (recoveryRestoreCommand == NULL ||
1086 5 : strcmp(recoveryRestoreCommand, "") == 0)
1087 0 : ereport(FATAL,
1088 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1089 : errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1090 : }
1091 :
1092 : /*
1093 : * Override any inconsistent requests. Note that this is a change of
1094 : * behaviour in 9.5; prior to this we simply ignored a request to pause if
1095 : * hot_standby = off, which was surprising behaviour.
1096 : */
1097 117 : if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
1098 110 : !EnableHotStandby)
1099 3 : recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
1100 :
1101 : /*
1102 : * Final parsing of recovery_target_time string; see also
1103 : * check_recovery_target_time().
1104 : */
1105 117 : if (recoveryTarget == RECOVERY_TARGET_TIME)
1106 : {
1107 0 : recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
1108 : CStringGetDatum(recovery_target_time_string),
1109 : ObjectIdGetDatum(InvalidOid),
1110 : Int32GetDatum(-1)));
1111 : }
1112 :
1113 : /*
1114 : * If user specified recovery_target_timeline, validate it or compute the
1115 : * "latest" value. We can't do this until after we've gotten the restore
1116 : * command and set InArchiveRecovery, because we need to fetch timeline
1117 : * history files from the archive.
1118 : */
1119 117 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
1120 : {
1121 0 : TimeLineID rtli = recoveryTargetTLIRequested;
1122 :
1123 : /* Timeline 1 does not have a history file, all else should */
1124 0 : if (rtli != 1 && !existsTimeLineHistory(rtli))
1125 0 : ereport(FATAL,
1126 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1127 : errmsg("recovery target timeline %u does not exist",
1128 : rtli)));
1129 0 : recoveryTargetTLI = rtli;
1130 : }
1131 117 : else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
1132 : {
1133 : /* We start the "latest" search from pg_control's timeline */
1134 117 : recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
1135 : }
1136 : else
1137 : {
1138 : /*
1139 : * else we just use the recoveryTargetTLI as already read from
1140 : * ControlFile
1141 : */
1142 : Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
1143 : }
1144 : }
1145 :
1146 : /*
1147 : * read_backup_label: check to see if a backup_label file is present
1148 : *
1149 : * If we see a backup_label during recovery, we assume that we are recovering
1150 : * from a backup dump file, and we therefore roll forward from the checkpoint
1151 : * identified by the label file, NOT what pg_control says. This avoids the
1152 : * problem that pg_control might have been archived one or more checkpoints
1153 : * later than the start of the dump, and so if we rely on it as the start
1154 : * point, we will fail to restore a consistent database state.
1155 : *
1156 : * Returns true if a backup_label was found (and fills the checkpoint
1157 : * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1158 : * returns false if not. If this backup_label came from a streamed backup,
1159 : * *backupEndRequired is set to true. If this backup_label was created during
1160 : * recovery, *backupFromStandby is set to true.
1161 : *
1162 : * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1163 : * and TLI read from the backup file.
1164 : */
1165 : static bool
1166 1035 : read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1167 : bool *backupEndRequired, bool *backupFromStandby)
1168 : {
1169 : char startxlogfilename[MAXFNAMELEN];
1170 : TimeLineID tli_from_walseg,
1171 : tli_from_file;
1172 : FILE *lfp;
1173 : char ch;
1174 : char backuptype[20];
1175 : char backupfrom[20];
1176 : char backuplabel[MAXPGPATH];
1177 : char backuptime[128];
1178 : uint32 hi,
1179 : lo;
1180 :
1181 : /* suppress possible uninitialized-variable warnings */
1182 1035 : *checkPointLoc = InvalidXLogRecPtr;
1183 1035 : *backupLabelTLI = 0;
1184 1035 : *backupEndRequired = false;
1185 1035 : *backupFromStandby = false;
1186 :
1187 : /*
1188 : * See if label file is present
1189 : */
1190 1035 : lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1191 1035 : if (!lfp)
1192 : {
1193 955 : if (errno != ENOENT)
1194 0 : ereport(FATAL,
1195 : (errcode_for_file_access(),
1196 : errmsg("could not read file \"%s\": %m",
1197 : BACKUP_LABEL_FILE)));
1198 955 : return false; /* it's not there, all is fine */
1199 : }
1200 :
1201 : /*
1202 : * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1203 : * is pretty crude, but we are not expecting any variability in the file
1204 : * format).
1205 : */
1206 80 : if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
1207 80 : &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1208 0 : ereport(FATAL,
1209 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1210 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1211 80 : RedoStartLSN = ((uint64) hi) << 32 | lo;
1212 80 : RedoStartTLI = tli_from_walseg;
1213 80 : if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
1214 80 : &hi, &lo, &ch) != 3 || ch != '\n')
1215 0 : ereport(FATAL,
1216 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1217 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1218 80 : *checkPointLoc = ((uint64) hi) << 32 | lo;
1219 80 : *backupLabelTLI = tli_from_walseg;
1220 :
1221 : /*
1222 : * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1223 : * which could mean either pg_basebackup or the pg_backup_start/stop
1224 : * method was used) or if this label came from somewhere else (the only
1225 : * other option today being from pg_rewind). If this was a streamed
1226 : * backup then we know that we need to play through until we get to the
1227 : * end of the WAL which was generated during the backup (at which point we
1228 : * will have reached consistency and backupEndRequired will be reset to be
1229 : * false).
1230 : */
1231 80 : if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1232 : {
1233 80 : if (strcmp(backuptype, "streamed") == 0)
1234 79 : *backupEndRequired = true;
1235 : }
1236 :
1237 : /*
1238 : * BACKUP FROM lets us know if this was from a primary or a standby. If
1239 : * it was from a standby, we'll double-check that the control file state
1240 : * matches that of a standby.
1241 : */
1242 80 : if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1243 : {
1244 80 : if (strcmp(backupfrom, "standby") == 0)
1245 5 : *backupFromStandby = true;
1246 : }
1247 :
1248 : /*
1249 : * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1250 : * but checking for their presence is useful for debugging and the next
1251 : * sanity checks. Cope also with the fact that the result buffers have a
1252 : * pre-allocated size, hence if the backup_label file has been generated
1253 : * with strings longer than the maximum assumed here an incorrect parsing
1254 : * happens. That's fine as only minor consistency checks are done
1255 : * afterwards.
1256 : */
1257 80 : if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1258 80 : ereport(DEBUG1,
1259 : (errmsg_internal("backup time %s in file \"%s\"",
1260 : backuptime, BACKUP_LABEL_FILE)));
1261 :
1262 80 : if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1263 79 : ereport(DEBUG1,
1264 : (errmsg_internal("backup label %s in file \"%s\"",
1265 : backuplabel, BACKUP_LABEL_FILE)));
1266 :
1267 : /*
1268 : * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1269 : * it as a sanity check if present.
1270 : */
1271 80 : if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1272 : {
1273 79 : if (tli_from_walseg != tli_from_file)
1274 0 : ereport(FATAL,
1275 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1276 : errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1277 : errdetail("Timeline ID parsed is %u, but expected %u.",
1278 : tli_from_file, tli_from_walseg)));
1279 :
1280 79 : ereport(DEBUG1,
1281 : (errmsg_internal("backup timeline %u in file \"%s\"",
1282 : tli_from_file, BACKUP_LABEL_FILE)));
1283 : }
1284 :
1285 80 : if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
1286 0 : ereport(FATAL,
1287 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1288 : errmsg("this is an incremental backup, not a data directory"),
1289 : errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1290 :
1291 80 : if (ferror(lfp) || FreeFile(lfp))
1292 0 : ereport(FATAL,
1293 : (errcode_for_file_access(),
1294 : errmsg("could not read file \"%s\": %m",
1295 : BACKUP_LABEL_FILE)));
1296 :
1297 80 : return true;
1298 : }
1299 :
1300 : /*
1301 : * read_tablespace_map: check to see if a tablespace_map file is present
1302 : *
1303 : * If we see a tablespace_map file during recovery, we assume that we are
1304 : * recovering from a backup dump file, and we therefore need to create symlinks
1305 : * as per the information present in tablespace_map file.
1306 : *
1307 : * Returns true if a tablespace_map file was found (and fills *tablespaces
1308 : * with a tablespaceinfo struct for each tablespace listed in the file);
1309 : * returns false if not.
1310 : */
1311 : static bool
1312 80 : read_tablespace_map(List **tablespaces)
1313 : {
1314 : tablespaceinfo *ti;
1315 : FILE *lfp;
1316 : char str[MAXPGPATH];
1317 : int ch,
1318 : i,
1319 : n;
1320 : bool was_backslash;
1321 :
1322 : /*
1323 : * See if tablespace_map file is present
1324 : */
1325 80 : lfp = AllocateFile(TABLESPACE_MAP, "r");
1326 80 : if (!lfp)
1327 : {
1328 78 : if (errno != ENOENT)
1329 0 : ereport(FATAL,
1330 : (errcode_for_file_access(),
1331 : errmsg("could not read file \"%s\": %m",
1332 : TABLESPACE_MAP)));
1333 78 : return false; /* it's not there, all is fine */
1334 : }
1335 :
1336 : /*
1337 : * Read and parse the link name and path lines from tablespace_map file
1338 : * (this code is pretty crude, but we are not expecting any variability in
1339 : * the file format). De-escape any backslashes that were inserted.
1340 : */
1341 2 : i = 0;
1342 2 : was_backslash = false;
1343 77 : while ((ch = fgetc(lfp)) != EOF)
1344 : {
1345 75 : if (!was_backslash && (ch == '\n' || ch == '\r'))
1346 2 : {
1347 : char *endp;
1348 :
1349 2 : if (i == 0)
1350 0 : continue; /* \r immediately followed by \n */
1351 :
1352 : /*
1353 : * The de-escaped line should contain an OID followed by exactly
1354 : * one space followed by a path. The path might start with
1355 : * spaces, so don't be too liberal about parsing.
1356 : */
1357 2 : str[i] = '\0';
1358 2 : n = 0;
1359 12 : while (str[n] && str[n] != ' ')
1360 10 : n++;
1361 2 : if (n < 1 || n >= i - 1)
1362 0 : ereport(FATAL,
1363 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1364 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1365 2 : str[n++] = '\0';
1366 :
1367 2 : ti = palloc0_object(tablespaceinfo);
1368 2 : errno = 0;
1369 2 : ti->oid = strtoul(str, &endp, 10);
1370 2 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1371 0 : ereport(FATAL,
1372 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1373 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1374 2 : ti->path = pstrdup(str + n);
1375 2 : *tablespaces = lappend(*tablespaces, ti);
1376 :
1377 2 : i = 0;
1378 2 : continue;
1379 : }
1380 73 : else if (!was_backslash && ch == '\\')
1381 0 : was_backslash = true;
1382 : else
1383 : {
1384 73 : if (i < sizeof(str) - 1)
1385 73 : str[i++] = ch;
1386 73 : was_backslash = false;
1387 : }
1388 : }
1389 :
1390 2 : if (i != 0 || was_backslash) /* last line not terminated? */
1391 0 : ereport(FATAL,
1392 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1393 : errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1394 :
1395 2 : if (ferror(lfp) || FreeFile(lfp))
1396 0 : ereport(FATAL,
1397 : (errcode_for_file_access(),
1398 : errmsg("could not read file \"%s\": %m",
1399 : TABLESPACE_MAP)));
1400 :
1401 2 : return true;
1402 : }
1403 :
1404 : /*
1405 : * Finish WAL recovery.
1406 : *
1407 : * This does not close the 'xlogreader' yet, because in some cases the caller
1408 : * still wants to re-read the last checkpoint record by calling
1409 : * ReadCheckpointRecord().
1410 : *
1411 : * Returns the position of the last valid or applied record, after which new
1412 : * WAL should be appended, information about why recovery was ended, and some
1413 : * other things. See the EndOfWalRecoveryInfo struct for details.
1414 : */
1415 : EndOfWalRecoveryInfo *
1416 970 : FinishWalRecovery(void)
1417 : {
1418 970 : EndOfWalRecoveryInfo *result = palloc_object(EndOfWalRecoveryInfo);
1419 : XLogRecPtr lastRec;
1420 : TimeLineID lastRecTLI;
1421 : XLogRecPtr endOfLog;
1422 :
1423 : /*
1424 : * Kill WAL receiver, if it's still running, before we continue to write
1425 : * the startup checkpoint and aborted-contrecord records. It will trump
1426 : * over these records and subsequent ones if it's still alive when we
1427 : * start writing WAL.
1428 : */
1429 970 : XLogShutdownWalRcv();
1430 :
1431 : /*
1432 : * Shutdown the slot sync worker to drop any temporary slots acquired by
1433 : * it and to prevent it from keep trying to fetch the failover slots.
1434 : *
1435 : * We do not update the 'synced' column in 'pg_replication_slots' system
1436 : * view from true to false here, as any failed update could leave 'synced'
1437 : * column false for some slots. This could cause issues during slot sync
1438 : * after restarting the server as a standby. While updating the 'synced'
1439 : * column after switching to the new timeline is an option, it does not
1440 : * simplify the handling for the 'synced' column. Therefore, we retain the
1441 : * 'synced' column as true after promotion as it may provide useful
1442 : * information about the slot origin.
1443 : */
1444 970 : ShutDownSlotSync();
1445 :
1446 : /*
1447 : * We are now done reading the xlog from stream. Turn off streaming
1448 : * recovery to force fetching the files (which would be required at end of
1449 : * recovery, e.g., timeline history file) from archive or pg_wal.
1450 : *
1451 : * Note that standby mode must be turned off after killing WAL receiver,
1452 : * i.e., calling XLogShutdownWalRcv().
1453 : */
1454 : Assert(!WalRcvStreaming());
1455 970 : StandbyMode = false;
1456 :
1457 : /*
1458 : * Determine where to start writing WAL next.
1459 : *
1460 : * Re-fetch the last valid or last applied record, so we can identify the
1461 : * exact endpoint of what we consider the valid portion of WAL. There may
1462 : * be an incomplete continuation record after that, in which case
1463 : * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1464 : * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1465 : * it is intentionally missing. See CreateOverwriteContrecordRecord().
1466 : *
1467 : * An important side-effect of this is to load the last page into
1468 : * xlogreader. The caller uses it to initialize the WAL for writing.
1469 : */
1470 970 : if (!InRecovery)
1471 : {
1472 807 : lastRec = CheckPointLoc;
1473 807 : lastRecTLI = CheckPointTLI;
1474 : }
1475 : else
1476 : {
1477 163 : lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
1478 163 : lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1479 : }
1480 970 : XLogPrefetcherBeginRead(xlogprefetcher, lastRec);
1481 970 : (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1482 970 : endOfLog = xlogreader->EndRecPtr;
1483 :
1484 : /*
1485 : * Remember the TLI in the filename of the XLOG segment containing the
1486 : * end-of-log. It could be different from the timeline that endOfLog
1487 : * nominally belongs to, if there was a timeline switch in that segment,
1488 : * and we were reading the old WAL from a segment belonging to a higher
1489 : * timeline.
1490 : */
1491 970 : result->endOfLogTLI = xlogreader->seg.ws_tli;
1492 :
1493 970 : if (ArchiveRecoveryRequested)
1494 : {
1495 : /*
1496 : * We are no longer in archive recovery state.
1497 : *
1498 : * We are now done reading the old WAL. Turn off archive fetching if
1499 : * it was active.
1500 : */
1501 : Assert(InArchiveRecovery);
1502 55 : InArchiveRecovery = false;
1503 :
1504 : /*
1505 : * If the ending log segment is still open, close it (to avoid
1506 : * problems on Windows with trying to rename or delete an open file).
1507 : */
1508 55 : if (readFile >= 0)
1509 : {
1510 55 : close(readFile);
1511 55 : readFile = -1;
1512 : }
1513 : }
1514 :
1515 : /*
1516 : * Copy the last partial block to the caller, for initializing the WAL
1517 : * buffer for appending new WAL.
1518 : */
1519 970 : if (endOfLog % XLOG_BLCKSZ != 0)
1520 : {
1521 : char *page;
1522 : int len;
1523 : XLogRecPtr pageBeginPtr;
1524 :
1525 948 : pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1526 : Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1527 :
1528 : /* Copy the valid part of the last block */
1529 948 : len = endOfLog % XLOG_BLCKSZ;
1530 948 : page = palloc(len);
1531 948 : memcpy(page, xlogreader->readBuf, len);
1532 :
1533 948 : result->lastPageBeginPtr = pageBeginPtr;
1534 948 : result->lastPage = page;
1535 : }
1536 : else
1537 : {
1538 : /* There is no partial block to copy. */
1539 22 : result->lastPageBeginPtr = endOfLog;
1540 22 : result->lastPage = NULL;
1541 : }
1542 :
1543 : /*
1544 : * Create a comment for the history file to explain why and where timeline
1545 : * changed.
1546 : */
1547 970 : result->recoveryStopReason = getRecoveryStopReason();
1548 :
1549 970 : result->lastRec = lastRec;
1550 970 : result->lastRecTLI = lastRecTLI;
1551 970 : result->endOfLog = endOfLog;
1552 :
1553 970 : result->abortedRecPtr = abortedRecPtr;
1554 970 : result->missingContrecPtr = missingContrecPtr;
1555 :
1556 970 : result->standby_signal_file_found = standby_signal_file_found;
1557 970 : result->recovery_signal_file_found = recovery_signal_file_found;
1558 :
1559 970 : return result;
1560 : }
1561 :
1562 : /*
1563 : * Clean up the WAL reader and leftovers from restoring WAL from archive
1564 : */
1565 : void
1566 970 : ShutdownWalRecovery(void)
1567 : {
1568 : char recoveryPath[MAXPGPATH];
1569 :
1570 : /* Final update of pg_stat_recovery_prefetch. */
1571 970 : XLogPrefetcherComputeStats(xlogprefetcher);
1572 :
1573 : /* Shut down xlogreader */
1574 970 : if (readFile >= 0)
1575 : {
1576 915 : close(readFile);
1577 915 : readFile = -1;
1578 : }
1579 970 : pfree(xlogreader->private_data);
1580 970 : XLogReaderFree(xlogreader);
1581 970 : XLogPrefetcherFree(xlogprefetcher);
1582 :
1583 970 : if (ArchiveRecoveryRequested)
1584 : {
1585 : /*
1586 : * Since there might be a partial WAL segment named RECOVERYXLOG, get
1587 : * rid of it.
1588 : */
1589 55 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1590 55 : unlink(recoveryPath); /* ignore any error */
1591 :
1592 : /* Get rid of any remaining recovered timeline-history file, too */
1593 55 : snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1594 55 : unlink(recoveryPath); /* ignore any error */
1595 : }
1596 :
1597 : /*
1598 : * We don't need the latch anymore. It's not strictly necessary to disown
1599 : * it, but let's do it for the sake of tidiness.
1600 : */
1601 970 : if (ArchiveRecoveryRequested)
1602 55 : DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
1603 970 : }
1604 :
1605 : /*
1606 : * Perform WAL recovery.
1607 : *
1608 : * If the system was shut down cleanly, this is never called.
 *
 * Takes no arguments and returns nothing; all state is in file-level and
 * shared variables. The steps, in order, are:
 *
 * 1. Seed the replay-progress fields of XLogRecoveryCtl (under info_lck)
 *    and XLogReceiptTime.
 * 2. When running under the postmaster, send PMSIGNAL_RECOVERY_STARTED.
 * 3. Read the first record to replay: the record at RedoStartLSN when the
 *    redo pointer precedes the checkpoint record, otherwise the record
 *    following the checkpoint.
 * 4. If a record was found, run the main redo loop. Each iteration handles
 *    startup-process interrupts, honors pause requests, checks the
 *    "stop before" recovery target, applies any configured apply delay,
 *    replays the record with ApplyWalRecord(), wakes LSN waiters, checks
 *    the "stop after" recovery target, and reads the next record.
 * 5. If a recovery target was reached, act on recoveryTargetAction.
 * 6. Raise FATAL if a recovery target was configured for archive recovery
 *    but replay ended without reaching it.
1609 : */
1610 : void
1611 224 : PerformWalRecovery(void)
1612 : {
1613 : XLogRecord *record;
1614 224 : bool reachedRecoveryTarget = false;
1615 : TimeLineID replayTLI;
1616 :
1617 : /*
1618 : * Initialize shared variables for tracking progress of WAL replay, as if
1619 : * we had just replayed the record before the REDO location (or the
1620 : * checkpoint record itself, if it's a shutdown checkpoint).
1621 : */
1622 224 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1623 224 : if (RedoStartLSN < CheckPointLoc)
1624 : {
1625 122 : XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
1626 122 : XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
1627 122 : XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
1628 : }
1629 : else
1630 : {
1631 102 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
1632 102 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
1633 102 : XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
1634 : }
1635 224 : XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
1636 224 : XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
1637 224 : XLogRecoveryCtl->recoveryLastXTime = 0;
1638 224 : XLogRecoveryCtl->currentChunkStartTime = 0;
1639 224 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
1640 224 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1641 :
1642 : /* Also ensure XLogReceiptTime has a sane value */
1643 224 : XLogReceiptTime = GetCurrentTimestamp();
1644 :
1645 : /*
1646 : * Let postmaster know we've started redo now, so that it can launch the
1647 : * archiver if necessary.
1648 : */
1649 224 : if (IsUnderPostmaster)
1650 215 : SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
1651 :
1652 : /*
1653 : * Allow read-only connections immediately if we're consistent already.
1654 : */
1655 224 : CheckRecoveryConsistency();
1656 :
1657 : /*
1658 : * Find the first record that logically follows the checkpoint --- it
1659 : * might physically precede it, though.
1660 : */
1661 224 : if (RedoStartLSN < CheckPointLoc)
1662 : {
1663 : /* back up to find the record */
1664 122 : replayTLI = RedoStartTLI;
1665 122 : XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
/* Read with PANIC here, unlike the LOG level used in the else branch. */
1666 122 : record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1667 :
1668 : /*
1669 : * If a checkpoint record's redo pointer points back to an earlier
1670 : * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1671 : * record.
1672 : */
1673 122 : if (record->xl_rmid != RM_XLOG_ID ||
1674 122 : (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1675 0 : ereport(FATAL,
1676 : errmsg("unexpected record type found at redo point %X/%08X",
1677 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
1678 : }
1679 : else
1680 : {
1681 : /* just have to read next record after CheckPoint */
1682 : Assert(xlogreader->ReadRecPtr == CheckPointLoc);
1683 102 : replayTLI = CheckPointTLI;
1684 102 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1685 : }
1686 :
/*
 * A NULL record means there is nothing to replay after the checkpoint;
 * that case is only logged (see the else branch at the bottom).
 */
1687 224 : if (record != NULL)
1688 : {
1689 : TimestampTz xtime;
1690 : PGRUsage ru0;
1691 :
/* ru0 is the baseline for the usage figures in the "redo done" message. */
1692 215 : pg_rusage_init(&ru0);
1693 :
1694 215 : InRedo = true;
1695 :
1696 215 : RmgrStartup();
1697 :
1698 215 : ereport(LOG,
1699 : errmsg("redo starts at %X/%08X",
1700 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
1701 :
1702 : /* Prepare to report progress of the redo phase. */
1703 215 : if (!StandbyMode)
1704 108 : begin_startup_progress_phase();
1705 :
1706 : /*
1707 : * main redo apply loop
 *
 * On entry 'record' is the first record to replay. The loop ends
 * when ReadRecord() returns NULL or when one of the recovery-target
 * checks breaks out with reachedRecoveryTarget set.
1708 : */
1709 : do
1710 : {
1711 2884372 : if (!StandbyMode)
1712 326007 : ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
1713 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
1714 :
1715 : #ifdef WAL_DEBUG
1716 : if (XLOG_DEBUG)
1717 : {
1718 : StringInfoData buf;
1719 :
1720 : initStringInfo(&buf);
1721 : appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
1722 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1723 : LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
1724 : xlog_outrec(&buf, xlogreader);
1725 : appendStringInfoString(&buf, " - ");
1726 : xlog_outdesc(&buf, xlogreader);
1727 : elog(LOG, "%s", buf.data);
1728 : pfree(buf.data);
1729 : }
1730 : #endif
1731 :
1732 : /* Handle interrupt signals of startup process */
1733 2884372 : ProcessStartupProcInterrupts();
1734 :
1735 : /*
1736 : * Pause WAL replay, if requested by a hot-standby session via
1737 : * SetRecoveryPause().
1738 : *
1739 : * Note that we intentionally don't take the info_lck spinlock
1740 : * here. We might therefore read a slightly stale value of the
1741 : * recoveryPause flag, but it can't be very stale (no worse than
1742 : * the last spinlock we did acquire). Since a pause request is a
1743 : * pretty asynchronous thing anyway, possibly responding to it one
1744 : * WAL record later than we otherwise would is a minor issue, so
1745 : * it doesn't seem worth adding another spinlock cycle to prevent
1746 : * that.
1747 : */
1748 2884372 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1749 : RECOVERY_NOT_PAUSED)
1750 0 : recoveryPausesHere(false);
1751 :
1752 : /*
1753 : * Have we reached our recovery target?
1754 : */
1755 2884372 : if (recoveryStopsBefore(xlogreader))
1756 : {
1757 2 : reachedRecoveryTarget = true;
1758 2 : break;
1759 : }
1760 :
1761 : /*
1762 : * If we've been asked to lag the primary, wait on latch until
1763 : * enough time has passed.
1764 : */
1765 2884370 : if (recoveryApplyDelay(xlogreader))
1766 : {
1767 : /*
1768 : * We test for paused recovery again here. If user sets
1769 : * delayed apply, it may be because they expect to pause
1770 : * recovery in case of problems, so we must test again here
1771 : * otherwise pausing during the delay-wait wouldn't work.
1772 : */
1773 19 : if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1774 : RECOVERY_NOT_PAUSED)
1775 1 : recoveryPausesHere(false);
1776 : }
1777 :
1778 : /*
1779 : * Apply the record
 *
 * replayTLI is passed by address because ApplyWalRecord() updates
 * it when the record switches timelines.
1780 : */
1781 2884370 : ApplyWalRecord(xlogreader, record, &replayTLI);
1782 :
1783 : /*
1784 : * If we replayed an LSN that someone was waiting for then walk
1785 : * over the shared memory array and set latches to notify the
1786 : * waiters.
1787 : */
1788 5768736 : if (waitLSNState &&
1789 2884368 : (XLogRecoveryCtl->lastReplayedEndRecPtr >=
1790 2884368 : pg_atomic_read_u64(&waitLSNState->minWaitedLSN[WAIT_LSN_TYPE_STANDBY_REPLAY])))
1791 8 : WaitLSNWakeup(WAIT_LSN_TYPE_STANDBY_REPLAY, XLogRecoveryCtl->lastReplayedEndRecPtr);
1792 :
1793 : /* Exit loop if we reached inclusive recovery target */
1794 2884368 : if (recoveryStopsAfter(xlogreader))
1795 : {
1796 5 : reachedRecoveryTarget = true;
1797 5 : break;
1798 : }
1799 :
1800 : /* Else, try to fetch the next WAL record */
1801 2884363 : record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1802 2884305 : } while (record != NULL);
1803 :
1804 : /*
1805 : * end of main redo apply loop
1806 : */
1807 :
1808 155 : if (reachedRecoveryTarget)
1809 : {
1810 7 : if (!reachedConsistency)
1811 0 : ereport(FATAL,
1812 : (errmsg("requested recovery stop point is before consistent recovery point")));
1813 :
1814 : /*
1815 : * This is the last point where we can restart recovery with a new
1816 : * recovery target, if we shutdown and begin again. After this,
1817 : * Resource Managers may choose to do permanent corrective actions
1818 : * at end of recovery.
1819 : */
1820 7 : switch (recoveryTargetAction)
1821 : {
1822 0 : case RECOVERY_TARGET_ACTION_SHUTDOWN:
1823 :
1824 : /*
1825 : * exit with special return code to request shutdown of
1826 : * postmaster. Log messages issued from postmaster.
1827 : */
1828 0 : proc_exit(3);
1829 :
1830 1 : case RECOVERY_TARGET_ACTION_PAUSE:
/* Request a pause, then wait in recoveryPausesHere(). */
1831 1 : SetRecoveryPause(true);
1832 1 : recoveryPausesHere(true);
1833 :
1834 : /* drop into promote */
1835 : pg_fallthrough;
1836 :
1837 7 : case RECOVERY_TARGET_ACTION_PROMOTE:
1838 7 : break;
1839 : }
1840 : }
1841 :
1842 155 : RmgrCleanup();
1843 :
1844 155 : ereport(LOG,
1845 : errmsg("redo done at %X/%08X system usage: %s",
1846 : LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1847 : pg_rusage_show(&ru0)));
/* A zero xtime is treated as "nothing to report" and the message is skipped. */
1848 155 : xtime = GetLatestXTime();
1849 155 : if (xtime)
1850 38 : ereport(LOG,
1851 : (errmsg("last completed transaction was at log time %s",
1852 : timestamptz_to_str(xtime))));
1853 :
1854 155 : InRedo = false;
1855 : }
1856 : else
1857 : {
1858 : /* there are no WAL records following the checkpoint */
1859 9 : ereport(LOG,
1860 : (errmsg("redo is not required")));
1861 : }
1862 :
1863 : /*
1864 : * This check is intentionally after the above log messages that indicate
1865 : * how far recovery went.
 *
 * It fires only when archive recovery was requested, a recovery target
 * is configured, and neither target check in the loop above triggered.
1866 : */
1867 164 : if (ArchiveRecoveryRequested &&
1868 56 : recoveryTarget != RECOVERY_TARGET_UNSET &&
1869 8 : !reachedRecoveryTarget)
1870 1 : ereport(FATAL,
1871 : (errcode(ERRCODE_CONFIG_FILE_ERROR),
1872 : errmsg("recovery ended before configured recovery target was reached")));
1873 163 : }
1874 :
1875 : /*
1876 : * Subroutine of PerformWalRecovery, to apply one WAL record.
 *
 * xlogreader: reader state for the record being replayed. Its ReadRecPtr
 *		and EndRecPtr are published into XLogRecoveryCtl, and it is handed
 *		to the resource manager's redo routine.
 * record: header of that record; xl_rmid, xl_info and xl_xid are used here.
 * replayTLI: in/out. On entry, the timeline currently being replayed. If
 *		the record is a shutdown checkpoint or end-of-recovery record that
 *		names a different timeline, *replayTLI is set to the new timeline
 *		before the record is replayed.
 *
 * Order of operations: push an error-context callback, advance nextXid,
 * detect a timeline switch, publish replayEndRecPtr/replayEndTLI, track
 * the record's XID for hot standby, run the recovery-specific and rmgr
 * redo routines, optionally verify page consistency, pop the error
 * context, publish lastReplayed*, wake walsenders / walreceiver, re-check
 * recovery consistency, and finally do post-timeline-switch cleanup.
1877 : */
1878 : static void
1879 2884370 : ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
1880 : {
1881 : ErrorContextCallback errcallback;
1882 2884370 : bool switchedTLI = false;
1883 :
1884 : /* Setup error traceback support for ereport() */
1885 2884370 : errcallback.callback = rm_redo_error_callback;
1886 2884370 : errcallback.arg = xlogreader;
1887 2884370 : errcallback.previous = error_context_stack;
1888 2884370 : error_context_stack = &errcallback;
1889 :
1890 : /*
1891 : * TransamVariables->nextXid must be beyond record's xid.
1892 : */
1893 2884370 : AdvanceNextFullTransactionIdPastXid(record->xl_xid);
1894 :
1895 : /*
1896 : * Before replaying this record, check if this record causes the current
1897 : * timeline to change. The record is already considered to be part of the
1898 : * new timeline, so we update replayTLI before replaying it. That's
1899 : * important so that replayEndTLI, which is recorded as the minimum
1900 : * recovery point's TLI if recovery stops after this record, is set
1901 : * correctly.
1902 : */
1903 2884370 : if (record->xl_rmid == RM_XLOG_ID)
1904 : {
/* Start from "no change"; only the two record types below override this. */
1905 106857 : TimeLineID newReplayTLI = *replayTLI;
1906 106857 : TimeLineID prevReplayTLI = *replayTLI;
1907 106857 : uint8 info = record->xl_info & ~XLR_INFO_MASK;
1908 :
1909 106857 : if (info == XLOG_CHECKPOINT_SHUTDOWN)
1910 : {
1911 : CheckPoint checkPoint;
1912 :
1913 41 : memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1914 41 : newReplayTLI = checkPoint.ThisTimeLineID;
1915 41 : prevReplayTLI = checkPoint.PrevTimeLineID;
1916 : }
1917 106816 : else if (info == XLOG_END_OF_RECOVERY)
1918 : {
1919 : xl_end_of_recovery xlrec;
1920 :
1921 10 : memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1922 10 : newReplayTLI = xlrec.ThisTimeLineID;
1923 10 : prevReplayTLI = xlrec.PrevTimeLineID;
1924 : }
1925 :
1926 106857 : if (newReplayTLI != *replayTLI)
1927 : {
1928 : /* Check that it's OK to switch to this TLI */
1929 11 : checkTimeLineSwitch(xlogreader->EndRecPtr,
1930 : newReplayTLI, prevReplayTLI, *replayTLI);
1931 :
1932 : /* Following WAL records should be run with new TLI */
1933 11 : *replayTLI = newReplayTLI;
1934 11 : switchedTLI = true;
1935 : }
1936 : }
1937 :
1938 : /*
1939 : * Update shared replayEndRecPtr before replaying this record, so that
1940 : * XLogFlush will update minRecoveryPoint correctly.
1941 : */
1942 2884370 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1943 2884370 : XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
1944 2884370 : XLogRecoveryCtl->replayEndTLI = *replayTLI;
1945 2884370 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1946 :
1947 : /*
1948 : * If we are attempting to enter Hot Standby mode, process XIDs we see
1949 : */
1950 2884370 : if (standbyState >= STANDBY_INITIALIZED &&
1951 2578129 : TransactionIdIsValid(record->xl_xid))
1952 2523495 : RecordKnownAssignedTransactionIds(record->xl_xid);
1953 :
1954 : /*
1955 : * Some XLOG record types that are related to recovery are processed
1956 : * directly here, rather than in xlog_redo()
 *
 * This runs in addition to, and before, the resource manager's own
 * redo routine invoked just below.
1957 : */
1958 2884370 : if (record->xl_rmid == RM_XLOG_ID)
1959 106857 : xlogrecovery_redo(xlogreader, *replayTLI);
1960 :
1961 : /* Now apply the WAL record itself */
1962 2884370 : GetRmgr(record->xl_rmid).rm_redo(xlogreader);
1963 :
1964 : /*
1965 : * After redo, check whether the backup pages associated with the WAL
1966 : * record are consistent with the existing pages. This check is done only
1967 : * if consistency check is enabled for this record.
1968 : */
1969 2884368 : if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
1970 2234811 : verifyBackupPageConsistency(xlogreader);
1971 :
1972 : /* Pop the error context stack */
1973 2884368 : error_context_stack = errcallback.previous;
1974 :
1975 : /*
1976 : * Update lastReplayedEndRecPtr after this record has been successfully
1977 : * replayed.
1978 : */
1979 2884368 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1980 2884368 : XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
1981 2884368 : XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
1982 2884368 : XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
1983 2884368 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
1984 :
1985 : /* ------
1986 : * Wakeup walsenders:
1987 : *
1988 : * On the standby, the WAL is flushed first (which will only wake up
1989 : * physical walsenders) and then applied, which will only wake up logical
1990 : * walsenders.
1991 : *
1992 : * Indeed, logical walsenders on standby can't decode and send data until
1993 : * it's been applied.
1994 : *
1995 : * Physical walsenders don't need to be woken up during replay unless
1996 : * cascading replication is allowed and time line change occurred (so that
1997 : * they can notice that they are on a new time line).
1998 : *
1999 : * That's why the wake up conditions are for:
2000 : *
2001 : * - physical walsenders in case of new time line and cascade
2002 : * replication is allowed
2003 : * - logical walsenders in case cascade replication is allowed (could not
2004 : * be created otherwise)
 *
 * Hence the call below passes switchedTLI for the first (physical)
 * condition and a constant true for the second (logical) one.
2005 : * ------
2006 : */
2007 2884368 : if (AllowCascadeReplication())
2008 2632682 : WalSndWakeup(switchedTLI, true);
2009 :
2010 : /*
2011 : * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2012 : * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2013 : * a reply to the primary.
2014 : */
2015 2884368 : if (doRequestWalReceiverReply)
2016 : {
/* Clear the one-shot request flag before acting on it. */
2017 2 : doRequestWalReceiverReply = false;
2018 2 : WalRcvForceReply();
2019 : }
2020 :
2021 : /* Allow read-only connections if we're consistent now */
2022 2884368 : CheckRecoveryConsistency();
2023 :
2024 : /* Is this a timeline switch? */
2025 2884368 : if (switchedTLI)
2026 : {
2027 : /*
2028 : * Before we continue on the new timeline, clean up any (possibly
2029 : * bogus) future WAL segments on the old timeline.
2030 : */
2031 11 : RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
2032 :
2033 : /* Reset the prefetcher. */
2034 11 : XLogPrefetchReconfigure();
2035 : }
2036 2884368 : }
2037 :
2038 : /*
2039 : * Some XLOG RM record types that are directly related to WAL recovery are
2040 : * handled here rather than in the xlog_redo()
2041 : */
2042 : static void
2043 106857 : xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
2044 : {
2045 106857 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2046 106857 : XLogRecPtr lsn = record->EndRecPtr;
2047 :
2048 : Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2049 :
2050 106857 : if (info == XLOG_OVERWRITE_CONTRECORD)
2051 : {
2052 : /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2053 : xl_overwrite_contrecord xlrec;
2054 :
2055 1 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2056 1 : if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2057 0 : elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
2058 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2059 : LSN_FORMAT_ARGS(record->overwrittenRecPtr));
2060 :
2061 : /* We have safely skipped the aborted record */
2062 1 : abortedRecPtr = InvalidXLogRecPtr;
2063 1 : missingContrecPtr = InvalidXLogRecPtr;
2064 :
2065 1 : ereport(LOG,
2066 : errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
2067 : LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2068 : timestamptz_to_str(xlrec.overwrite_time)));
2069 :
2070 : /* Verifying the record should only happen once */
2071 1 : record->overwrittenRecPtr = InvalidXLogRecPtr;
2072 : }
2073 106856 : else if (info == XLOG_BACKUP_END)
2074 : {
2075 : XLogRecPtr startpoint;
2076 :
2077 95 : memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2078 :
2079 95 : if (backupStartPoint == startpoint)
2080 : {
2081 : /*
2082 : * We have reached the end of base backup, the point where
2083 : * pg_backup_stop() was done. The data on disk is now consistent
2084 : * (assuming we have also reached minRecoveryPoint). Set
2085 : * backupEndPoint to the current LSN, so that the next call to
2086 : * CheckRecoveryConsistency() will notice it and do the
2087 : * end-of-backup processing.
2088 : */
2089 78 : elog(DEBUG1, "end of backup record reached");
2090 :
2091 78 : backupEndPoint = lsn;
2092 : }
2093 : else
2094 17 : elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
2095 : LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
2096 : }
2097 106857 : }
2098 :
2099 : /*
2100 : * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2101 : * directories.
2102 : *
2103 : * Replay of database creation XLOG records for databases that were later
2104 : * dropped can create fake directories in pg_tblspc. By the time consistency
2105 : * is reached these directories should have been removed; here we verify
2106 : * that this did indeed happen. This is to be called at the point where
2107 : * consistent state is reached.
2108 : *
2109 : * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2110 : * useful for testing purposes, and also allows for an escape hatch in case
2111 : * things go south.
2112 : */
2113 : static void
2114 123 : CheckTablespaceDirectory(void)
2115 : {
2116 : DIR *dir;
2117 : struct dirent *de;
2118 :
2119 123 : dir = AllocateDir(PG_TBLSPC_DIR);
2120 376 : while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2121 : {
2122 : char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2123 :
2124 : /* Skip entries of non-oid names */
2125 253 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2126 246 : continue;
2127 :
2128 7 : snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2129 :
2130 7 : if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2131 4 : ereport(allow_in_place_tablespaces ? WARNING : PANIC,
2132 : (errcode(ERRCODE_DATA_CORRUPTED),
2133 : errmsg("unexpected directory entry \"%s\" found in %s",
2134 : de->d_name, PG_TBLSPC_DIR),
2135 : errdetail("All directory entries in %s/ should be symbolic links.",
2136 : PG_TBLSPC_DIR),
2137 : errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2138 : }
2139 123 : }
2140 :
2141 : /*
2142 : * Checks if recovery has reached a consistent state. When consistency is
2143 : * reached and we have a valid starting standby snapshot, tell postmaster
2144 : * that it can start accepting read-only connections.
2145 : */
2146 : static void
2147 2884593 : CheckRecoveryConsistency(void)
2148 : {
2149 : XLogRecPtr lastReplayedEndRecPtr;
2150 : TimeLineID lastReplayedTLI;
2151 :
2152 : /*
2153 : * During crash recovery, we don't reach a consistent state until we've
2154 : * replayed all the WAL.
2155 : */
2156 2884593 : if (!XLogRecPtrIsValid(minRecoveryPoint))
2157 320890 : return;
2158 :
2159 : Assert(InArchiveRecovery);
2160 :
2161 : /*
2162 : * assume that we are called in the startup process, and hence don't need
2163 : * a lock to read lastReplayedEndRecPtr
2164 : */
2165 2563703 : lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2166 2563703 : lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2167 :
2168 : /*
2169 : * Have we reached the point where our base backup was completed?
2170 : */
2171 2563703 : if (XLogRecPtrIsValid(backupEndPoint) &&
2172 112 : backupEndPoint <= lastReplayedEndRecPtr)
2173 : {
2174 80 : XLogRecPtr saveBackupStartPoint = backupStartPoint;
2175 80 : XLogRecPtr saveBackupEndPoint = backupEndPoint;
2176 :
2177 80 : elog(DEBUG1, "end of backup reached");
2178 :
2179 : /*
2180 : * We have reached the end of base backup, as indicated by pg_control.
2181 : * Update the control file accordingly.
2182 : */
2183 80 : ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2184 80 : backupStartPoint = InvalidXLogRecPtr;
2185 80 : backupEndPoint = InvalidXLogRecPtr;
2186 80 : backupEndRequired = false;
2187 :
2188 80 : ereport(LOG,
2189 : errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
2190 : LSN_FORMAT_ARGS(saveBackupStartPoint),
2191 : LSN_FORMAT_ARGS(saveBackupEndPoint)));
2192 : }
2193 :
2194 : /*
2195 : * Have we passed our safe starting point? Note that minRecoveryPoint is
2196 : * known to be incorrectly set if recovering from a backup, until the
2197 : * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2198 : * All we know prior to that is that we're not consistent yet.
2199 : */
2200 2563703 : if (!reachedConsistency && !backupEndRequired &&
2201 7863 : minRecoveryPoint <= lastReplayedEndRecPtr)
2202 : {
2203 : /*
2204 : * Check to see if the XLOG sequence contained any unresolved
2205 : * references to uninitialized pages.
2206 : */
2207 123 : XLogCheckInvalidPages();
2208 :
2209 : /*
2210 : * Check that pg_tblspc doesn't contain any real directories. Replay
2211 : * of Database/CREATE_* records may have created fictitious tablespace
2212 : * directories that should have been removed by the time consistency
2213 : * was reached.
2214 : */
2215 123 : CheckTablespaceDirectory();
2216 :
2217 123 : reachedConsistency = true;
2218 123 : SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
2219 123 : ereport(LOG,
2220 : errmsg("consistent recovery state reached at %X/%08X",
2221 : LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
2222 : }
2223 :
2224 : /*
2225 : * Have we got a valid starting snapshot that will allow queries to be
2226 : * run? If so, we can tell postmaster that the database is consistent now,
2227 : * enabling connections.
2228 : */
2229 2563703 : if (standbyState == STANDBY_SNAPSHOT_READY &&
2230 2563460 : !LocalHotStandbyActive &&
2231 114 : reachedConsistency &&
2232 : IsUnderPostmaster)
2233 : {
2234 114 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2235 114 : XLogRecoveryCtl->SharedHotStandbyActive = true;
2236 114 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
2237 :
2238 114 : LocalHotStandbyActive = true;
2239 :
2240 114 : SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
2241 : }
2242 : }
2243 :
2244 : /*
2245 : * Error context callback for errors occurring during rm_redo().
2246 : */
2247 : static void
2248 164 : rm_redo_error_callback(void *arg)
2249 : {
2250 164 : XLogReaderState *record = (XLogReaderState *) arg;
2251 : StringInfoData buf;
2252 :
2253 164 : initStringInfo(&buf);
2254 164 : xlog_outdesc(&buf, record);
2255 164 : xlog_block_info(&buf, record);
2256 :
2257 : /* translator: %s is a WAL record description */
2258 164 : errcontext("WAL redo at %X/%08X for %s",
2259 164 : LSN_FORMAT_ARGS(record->ReadRecPtr),
2260 : buf.data);
2261 :
2262 164 : pfree(buf.data);
2263 164 : }
2264 :
2265 : /*
2266 : * Returns a string describing an XLogRecord, consisting of its identity
2267 : * optionally followed by a colon, a space, and a further description.
2268 : */
2269 : void
2270 164 : xlog_outdesc(StringInfo buf, XLogReaderState *record)
2271 : {
2272 164 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2273 164 : uint8 info = XLogRecGetInfo(record);
2274 : const char *id;
2275 :
2276 164 : appendStringInfoString(buf, rmgr.rm_name);
2277 164 : appendStringInfoChar(buf, '/');
2278 :
2279 164 : id = rmgr.rm_identify(info);
2280 164 : if (id == NULL)
2281 0 : appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2282 : else
2283 164 : appendStringInfo(buf, "%s: ", id);
2284 :
2285 164 : rmgr.rm_desc(buf, record);
2286 164 : }
2287 :
#ifdef WAL_DEBUG

/*
 * Append the generic header fields of a WAL record to 'buf': previous-record
 * pointer, transaction id and main-data length, followed by the record's
 * block references.  Only compiled when WAL_DEBUG is defined.
 */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	XLogRecPtr	prev_lsn = XLogRecGetPrev(record);
	TransactionId rec_xid = XLogRecGetXid(record);
	uint32		data_len = XLogRecGetDataLen(record);

	appendStringInfo(buf, "prev %X/%08X; xid %u",
					 LSN_FORMAT_ARGS(prev_lsn),
					 rec_xid);

	appendStringInfo(buf, "; len %u",
					 data_len);

	xlog_block_info(buf, record);
}
#endif							/* WAL_DEBUG */
2303 :
2304 : /*
2305 : * Returns a string giving information about all the blocks in an
2306 : * XLogRecord.
2307 : */
2308 : static void
2309 164 : xlog_block_info(StringInfo buf, XLogReaderState *record)
2310 : {
2311 : int block_id;
2312 :
2313 : /* decode block references */
2314 224 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2315 : {
2316 : RelFileLocator rlocator;
2317 : ForkNumber forknum;
2318 : BlockNumber blk;
2319 :
2320 60 : if (!XLogRecGetBlockTagExtended(record, block_id,
2321 : &rlocator, &forknum, &blk, NULL))
2322 0 : continue;
2323 :
2324 60 : if (forknum != MAIN_FORKNUM)
2325 4 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2326 : block_id,
2327 : rlocator.spcOid, rlocator.dbOid,
2328 : rlocator.relNumber,
2329 : forknum,
2330 : blk);
2331 : else
2332 56 : appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2333 : block_id,
2334 : rlocator.spcOid, rlocator.dbOid,
2335 : rlocator.relNumber,
2336 : blk);
2337 60 : if (XLogRecHasBlockImage(record, block_id))
2338 41 : appendStringInfoString(buf, " FPW");
2339 : }
2340 164 : }
2341 :
2342 :
2343 : /*
2344 : * Check that it's OK to switch to new timeline during recovery.
2345 : *
2346 : * 'lsn' is the address of the shutdown checkpoint record we're about to
2347 : * replay. (Currently, timeline can only change at a shutdown checkpoint).
2348 : */
2349 : static void
2350 11 : checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
2351 : TimeLineID replayTLI)
2352 : {
2353 : /* Check that the record agrees on what the current (old) timeline is */
2354 11 : if (prevTLI != replayTLI)
2355 0 : ereport(PANIC,
2356 : (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2357 : prevTLI, replayTLI)));
2358 :
2359 : /*
2360 : * The new timeline better be in the list of timelines we expect to see,
2361 : * according to the timeline history. It should also not decrease.
2362 : */
2363 11 : if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2364 0 : ereport(PANIC,
2365 : (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2366 : newTLI, replayTLI)));
2367 :
2368 : /*
2369 : * If we have not yet reached min recovery point, and we're about to
2370 : * switch to a timeline greater than the timeline of the min recovery
2371 : * point: trouble. After switching to the new timeline, we could not
2372 : * possibly visit the min recovery point on the correct timeline anymore.
2373 : * This can happen if there is a newer timeline in the archive that
2374 : * branched before the timeline the min recovery point is on, and you
2375 : * attempt to do PITR to the new timeline.
2376 : */
2377 11 : if (XLogRecPtrIsValid(minRecoveryPoint) &&
2378 10 : lsn < minRecoveryPoint &&
2379 1 : newTLI > minRecoveryPointTLI)
2380 0 : ereport(PANIC,
2381 : errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
2382 : newTLI,
2383 : LSN_FORMAT_ARGS(minRecoveryPoint),
2384 : minRecoveryPointTLI));
2385 :
2386 : /* Looks good */
2387 11 : }
2388 :
2389 :
2390 : /*
2391 : * Extract timestamp from WAL record.
2392 : *
2393 : * If the record contains a timestamp, returns true, and saves the timestamp
2394 : * in *recordXtime. If the record type has no timestamp, returns false.
2395 : * Currently, only transaction commit/abort records and restore points contain
2396 : * timestamps.
2397 : */
2398 : static bool
2399 46021 : getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
2400 : {
2401 46021 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2402 46021 : uint8 xact_info = info & XLOG_XACT_OPMASK;
2403 46021 : uint8 rmid = XLogRecGetRmid(record);
2404 :
2405 46021 : if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2406 : {
2407 2 : *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2408 2 : return true;
2409 : }
2410 46019 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2411 : xact_info == XLOG_XACT_COMMIT_PREPARED))
2412 : {
2413 42135 : *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2414 42135 : return true;
2415 : }
2416 3884 : if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2417 : xact_info == XLOG_XACT_ABORT_PREPARED))
2418 : {
2419 3884 : *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2420 3884 : return true;
2421 : }
2422 0 : return false;
2423 : }
2424 :
2425 : /*
2426 : * Checks whether the current buffer page and backup page stored in the
2427 : * WAL record are consistent or not. Before comparing the two pages, a
2428 : * masking can be applied to the pages to ignore certain areas like hint bits,
2429 : * unused space between pd_lower and pd_upper among other things. This
2430 : * function should be called once WAL replay has been completed for a
2431 : * given record.
2432 : */
2433 : static void
2434 2234811 : verifyBackupPageConsistency(XLogReaderState *record)
2435 : {
2436 2234811 : RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2437 : RelFileLocator rlocator;
2438 : ForkNumber forknum;
2439 : BlockNumber blkno;
2440 : int block_id;
2441 :
2442 : /* Records with no backup blocks have no need for consistency checks. */
2443 2234811 : if (!XLogRecHasAnyBlockRefs(record))
2444 72 : return;
2445 :
2446 : Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2447 :
2448 4639684 : for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2449 : {
2450 : Buffer buf;
2451 : Page page;
2452 :
2453 2404945 : if (!XLogRecGetBlockTagExtended(record, block_id,
2454 : &rlocator, &forknum, &blkno, NULL))
2455 : {
2456 : /*
2457 : * WAL record doesn't contain a block reference with the given id.
2458 : * Do nothing.
2459 : */
2460 2129 : continue;
2461 : }
2462 :
2463 : Assert(XLogRecHasBlockImage(record, block_id));
2464 :
2465 2402816 : if (XLogRecBlockImageApply(record, block_id))
2466 : {
2467 : /*
2468 : * WAL record has already applied the page, so bypass the
2469 : * consistency check as that would result in comparing the full
2470 : * page stored in the record with itself.
2471 : */
2472 28129 : continue;
2473 : }
2474 :
2475 : /*
2476 : * Read the contents from the current buffer and store it in a
2477 : * temporary page.
2478 : */
2479 2374687 : buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2480 : RBM_NORMAL_NO_LOG,
2481 : InvalidBuffer);
2482 2374687 : if (!BufferIsValid(buf))
2483 0 : continue;
2484 :
2485 2374687 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2486 2374687 : page = BufferGetPage(buf);
2487 :
2488 : /*
2489 : * Take a copy of the local page where WAL has been applied to have a
2490 : * comparison base before masking it...
2491 : */
2492 2374687 : memcpy(replay_image_masked, page, BLCKSZ);
2493 :
2494 : /* No need for this page anymore now that a copy is in. */
2495 2374687 : UnlockReleaseBuffer(buf);
2496 :
2497 : /*
2498 : * If the block LSN is already ahead of this WAL record, we can't
2499 : * expect contents to match. This can happen if recovery is
2500 : * restarted.
2501 : */
2502 2374687 : if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2503 0 : continue;
2504 :
2505 : /*
2506 : * Read the contents from the backup copy, stored in WAL record and
2507 : * store it in a temporary page. There is no need to allocate a new
2508 : * page here, a local buffer is fine to hold its contents and a mask
2509 : * can be directly applied on it.
2510 : */
2511 2374687 : if (!RestoreBlockImage(record, block_id, primary_image_masked))
2512 0 : ereport(ERROR,
2513 : (errcode(ERRCODE_INTERNAL_ERROR),
2514 : errmsg_internal("%s", record->errormsg_buf)));
2515 :
2516 : /*
2517 : * If masking function is defined, mask both the primary and replay
2518 : * images
2519 : */
2520 2374687 : if (rmgr.rm_mask != NULL)
2521 : {
2522 2374687 : rmgr.rm_mask(replay_image_masked, blkno);
2523 2374687 : rmgr.rm_mask(primary_image_masked, blkno);
2524 : }
2525 :
2526 : /* Time to compare the primary and replay images. */
2527 2374687 : if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2528 : {
2529 0 : elog(FATAL,
2530 : "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2531 : rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2532 : forknum, blkno);
2533 : }
2534 : }
2535 : }
2536 :
2537 : /*
2538 : * For point-in-time recovery, this function decides whether we want to
2539 : * stop applying the XLOG before the current record.
2540 : *
2541 : * Returns true if we are stopping, false otherwise. If stopping, some
2542 : * information is saved in recoveryStopXid et al for use in annotating the
2543 : * new timeline's history file.
2544 : */
2545 : static bool
2546 2884372 : recoveryStopsBefore(XLogReaderState *record)
2547 : {
2548 2884372 : bool stopsHere = false;
2549 : uint8 xact_info;
2550 : bool isCommit;
2551 2884372 : TimestampTz recordXtime = 0;
2552 : TransactionId recordXid;
2553 :
2554 : /*
2555 : * Ignore recovery target settings when not in archive recovery (meaning
2556 : * we are in crash recovery).
2557 : */
2558 2884372 : if (!ArchiveRecoveryRequested)
2559 306227 : return false;
2560 :
2561 : /* Check if we should stop as soon as reaching consistency */
2562 2578145 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2563 : {
2564 0 : ereport(LOG,
2565 : (errmsg("recovery stopping after reaching consistency")));
2566 :
2567 0 : recoveryStopAfter = false;
2568 0 : recoveryStopXid = InvalidTransactionId;
2569 0 : recoveryStopLSN = InvalidXLogRecPtr;
2570 0 : recoveryStopTime = 0;
2571 0 : recoveryStopName[0] = '\0';
2572 0 : return true;
2573 : }
2574 :
2575 : /* Check if target LSN has been reached */
2576 2578145 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2577 8564 : !recoveryTargetInclusive &&
2578 493 : record->ReadRecPtr >= recoveryTargetLSN)
2579 : {
2580 2 : recoveryStopAfter = false;
2581 2 : recoveryStopXid = InvalidTransactionId;
2582 2 : recoveryStopLSN = record->ReadRecPtr;
2583 2 : recoveryStopTime = 0;
2584 2 : recoveryStopName[0] = '\0';
2585 2 : ereport(LOG,
2586 : errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
2587 : LSN_FORMAT_ARGS(recoveryStopLSN)));
2588 2 : return true;
2589 : }
2590 :
2591 : /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2592 2578143 : if (XLogRecGetRmid(record) != RM_XACT_ID)
2593 2554846 : return false;
2594 :
2595 23297 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2596 :
2597 23297 : if (xact_info == XLOG_XACT_COMMIT)
2598 : {
2599 21028 : isCommit = true;
2600 21028 : recordXid = XLogRecGetXid(record);
2601 : }
2602 2269 : else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2603 : {
2604 26 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2605 : xl_xact_parsed_commit parsed;
2606 :
2607 26 : isCommit = true;
2608 26 : ParseCommitRecord(XLogRecGetInfo(record),
2609 : xlrec,
2610 : &parsed);
2611 26 : recordXid = parsed.twophase_xid;
2612 : }
2613 2243 : else if (xact_info == XLOG_XACT_ABORT)
2614 : {
2615 1927 : isCommit = false;
2616 1927 : recordXid = XLogRecGetXid(record);
2617 : }
2618 316 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2619 : {
2620 15 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2621 : xl_xact_parsed_abort parsed;
2622 :
2623 15 : isCommit = false;
2624 15 : ParseAbortRecord(XLogRecGetInfo(record),
2625 : xlrec,
2626 : &parsed);
2627 15 : recordXid = parsed.twophase_xid;
2628 : }
2629 : else
2630 301 : return false;
2631 :
2632 22996 : if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
2633 : {
2634 : /*
2635 : * There can be only one transaction end record with this exact
2636 : * transactionid
2637 : *
2638 : * when testing for an xid, we MUST test for equality only, since
2639 : * transactions are numbered in the order they start, not the order
2640 : * they complete. A higher numbered xid will complete before you about
2641 : * 50% of the time...
2642 : */
2643 0 : stopsHere = (recordXid == recoveryTargetXid);
2644 : }
2645 :
2646 : /*
2647 : * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2648 : * We don't expect getRecordTimestamp ever to fail, since we already know
2649 : * this is a commit or abort record; but test its result anyway.
2650 : */
2651 22996 : if (getRecordTimestamp(record, &recordXtime) &&
2652 22996 : recoveryTarget == RECOVERY_TARGET_TIME)
2653 : {
2654 : /*
2655 : * There can be many transactions that share the same commit time, so
2656 : * we stop after the last one, if we are inclusive, or stop at the
2657 : * first one if we are exclusive
2658 : */
2659 0 : if (recoveryTargetInclusive)
2660 0 : stopsHere = (recordXtime > recoveryTargetTime);
2661 : else
2662 0 : stopsHere = (recordXtime >= recoveryTargetTime);
2663 : }
2664 :
2665 22996 : if (stopsHere)
2666 : {
2667 0 : recoveryStopAfter = false;
2668 0 : recoveryStopXid = recordXid;
2669 0 : recoveryStopTime = recordXtime;
2670 0 : recoveryStopLSN = InvalidXLogRecPtr;
2671 0 : recoveryStopName[0] = '\0';
2672 :
2673 0 : if (isCommit)
2674 : {
2675 0 : ereport(LOG,
2676 : (errmsg("recovery stopping before commit of transaction %u, time %s",
2677 : recoveryStopXid,
2678 : timestamptz_to_str(recoveryStopTime))));
2679 : }
2680 : else
2681 : {
2682 0 : ereport(LOG,
2683 : (errmsg("recovery stopping before abort of transaction %u, time %s",
2684 : recoveryStopXid,
2685 : timestamptz_to_str(recoveryStopTime))));
2686 : }
2687 : }
2688 :
2689 22996 : return stopsHere;
2690 : }
2691 :
2692 : /*
2693 : * Same as recoveryStopsBefore, but called after applying the record.
2694 : *
2695 : * We also track the timestamp of the latest applied COMMIT/ABORT
2696 : * record in XLogRecoveryCtl->recoveryLastXTime.
2697 : */
2698 : static bool
2699 2884368 : recoveryStopsAfter(XLogReaderState *record)
2700 : {
2701 : uint8 info;
2702 : uint8 xact_info;
2703 : uint8 rmid;
2704 2884368 : TimestampTz recordXtime = 0;
2705 :
2706 : /*
2707 : * Ignore recovery target settings when not in archive recovery (meaning
2708 : * we are in crash recovery).
2709 : */
2710 2884368 : if (!ArchiveRecoveryRequested)
2711 306227 : return false;
2712 :
2713 2578141 : info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2714 2578141 : rmid = XLogRecGetRmid(record);
2715 :
2716 : /*
2717 : * There can be many restore points that share the same name; we stop at
2718 : * the first one.
2719 : */
2720 2578141 : if (recoveryTarget == RECOVERY_TARGET_NAME &&
2721 22 : rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2722 : {
2723 : xl_restore_point *recordRestorePointData;
2724 :
2725 3 : recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2726 :
2727 3 : if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2728 : {
2729 2 : recoveryStopAfter = true;
2730 2 : recoveryStopXid = InvalidTransactionId;
2731 2 : recoveryStopLSN = InvalidXLogRecPtr;
2732 2 : (void) getRecordTimestamp(record, &recoveryStopTime);
2733 2 : strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2734 :
2735 2 : ereport(LOG,
2736 : (errmsg("recovery stopping at restore point \"%s\", time %s",
2737 : recoveryStopName,
2738 : timestamptz_to_str(recoveryStopTime))));
2739 2 : return true;
2740 : }
2741 : }
2742 :
2743 : /* Check if the target LSN has been reached */
2744 2578139 : if (recoveryTarget == RECOVERY_TARGET_LSN &&
2745 8071 : recoveryTargetInclusive &&
2746 8071 : record->ReadRecPtr >= recoveryTargetLSN)
2747 : {
2748 3 : recoveryStopAfter = true;
2749 3 : recoveryStopXid = InvalidTransactionId;
2750 3 : recoveryStopLSN = record->ReadRecPtr;
2751 3 : recoveryStopTime = 0;
2752 3 : recoveryStopName[0] = '\0';
2753 3 : ereport(LOG,
2754 : errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
2755 : LSN_FORMAT_ARGS(recoveryStopLSN)));
2756 3 : return true;
2757 : }
2758 :
2759 2578136 : if (rmid != RM_XACT_ID)
2760 2554841 : return false;
2761 :
2762 23295 : xact_info = info & XLOG_XACT_OPMASK;
2763 :
2764 23295 : if (xact_info == XLOG_XACT_COMMIT ||
2765 2243 : xact_info == XLOG_XACT_COMMIT_PREPARED ||
2766 316 : xact_info == XLOG_XACT_ABORT ||
2767 : xact_info == XLOG_XACT_ABORT_PREPARED)
2768 : {
2769 : TransactionId recordXid;
2770 :
2771 : /* Update the last applied transaction timestamp */
2772 22994 : if (getRecordTimestamp(record, &recordXtime))
2773 22994 : SetLatestXTime(recordXtime);
2774 :
2775 : /* Extract the XID of the committed/aborted transaction */
2776 22994 : if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2777 : {
2778 26 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2779 : xl_xact_parsed_commit parsed;
2780 :
2781 26 : ParseCommitRecord(XLogRecGetInfo(record),
2782 : xlrec,
2783 : &parsed);
2784 26 : recordXid = parsed.twophase_xid;
2785 : }
2786 22968 : else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2787 : {
2788 15 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2789 : xl_xact_parsed_abort parsed;
2790 :
2791 15 : ParseAbortRecord(XLogRecGetInfo(record),
2792 : xlrec,
2793 : &parsed);
2794 15 : recordXid = parsed.twophase_xid;
2795 : }
2796 : else
2797 22953 : recordXid = XLogRecGetXid(record);
2798 :
2799 : /*
2800 : * There can be only one transaction end record with this exact
2801 : * transactionid
2802 : *
2803 : * when testing for an xid, we MUST test for equality only, since
2804 : * transactions are numbered in the order they start, not the order
2805 : * they complete. A higher numbered xid will complete before you about
2806 : * 50% of the time...
2807 : */
2808 22994 : if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
2809 0 : recordXid == recoveryTargetXid)
2810 : {
2811 0 : recoveryStopAfter = true;
2812 0 : recoveryStopXid = recordXid;
2813 0 : recoveryStopTime = recordXtime;
2814 0 : recoveryStopLSN = InvalidXLogRecPtr;
2815 0 : recoveryStopName[0] = '\0';
2816 :
2817 0 : if (xact_info == XLOG_XACT_COMMIT ||
2818 : xact_info == XLOG_XACT_COMMIT_PREPARED)
2819 : {
2820 0 : ereport(LOG,
2821 : (errmsg("recovery stopping after commit of transaction %u, time %s",
2822 : recoveryStopXid,
2823 : timestamptz_to_str(recoveryStopTime))));
2824 : }
2825 0 : else if (xact_info == XLOG_XACT_ABORT ||
2826 : xact_info == XLOG_XACT_ABORT_PREPARED)
2827 : {
2828 0 : ereport(LOG,
2829 : (errmsg("recovery stopping after abort of transaction %u, time %s",
2830 : recoveryStopXid,
2831 : timestamptz_to_str(recoveryStopTime))));
2832 : }
2833 0 : return true;
2834 : }
2835 : }
2836 :
2837 : /* Check if we should stop as soon as reaching consistency */
2838 23295 : if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2839 : {
2840 0 : ereport(LOG,
2841 : (errmsg("recovery stopping after reaching consistency")));
2842 :
2843 0 : recoveryStopAfter = true;
2844 0 : recoveryStopXid = InvalidTransactionId;
2845 0 : recoveryStopTime = 0;
2846 0 : recoveryStopLSN = InvalidXLogRecPtr;
2847 0 : recoveryStopName[0] = '\0';
2848 0 : return true;
2849 : }
2850 :
2851 23295 : return false;
2852 : }
2853 :
2854 : /*
2855 : * Create a comment for the history file to explain why and where
2856 : * timeline changed.
2857 : */
2858 : static char *
2859 970 : getRecoveryStopReason(void)
2860 : {
2861 : char reason[200];
2862 :
2863 970 : if (recoveryTarget == RECOVERY_TARGET_XID)
2864 0 : snprintf(reason, sizeof(reason),
2865 : "%s transaction %u",
2866 0 : recoveryStopAfter ? "after" : "before",
2867 : recoveryStopXid);
2868 970 : else if (recoveryTarget == RECOVERY_TARGET_TIME)
2869 0 : snprintf(reason, sizeof(reason),
2870 : "%s %s\n",
2871 0 : recoveryStopAfter ? "after" : "before",
2872 : timestamptz_to_str(recoveryStopTime));
2873 970 : else if (recoveryTarget == RECOVERY_TARGET_LSN)
2874 6 : snprintf(reason, sizeof(reason),
2875 : "%s LSN %X/%08X\n",
2876 6 : recoveryStopAfter ? "after" : "before",
2877 6 : LSN_FORMAT_ARGS(recoveryStopLSN));
2878 964 : else if (recoveryTarget == RECOVERY_TARGET_NAME)
2879 3 : snprintf(reason, sizeof(reason),
2880 : "at restore point \"%s\"",
2881 : recoveryStopName);
2882 961 : else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
2883 0 : snprintf(reason, sizeof(reason), "reached consistency");
2884 : else
2885 961 : snprintf(reason, sizeof(reason), "no recovery target specified");
2886 :
2887 970 : return pstrdup(reason);
2888 : }
2889 :
2890 : /*
2891 : * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2892 : *
2893 : * endOfRecovery is true if the recovery target is reached and
2894 : * the paused state starts at the end of recovery because of
2895 : * recovery_target_action=pause, and false otherwise.
2896 : */
2897 : static void
2898 5 : recoveryPausesHere(bool endOfRecovery)
2899 : {
2900 : /* Don't pause unless users can connect! */
2901 5 : if (!LocalHotStandbyActive)
2902 0 : return;
2903 :
2904 : /* Don't pause after standby promotion has been triggered */
2905 5 : if (LocalPromoteIsTriggered)
2906 0 : return;
2907 :
2908 5 : if (endOfRecovery)
2909 1 : ereport(LOG,
2910 : (errmsg("pausing at the end of recovery"),
2911 : errhint("Execute pg_wal_replay_resume() to promote.")));
2912 : else
2913 4 : ereport(LOG,
2914 : (errmsg("recovery has paused"),
2915 : errhint("Execute pg_wal_replay_resume() to continue.")));
2916 :
2917 : /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2918 16 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
2919 : {
2920 13 : ProcessStartupProcInterrupts();
2921 13 : if (CheckForStandbyTrigger())
2922 2 : return;
2923 :
2924 : /*
2925 : * If recovery pause is requested then set it paused. While we are in
2926 : * the loop, user might resume and pause again so set this every time.
2927 : */
2928 11 : ConfirmRecoveryPaused();
2929 :
2930 : /*
2931 : * We wait on a condition variable that will wake us as soon as the
2932 : * pause ends, but we use a timeout so we can check the above exit
2933 : * condition periodically too.
2934 : */
2935 11 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
2936 : WAIT_EVENT_RECOVERY_PAUSE);
2937 : }
2938 3 : ConditionVariableCancelSleep();
2939 : }
2940 :
2941 : /*
2942 : * When recovery_min_apply_delay is set, we wait long enough to make sure
2943 : * certain record types are applied at least that interval behind the primary.
2944 : *
2945 : * Returns true if we waited.
2946 : *
2947 : * Note that the delay is calculated between the WAL record log time and
2948 : * the current time on standby. We would prefer to keep track of when this
2949 : * standby received each WAL record, which would allow a more consistent
2950 : * approach and one not affected by time synchronisation issues, but that
2951 : * is significantly more effort and complexity for little actual gain in
2952 : * usability.
2953 : */
2954 : static bool
2955 2884370 : recoveryApplyDelay(XLogReaderState *record)
2956 : {
2957 : uint8 xact_info;
2958 : TimestampTz xtime;
2959 : TimestampTz delayUntil;
2960 : long msecs;
2961 :
2962 : /* nothing to do if no delay configured */
2963 2884370 : if (recovery_min_apply_delay <= 0)
2964 2884227 : return false;
2965 :
2966 : /* no delay is applied on a database not yet consistent */
2967 143 : if (!reachedConsistency)
2968 4 : return false;
2969 :
2970 : /* nothing to do if crash recovery is requested */
2971 139 : if (!ArchiveRecoveryRequested)
2972 0 : return false;
2973 :
2974 : /*
2975 : * Is it a COMMIT record?
2976 : *
2977 : * We deliberately choose not to delay aborts since they have no effect on
2978 : * MVCC. We already allow replay of records that don't have a timestamp,
2979 : * so there is already opportunity for issues caused by early conflicts on
2980 : * standbys.
2981 : */
2982 139 : if (XLogRecGetRmid(record) != RM_XACT_ID)
2983 110 : return false;
2984 :
2985 29 : xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2986 :
2987 29 : if (xact_info != XLOG_XACT_COMMIT &&
2988 : xact_info != XLOG_XACT_COMMIT_PREPARED)
2989 0 : return false;
2990 :
2991 29 : if (!getRecordTimestamp(record, &xtime))
2992 0 : return false;
2993 :
2994 29 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
2995 :
2996 : /*
2997 : * Exit without arming the latch if it's already past time to apply this
2998 : * record
2999 : */
3000 29 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
3001 29 : if (msecs <= 0)
3002 10 : return false;
3003 :
3004 : while (true)
3005 : {
3006 51 : ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3007 :
3008 : /* This might change recovery_min_apply_delay. */
3009 51 : ProcessStartupProcInterrupts();
3010 :
3011 51 : if (CheckForStandbyTrigger())
3012 0 : break;
3013 :
3014 : /*
3015 : * Recalculate delayUntil as recovery_min_apply_delay could have
3016 : * changed while waiting in this loop.
3017 : */
3018 51 : delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3019 :
3020 : /*
3021 : * Wait for difference between GetCurrentTimestamp() and delayUntil.
3022 : */
3023 51 : msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
3024 : delayUntil);
3025 :
3026 51 : if (msecs <= 0)
3027 19 : break;
3028 :
3029 32 : elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3030 :
3031 32 : (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3032 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
3033 : msecs,
3034 : WAIT_EVENT_RECOVERY_APPLY_DELAY);
3035 : }
3036 19 : return true;
3037 : }
3038 :
3039 : /*
3040 : * Get the current state of the recovery pause request.
3041 : */
3042 : RecoveryPauseState
3043 22 : GetRecoveryPauseState(void)
3044 : {
3045 : RecoveryPauseState state;
3046 :
3047 22 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3048 22 : state = XLogRecoveryCtl->recoveryPauseState;
3049 22 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3050 :
3051 22 : return state;
3052 : }
3053 :
3054 : /*
3055 : * Set the recovery pause state.
3056 : *
3057 : * If recovery pause is requested then sets the recovery pause state to
3058 : * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3059 : * to 'not paused' to resume the recovery. The recovery pause will be
3060 : * confirmed by the ConfirmRecoveryPaused.
3061 : */
3062 : void
3063 56 : SetRecoveryPause(bool recoveryPause)
3064 : {
3065 56 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3066 :
3067 56 : if (!recoveryPause)
3068 51 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
3069 5 : else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
3070 5 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
3071 :
3072 56 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3073 :
3074 56 : if (!recoveryPause)
3075 51 : ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
3076 56 : }
3077 :
3078 : /*
3079 : * Confirm the recovery pause by setting the recovery pause state to
3080 : * RECOVERY_PAUSED.
3081 : */
3082 : static void
3083 11 : ConfirmRecoveryPaused(void)
3084 : {
3085 : /* If recovery pause is requested then set it paused */
3086 11 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3087 11 : if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
3088 5 : XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
3089 11 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
3090 11 : }
3091 :
3092 :
3093 : /*
3094 : * Attempt to read the next XLOG record.
3095 : *
3096 : * Before first call, the reader needs to be positioned to the first record
3097 : * by calling XLogPrefetcherBeginRead().
3098 : *
3099 : * If no valid record is available, returns NULL, or fails if emode is PANIC.
3100 : * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3101 : * record is available.
3102 : */
3103 : static XLogRecord *
3104 2886716 : ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
3105 : bool fetching_ckpt, TimeLineID replayTLI)
3106 : {
3107 : XLogRecord *record;
3108 2886716 : XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher);
3109 2886716 : XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3110 :
3111 : Assert(AmStartupProcess() || !IsUnderPostmaster);
3112 :
3113 : /* Pass through parameters to XLogPageRead */
3114 2886716 : private->fetching_ckpt = fetching_ckpt;
3115 2886716 : private->emode = emode;
3116 2886716 : private->randAccess = !XLogRecPtrIsValid(xlogreader->ReadRecPtr);
3117 2886716 : private->replayTLI = replayTLI;
3118 :
3119 : /* This is the first attempt to read this page. */
3120 2886716 : lastSourceFailed = false;
3121 :
3122 : for (;;)
3123 141 : {
3124 : char *errormsg;
3125 :
3126 2886857 : record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3127 2886799 : if (record == NULL)
3128 : {
3129 : /*
3130 : * When we find that WAL ends in an incomplete record, keep track
3131 : * of that record. After recovery is done, we'll write a record
3132 : * to indicate to downstream WAL readers that that portion is to
3133 : * be ignored.
3134 : *
3135 : * However, when ArchiveRecoveryRequested = true, we're going to
3136 : * switch to a new timeline at the end of recovery. We will only
3137 : * copy WAL over to the new timeline up to the end of the last
3138 : * complete record, so if we did this, we would later create an
3139 : * overwrite contrecord in the wrong place, breaking everything.
3140 : */
3141 300 : if (!ArchiveRecoveryRequested &&
3142 110 : XLogRecPtrIsValid(xlogreader->abortedRecPtr))
3143 : {
3144 13 : abortedRecPtr = xlogreader->abortedRecPtr;
3145 13 : missingContrecPtr = xlogreader->missingContrecPtr;
3146 : }
3147 :
3148 300 : if (readFile >= 0)
3149 : {
3150 273 : close(readFile);
3151 273 : readFile = -1;
3152 : }
3153 :
3154 : /*
3155 : * We only end up here without a message when XLogPageRead()
3156 : * failed - in that case we already logged something. In
3157 : * StandbyMode that only happens if we have been triggered, so we
3158 : * shouldn't loop anymore in that case.
3159 : */
3160 300 : if (errormsg)
3161 273 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3162 : (errmsg_internal("%s", errormsg) /* already translated */ ));
3163 : }
3164 :
3165 : /*
3166 : * Check page TLI is one of the expected values.
3167 : */
3168 2886499 : else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3169 : {
3170 : char fname[MAXFNAMELEN];
3171 : XLogSegNo segno;
3172 : int32 offset;
3173 :
3174 0 : XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
3175 0 : offset = XLogSegmentOffset(xlogreader->latestPagePtr,
3176 : wal_segment_size);
3177 0 : XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3178 : wal_segment_size);
3179 0 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3180 : errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
3181 : xlogreader->latestPageTLI,
3182 : fname,
3183 : LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
3184 : offset));
3185 0 : record = NULL;
3186 : }
3187 :
3188 2886799 : if (record)
3189 : {
3190 : /* Great, got a record */
3191 2886658 : return record;
3192 : }
3193 : else
3194 : {
3195 : /* No valid record available from this source */
3196 300 : lastSourceFailed = true;
3197 :
3198 : /*
3199 : * If archive recovery was requested, but we were still doing
3200 : * crash recovery, switch to archive recovery and retry using the
3201 : * offline archive. We have now replayed all the valid WAL in
3202 : * pg_wal, so we are presumably now consistent.
3203 : *
3204 : * We require that there's at least some valid WAL present in
3205 : * pg_wal, however (!fetching_ckpt). We could recover using the
3206 : * WAL from the archive, even if pg_wal is completely empty, but
3207 : * we'd have no idea how far we'd have to replay to reach
3208 : * consistency. So err on the safe side and give up.
3209 : */
3210 300 : if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3211 1 : !fetching_ckpt)
3212 : {
3213 1 : ereport(DEBUG1,
3214 : (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3215 1 : InArchiveRecovery = true;
3216 1 : if (StandbyModeRequested)
3217 1 : EnableStandbyMode();
3218 :
3219 1 : SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
3220 1 : minRecoveryPoint = xlogreader->EndRecPtr;
3221 1 : minRecoveryPointTLI = replayTLI;
3222 :
3223 1 : CheckRecoveryConsistency();
3224 :
3225 : /*
3226 : * Before we retry, reset lastSourceFailed and currentSource
3227 : * so that we will check the archive next.
3228 : */
3229 1 : lastSourceFailed = false;
3230 1 : currentSource = XLOG_FROM_ANY;
3231 :
3232 141 : continue;
3233 : }
3234 :
3235 : /* In standby mode, loop back to retry. Otherwise, give up. */
3236 299 : if (StandbyMode && !CheckForStandbyTrigger())
3237 140 : continue;
3238 : else
3239 159 : return NULL;
3240 : }
3241 : }
3242 : }
3243 :
3244 : /*
3245 : * Read the XLOG page containing targetPagePtr into readBuf (if not read
3246 : * already). Returns number of bytes read, if the page is read successfully,
3247 : * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3248 : * but only if they have not been previously reported.
3249 : *
3250 : * See XLogReaderRoutine.page_read for more details.
3251 : *
3252 : * While prefetching, xlogreader->nonblocking may be set. In that case,
3253 : * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3254 : *
3255 : * This is responsible for restoring files from archive as needed, as well
3256 : * as for waiting for the requested WAL record to arrive in standby mode.
3257 : *
3258 : * xlogreader->private_data->emode specifies the log level used for reporting
3259 : * "file not found" or "end of WAL" situations in archive recovery, or in
3260 : * standby mode when promotion is triggered. If set to WARNING or below,
3261 : * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3262 : * levels the ereport() won't return.
3263 : *
3264 : * In standby mode, if after a successful return of XLogPageRead() the
3265 : * caller finds the record it's interested in to be broken, it should
3266 : * ereport the error with the level determined by
3267 : * emode_for_corrupt_record(), and then set lastSourceFailed
3268 : * and call XLogPageRead() again with the same arguments. This lets
3269 : * XLogPageRead() try fetching the record from another source, or
3270 : * sleep and retry.
3271 : */
3272 : static int
3273 1487880 : XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
3274 : XLogRecPtr targetRecPtr, char *readBuf)
3275 : {
3276 1487880 : XLogPageReadPrivate *private =
3277 : (XLogPageReadPrivate *) xlogreader->private_data;
3278 1487880 : int emode = private->emode;
3279 : uint32 targetPageOff;
3280 : XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3281 : int r;
3282 : instr_time io_start;
3283 :
3284 : Assert(AmStartupProcess() || !IsUnderPostmaster);
3285 :
3286 1487880 : XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3287 1487880 : targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3288 :
3289 : /*
3290 : * See if we need to switch to a new segment because the requested record
3291 : * is not in the currently open one.
3292 : */
3293 1487880 : if (readFile >= 0 &&
3294 1486107 : !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3295 : {
3296 : /*
3297 : * Request a restartpoint if we've replayed too much xlog since the
3298 : * last one.
3299 : */
3300 1471 : if (ArchiveRecoveryRequested && IsUnderPostmaster)
3301 : {
3302 1455 : if (XLogCheckpointNeeded(readSegNo))
3303 : {
3304 1333 : (void) GetRedoRecPtr();
3305 1333 : if (XLogCheckpointNeeded(readSegNo))
3306 1325 : RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
3307 : }
3308 : }
3309 :
3310 1471 : close(readFile);
3311 1471 : readFile = -1;
3312 1471 : readSource = XLOG_FROM_ANY;
3313 : }
3314 :
3315 1487880 : XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3316 :
3317 1487882 : retry:
3318 : /* See if we need to retrieve more data */
3319 1487882 : if (readFile < 0 ||
3320 1484636 : (readSource == XLOG_FROM_STREAM &&
3321 1472282 : flushedUpto < targetPagePtr + reqLen))
3322 : {
3323 35113 : if (readFile >= 0 &&
3324 31867 : xlogreader->nonblocking &&
3325 15789 : readSource == XLOG_FROM_STREAM &&
3326 15789 : flushedUpto < targetPagePtr + reqLen)
3327 15789 : return XLREAD_WOULDBLOCK;
3328 :
3329 19266 : switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3330 19324 : private->randAccess,
3331 19324 : private->fetching_ckpt,
3332 : targetRecPtr,
3333 : private->replayTLI,
3334 : xlogreader->EndRecPtr,
3335 19324 : xlogreader->nonblocking))
3336 : {
3337 416 : case XLREAD_WOULDBLOCK:
3338 416 : return XLREAD_WOULDBLOCK;
3339 52 : case XLREAD_FAIL:
3340 52 : if (readFile >= 0)
3341 0 : close(readFile);
3342 52 : readFile = -1;
3343 52 : readLen = 0;
3344 52 : readSource = XLOG_FROM_ANY;
3345 52 : return XLREAD_FAIL;
3346 18798 : case XLREAD_SUCCESS:
3347 18798 : break;
3348 : }
3349 : }
3350 :
3351 : /*
3352 : * At this point, we have the right segment open and if we're streaming we
3353 : * know the requested record is in it.
3354 : */
3355 : Assert(readFile != -1);
3356 :
3357 : /*
3358 : * If the current segment is being streamed from the primary, calculate
3359 : * how much of the current page we have received already. We know the
3360 : * requested record has been received, but this is for the benefit of
3361 : * future calls, to allow quick exit at the top of this function.
3362 : */
3363 1471567 : if (readSource == XLOG_FROM_STREAM)
3364 : {
3365 1457534 : if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3366 1446271 : readLen = XLOG_BLCKSZ;
3367 : else
3368 11263 : readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
3369 : targetPageOff;
3370 : }
3371 : else
3372 14033 : readLen = XLOG_BLCKSZ;
3373 :
3374 : /* Read the requested page */
3375 1471567 : readOff = targetPageOff;
3376 :
3377 : /* Measure I/O timing when reading segment */
3378 1471567 : io_start = pgstat_prepare_io_time(track_wal_io_timing);
3379 :
3380 1471567 : pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3381 1471567 : r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (pgoff_t) readOff);
3382 1471567 : if (r != XLOG_BLCKSZ)
3383 : {
3384 : char fname[MAXFNAMELEN];
3385 0 : int save_errno = errno;
3386 :
3387 0 : pgstat_report_wait_end();
3388 :
3389 0 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3390 : io_start, 1, r);
3391 :
3392 0 : XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
3393 0 : if (r < 0)
3394 : {
3395 0 : errno = save_errno;
3396 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3397 : (errcode_for_file_access(),
3398 : errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
3399 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3400 : readOff)));
3401 : }
3402 : else
3403 0 : ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3404 : (errcode(ERRCODE_DATA_CORRUPTED),
3405 : errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
3406 : fname, LSN_FORMAT_ARGS(targetPagePtr),
3407 : readOff, r, (Size) XLOG_BLCKSZ)));
3408 0 : goto next_record_is_invalid;
3409 : }
3410 1471567 : pgstat_report_wait_end();
3411 :
3412 1471567 : pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3413 : io_start, 1, r);
3414 :
3415 : Assert(targetSegNo == readSegNo);
3416 : Assert(targetPageOff == readOff);
3417 : Assert(reqLen <= readLen);
3418 :
3419 1471567 : xlogreader->seg.ws_tli = curFileTLI;
3420 :
3421 : /*
3422 : * Check the page header immediately, so that we can retry immediately if
3423 : * it's not valid. This may seem unnecessary, because ReadPageInternal()
3424 : * validates the page header anyway, and would propagate the failure up to
3425 : * ReadRecord(), which would retry. However, there's a corner case with
3426 : * continuation records, if a record is split across two pages such that
3427 : * we would need to read the two pages from different sources across two
3428 : * WAL segments.
3429 : *
3430 : * The first page is only available locally, in pg_wal, because it's
3431 : * already been recycled on the primary. The second page, however, is not
3432 : * present in pg_wal, and we should stream it from the primary. There is a
3433 : * recycled WAL segment present in pg_wal, with garbage contents, however.
3434 : * We would read the first page from the local WAL segment, but when
3435 : * reading the second page, we would read the bogus, recycled, WAL
3436 : * segment. If we didn't catch that case here, we would never recover,
3437 : * because ReadRecord() would retry reading the whole record from the
3438 : * beginning.
3439 : *
3440 : * Of course, this only catches errors in the page header, which is what
3441 : * happens in the case of a recycled WAL segment. Other kinds of errors or
3442 : * corruption still has the same problem. But this at least fixes the
3443 : * common case, which can happen as part of normal operation.
3444 : *
3445 : * Validating the page header is cheap enough that doing it twice
3446 : * shouldn't be a big deal from a performance point of view.
3447 : *
3448 : * When not in standby mode, an invalid page header should cause recovery
3449 : * to end, not retry reading the page, so we don't need to validate the
3450 : * page header here for the retry. Instead, ReadPageInternal() is
3451 : * responsible for the validation.
3452 : */
3453 1471567 : if (StandbyMode &&
3454 1461338 : (targetPagePtr % wal_segment_size) == 0 &&
3455 1429 : !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3456 : {
3457 : /*
3458 : * Emit this error right now then retry this page immediately. Use
3459 : * errmsg_internal() because the message was already translated.
3460 : */
3461 3 : if (xlogreader->errormsg_buf[0])
3462 3 : ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3463 : (errmsg_internal("%s", xlogreader->errormsg_buf)));
3464 :
3465 : /* reset any error XLogReaderValidatePageHeader() might have set */
3466 3 : XLogReaderResetError(xlogreader);
3467 3 : goto next_record_is_invalid;
3468 : }
3469 :
3470 1471564 : return readLen;
3471 :
3472 3 : next_record_is_invalid:
3473 :
3474 : /*
3475 : * If we're reading ahead, give up fast. Retries and error reporting will
3476 : * be handled by a later read when recovery catches up to this point.
3477 : */
3478 3 : if (xlogreader->nonblocking)
3479 1 : return XLREAD_WOULDBLOCK;
3480 :
3481 2 : lastSourceFailed = true;
3482 :
3483 2 : if (readFile >= 0)
3484 2 : close(readFile);
3485 2 : readFile = -1;
3486 2 : readLen = 0;
3487 2 : readSource = XLOG_FROM_ANY;
3488 :
3489 : /* In standby-mode, keep trying */
3490 2 : if (StandbyMode)
3491 2 : goto retry;
3492 : else
3493 0 : return XLREAD_FAIL;
3494 : }
3495 :
/*
 * Open the WAL segment containing WAL location 'RecPtr'.
 *
 * The segment can be fetched via restore_command, or via walreceiver having
 * streamed the record, or it can already be present in pg_wal. Checking
 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
 * too, in case someone copies a new segment directly to pg_wal. That is not
 * documented or recommended, though.
 *
 * If 'randAccess' is true, curFileTLI is reset to 0 before the archive or
 * pg_wal is searched, so the search is not constrained by the timeline of a
 * previously opened file.
 *
 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
 * prepare to read WAL starting from RedoStartLSN after this.
 *
 * 'RecPtr' might not point to the beginning of the record we're interested
 * in, it might also point to the page or segment header. In that case,
 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
 * used to decide which timeline to stream the requested WAL from.
 *
 * 'replayTLI' and 'replayLSN' describe the current replay position. They are
 * passed to rescanLatestTimeLine() so that if we scan for new timelines, we
 * can reject a switch to a timeline that branched off before this point.
 *
 * If the record is not immediately available, the function returns XLREAD_FAIL
 * if we're not in standby mode. In standby mode, the function waits for it to
 * become available.
 *
 * When the requested record becomes available, the function opens the file
 * containing it (if not open already), and returns XLREAD_SUCCESS. When end
 * of standby mode is triggered by the user, and there is no more WAL
 * available, returns XLREAD_FAIL.
 *
 * If nonblocking is true, then give up immediately if we can't satisfy the
 * request, returning XLREAD_WOULDBLOCK instead of waiting.
 */
static XLogPageReadResult
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
							bool fetching_ckpt, XLogRecPtr tliRecPtr,
							TimeLineID replayTLI, XLogRecPtr replayLSN,
							bool nonblocking)
{
	/*
	 * Time of the last pass through the "all sources exhausted" path below.
	 * It is static so that the retry-interval throttling persists across
	 * calls to this function.
	 */
	static TimestampTz last_fail_time = 0;
	TimestampTz now;

	/* Ensures WalRcvForceReply() is issued at most once per call. */
	bool		streaming_reply_sent = false;

	/*-------
	 * Standby mode is implemented by a state machine:
	 *
	 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
	 *    pg_wal (XLOG_FROM_PG_WAL)
	 * 2. Check for promotion trigger request
	 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
	 * 4. Rescan timelines
	 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
	 *
	 * Failure to read from the current source advances the state machine to
	 * the next state.
	 *
	 * 'currentSource' indicates the current state. There are no currentSource
	 * values for "check trigger", "rescan timelines", and "sleep" states,
	 * those actions are taken when reading from the previous source fails, as
	 * part of advancing to the next state.
	 *
	 * If standby mode is turned off while reading WAL from stream, we move
	 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
	 * the files (which would be required at end of recovery, e.g., timeline
	 * history file) from archive or pg_wal. We don't need to kill WAL receiver
	 * here because it's already stopped when standby mode is turned off at
	 * the end of recovery.
	 *-------
	 */
	if (!InArchiveRecovery)
		currentSource = XLOG_FROM_PG_WAL;
	else if (currentSource == XLOG_FROM_ANY ||
			 (!StandbyMode && currentSource == XLOG_FROM_STREAM))
	{
		lastSourceFailed = false;
		currentSource = XLOG_FROM_ARCHIVE;
	}

	for (;;)
	{
		/* Remembered only so that a source switch can be logged below. */
		XLogSource	oldSource = currentSource;

		/* Set when this iteration decides a walreceiver should be launched. */
		bool		startWalReceiver = false;

		/*
		 * First check if we failed to read from the current source, and
		 * advance the state machine if so. The failure to read might've
		 * happened outside this function, e.g when a CRC check fails on a
		 * record, or within this loop.
		 */
		if (lastSourceFailed)
		{
			/*
			 * Don't allow any retry loops to occur during nonblocking
			 * readahead. Let the caller process everything that has been
			 * decoded already first.
			 */
			if (nonblocking)
				return XLREAD_WOULDBLOCK;

			switch (currentSource)
			{
				case XLOG_FROM_ARCHIVE:
				case XLOG_FROM_PG_WAL:

					/*
					 * Check to see if promotion is requested. Note that we do
					 * this only after failure, so when you promote, we still
					 * finish replaying as much as we can from archive and
					 * pg_wal before failover.
					 */
					if (StandbyMode && CheckForStandbyTrigger())
					{
						XLogShutdownWalRcv();
						return XLREAD_FAIL;
					}

					/*
					 * Not in standby mode, and we've now tried the archive
					 * and pg_wal.
					 */
					if (!StandbyMode)
						return XLREAD_FAIL;

					/*
					 * Move to XLOG_FROM_STREAM state, and set to start a
					 * walreceiver if necessary.
					 */
					currentSource = XLOG_FROM_STREAM;
					startWalReceiver = true;
					break;

				case XLOG_FROM_STREAM:

					/*
					 * Failure while streaming. Most likely, we got here
					 * because streaming replication was terminated, or
					 * promotion was triggered. But we also get here if we
					 * find an invalid record in the WAL streamed from the
					 * primary, in which case something is seriously wrong.
					 * There's little chance that the problem will just go
					 * away, but PANIC is not good for availability either,
					 * especially in hot standby mode. So, we treat that the
					 * same as disconnection, and retry from archive/pg_wal
					 * again. The WAL in the archive should be identical to
					 * what was streamed, so it's unlikely that it helps, but
					 * one can hope...
					 */

					/*
					 * We should be able to move to XLOG_FROM_STREAM only in
					 * standby mode.
					 */
					Assert(StandbyMode);

					/*
					 * Before we leave XLOG_FROM_STREAM state, make sure that
					 * walreceiver is not active, so that it won't overwrite
					 * WAL that we restore from archive.
					 *
					 * If walreceiver is actively streaming (or attempting to
					 * connect), we must shut it down. However, if it's
					 * already in WAITING state (e.g., due to timeline
					 * divergence), we only need to reset the install flag to
					 * allow archive restoration.
					 */
					if (WalRcvStreaming())
						XLogShutdownWalRcv();
					else
					{
						/*
						 * WALRCV_STOPPING state is a transient state while
						 * the startup process is in ShutdownWalRcv(). It
						 * should never appear here since we would be waiting
						 * for the walreceiver to reach WALRCV_STOPPED in that
						 * case.
						 */
						Assert(WalRcvGetState() != WALRCV_STOPPING);
						ResetInstallXLogFileSegmentActive();
					}

					/*
					 * Before we sleep, re-scan for possible new timelines if
					 * we were requested to recover to the latest timeline.
					 * If a new target timeline was adopted, go straight back
					 * to the archive without sleeping.
					 */
					if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
					{
						if (rescanLatestTimeLine(replayTLI, replayLSN))
						{
							currentSource = XLOG_FROM_ARCHIVE;
							break;
						}
					}

					/*
					 * XLOG_FROM_STREAM is the last state in our state
					 * machine, so we've exhausted all the options for
					 * obtaining the requested WAL. We're going to loop back
					 * and retry from the archive, but if it hasn't been long
					 * since last attempt, sleep wal_retrieve_retry_interval
					 * milliseconds to avoid busy-waiting.
					 */
					now = GetCurrentTimestamp();
					if (!TimestampDifferenceExceeds(last_fail_time, now,
													wal_retrieve_retry_interval))
					{
						long		wait_time;

						/* Sleep only for the remainder of the interval. */
						wait_time = wal_retrieve_retry_interval -
							TimestampDifferenceMilliseconds(last_fail_time, now);

						elog(LOG, "waiting for WAL to become available at %X/%08X",
							 LSN_FORMAT_ARGS(RecPtr));

						/* Do background tasks that might benefit us later. */
						KnownAssignedTransactionIdsIdleMaintenance();

						(void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
										 WL_LATCH_SET | WL_TIMEOUT |
										 WL_EXIT_ON_PM_DEATH,
										 wait_time,
										 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
						ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);

						/* Refresh 'now' so last_fail_time reflects the wakeup. */
						now = GetCurrentTimestamp();

						/* Handle interrupt signals of startup process */
						ProcessStartupProcInterrupts();
					}
					last_fail_time = now;
					currentSource = XLOG_FROM_ARCHIVE;
					break;

				default:
					elog(ERROR, "unexpected WAL source %d", currentSource);
			}
		}
		else if (currentSource == XLOG_FROM_PG_WAL)
		{
			/*
			 * We just successfully read a file in pg_wal. We prefer files in
			 * the archive over ones in pg_wal, so try the next file again
			 * from the archive first.
			 */
			if (InArchiveRecovery)
				currentSource = XLOG_FROM_ARCHIVE;
		}

		if (currentSource != oldSource)
			elog(DEBUG2, "switched WAL source from %s to %s after %s",
				 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
				 lastSourceFailed ? "failure" : "success");

		/*
		 * We've now handled possible failure. Try to read from the chosen
		 * source.
		 */
		lastSourceFailed = false;

		switch (currentSource)
		{
			case XLOG_FROM_ARCHIVE:
			case XLOG_FROM_PG_WAL:

				/*
				 * WAL receiver must not be running when reading WAL from
				 * archive or pg_wal.
				 */
				Assert(!WalRcvStreaming());

				/* Close any old file we might have open. */
				if (readFile >= 0)
				{
					close(readFile);
					readFile = -1;
				}
				/* Reset curFileTLI if random fetch. */
				if (randAccess)
					curFileTLI = 0;

				/*
				 * Try to restore the file from archive, or read an existing
				 * file from pg_wal. In the XLOG_FROM_ARCHIVE state we pass
				 * XLOG_FROM_ANY, so that both the archive and pg_wal are
				 * searched.
				 */
				readFile = XLogFileReadAnyTLI(readSegNo,
											  currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
											  currentSource);
				if (readFile >= 0)
					return XLREAD_SUCCESS;	/* success! */

				/*
				 * Nope, not found in archive or pg_wal.
				 */
				lastSourceFailed = true;
				break;

			case XLOG_FROM_STREAM:
				{
					bool		havedata;

					/*
					 * We should be able to move to XLOG_FROM_STREAM only in
					 * standby mode.
					 */
					Assert(StandbyMode);

					/*
					 * First, shutdown walreceiver if its restart has been
					 * requested -- but no point if we're already slated for
					 * starting it.
					 */
					if (pendingWalRcvRestart && !startWalReceiver)
					{
						XLogShutdownWalRcv();

						/*
						 * Re-scan for possible new timelines if we were
						 * requested to recover to the latest timeline.
						 */
						if (recoveryTargetTimeLineGoal ==
							RECOVERY_TARGET_TIMELINE_LATEST)
							rescanLatestTimeLine(replayTLI, replayLSN);

						startWalReceiver = true;
					}
					pendingWalRcvRestart = false;

					/*
					 * Launch walreceiver if needed.
					 *
					 * If fetching_ckpt is true, RecPtr points to the initial
					 * checkpoint location. In that case, we use RedoStartLSN
					 * as the streaming start position instead of RecPtr, so
					 * that when we later jump backwards to start redo at
					 * RedoStartLSN, we will have the logs streamed already.
					 *
					 * Nothing is launched when PrimaryConnInfo is unset or
					 * empty; the WalRcvStreaming() check below then reports
					 * this source as failed.
					 */
					if (startWalReceiver &&
						PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
					{
						XLogRecPtr	ptr;
						TimeLineID	tli;

						if (fetching_ckpt)
						{
							ptr = RedoStartLSN;
							tli = RedoStartTLI;
						}
						else
						{
							ptr = RecPtr;

							/*
							 * Use the record begin position to determine the
							 * TLI, rather than the position we're reading.
							 */
							tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);

							if (curFileTLI > 0 && tli < curFileTLI)
								elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
									 LSN_FORMAT_ARGS(tliRecPtr),
									 tli, curFileTLI);
						}
						curFileTLI = tli;
						SetInstallXLogFileSegmentActive();
						RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
											 PrimarySlotName,
											 wal_receiver_create_temp_slot);

						/* Forget the cached flush position of any earlier stream. */
						flushedUpto = InvalidXLogRecPtr;
					}

					/*
					 * Check if WAL receiver is active or wait to start up.
					 */
					if (!WalRcvStreaming())
					{
						lastSourceFailed = true;
						break;
					}

					/*
					 * Walreceiver is active, so see if new data has arrived.
					 *
					 * We only advance XLogReceiptTime when we obtain fresh
					 * WAL from walreceiver and observe that we had already
					 * processed everything before the most recent "chunk"
					 * that it flushed to disk. In steady state where we are
					 * keeping up with the incoming data, XLogReceiptTime will
					 * be updated on each cycle. When we are behind,
					 * XLogReceiptTime will not advance, so the grace time
					 * allotted to conflicting queries will decrease.
					 */
					if (RecPtr < flushedUpto)
						havedata = true;
					else
					{
						XLogRecPtr	latestChunkStart;

						/* Cached value is not far enough; ask walreceiver. */
						flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
						if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
						{
							havedata = true;
							if (latestChunkStart <= RecPtr)
							{
								XLogReceiptTime = GetCurrentTimestamp();
								SetCurrentChunkStartTime(XLogReceiptTime);
							}
						}
						else
							havedata = false;
					}
					if (havedata)
					{
						/*
						 * Great, streamed far enough. Open the file if it's
						 * not open already. Also read the timeline history
						 * file if we haven't initialized timeline history
						 * yet; it should be streamed over and present in
						 * pg_wal by now. Use XLOG_FROM_STREAM so that source
						 * info is set correctly and XLogReceiptTime isn't
						 * changed.
						 *
						 * NB: We must set readTimeLineHistory based on
						 * recoveryTargetTLI, not receiveTLI. Normally they'll
						 * be the same, but if recovery_target_timeline is
						 * 'latest' and archiving is configured, then it's
						 * possible that we managed to retrieve one or more
						 * new timeline history files from the archive,
						 * updating recoveryTargetTLI.
						 */
						if (readFile < 0)
						{
							if (!expectedTLEs)
								expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
							readFile = XLogFileRead(readSegNo, receiveTLI,
													XLOG_FROM_STREAM, false);
							Assert(readFile >= 0);
						}
						else
						{
							/* just make sure source info is correct... */
							readSource = XLOG_FROM_STREAM;
							XLogReceiptSource = XLOG_FROM_STREAM;
							return XLREAD_SUCCESS;
						}

						/*
						 * The file was just opened; we do not return here
						 * but go around the loop again. NOTE(review):
						 * presumably the next iteration finds the file open
						 * and returns XLREAD_SUCCESS via the branch above.
						 */
						break;
					}

					/* In nonblocking mode, return rather than sleeping. */
					if (nonblocking)
						return XLREAD_WOULDBLOCK;

					/*
					 * Data not here yet. Check for trigger, then wait for
					 * walreceiver to wake us up when new WAL arrives.
					 */
					if (CheckForStandbyTrigger())
					{
						/*
						 * Note that we don't return XLREAD_FAIL immediately
						 * here. After being triggered, we still want to
						 * replay all the WAL that was already streamed. It's
						 * in pg_wal now, so we just treat this as a failure,
						 * and the state machine will move on to replay the
						 * streamed WAL from pg_wal, and then recheck the
						 * trigger and exit replay.
						 */
						lastSourceFailed = true;
						break;
					}

					/*
					 * Since we have replayed everything we have received so
					 * far and are about to start waiting for more WAL, let's
					 * tell the upstream server our replay location now so
					 * that pg_stat_replication doesn't show stale
					 * information.
					 */
					if (!streaming_reply_sent)
					{
						WalRcvForceReply();
						streaming_reply_sent = true;
					}

					/* Do any background tasks that might benefit us later. */
					KnownAssignedTransactionIdsIdleMaintenance();

					/* Update pg_stat_recovery_prefetch before sleeping. */
					XLogPrefetcherComputeStats(xlogprefetcher);

					/*
					 * Wait for more WAL to arrive, when we will be woken
					 * immediately by the WAL receiver. No timeout is used
					 * here (-1).
					 */
					(void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
									 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
									 -1L,
									 WAIT_EVENT_RECOVERY_WAL_STREAM);
					ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
					break;
				}

			default:
				elog(ERROR, "unexpected WAL source %d", currentSource);
		}

		/*
		 * Check for recovery pause here so that we can confirm more quickly
		 * that a requested pause has actually taken effect.
		 */
		if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
			RECOVERY_NOT_PAUSED)
			recoveryPausesHere(false);

		/*
		 * This possibly-long loop needs to handle interrupts of startup
		 * process.
		 */
		ProcessStartupProcInterrupts();
	}

	return XLREAD_FAIL;			/* not reached */
}
4016 :
4017 :
4018 : /*
4019 : * Determine what log level should be used to report a corrupt WAL record
4020 : * in the current WAL page, previously read by XLogPageRead().
4021 : *
4022 : * 'emode' is the error mode that would be used to report a file-not-found
4023 : * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4024 : * we're retrying the exact same record that we've tried previously, only
4025 : * complain the first time to keep the noise down. However, we only do when
4026 : * reading from pg_wal, because we don't expect any invalid records in archive
4027 : * or in records streamed from the primary. Files in the archive should be complete,
4028 : * and we should never hit the end of WAL because we stop and wait for more WAL
4029 : * to arrive before replaying it.
4030 : *
4031 : * NOTE: This function remembers the RecPtr value it was last called with,
4032 : * to suppress repeated messages about the same record. Only call this when
4033 : * you are about to ereport(), or you might cause a later message to be
4034 : * erroneously suppressed.
4035 : */
4036 : static int
4037 276 : emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
4038 : {
4039 : static XLogRecPtr lastComplaint = InvalidXLogRecPtr;
4040 :
4041 276 : if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4042 : {
4043 272 : if (RecPtr == lastComplaint)
4044 71 : emode = DEBUG1;
4045 : else
4046 201 : lastComplaint = RecPtr;
4047 : }
4048 276 : return emode;
4049 : }
4050 :
4051 :
4052 : /*
4053 : * Subroutine to try to fetch and validate a prior checkpoint record.
4054 : */
4055 : static XLogRecord *
4056 1035 : ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
4057 : TimeLineID replayTLI)
4058 : {
4059 : XLogRecord *record;
4060 : uint8 info;
4061 :
4062 : Assert(xlogreader != NULL);
4063 :
4064 1035 : if (!XRecOffIsValid(RecPtr))
4065 : {
4066 0 : ereport(LOG,
4067 : (errmsg("invalid checkpoint location")));
4068 0 : return NULL;
4069 : }
4070 :
4071 1035 : XLogPrefetcherBeginRead(xlogprefetcher, RecPtr);
4072 1035 : record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4073 :
4074 1035 : if (record == NULL)
4075 : {
4076 1 : ereport(LOG,
4077 : (errmsg("invalid checkpoint record")));
4078 1 : return NULL;
4079 : }
4080 1034 : if (record->xl_rmid != RM_XLOG_ID)
4081 : {
4082 0 : ereport(LOG,
4083 : (errmsg("invalid resource manager ID in checkpoint record")));
4084 0 : return NULL;
4085 : }
4086 1034 : info = record->xl_info & ~XLR_INFO_MASK;
4087 1034 : if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4088 : info != XLOG_CHECKPOINT_ONLINE)
4089 : {
4090 0 : ereport(LOG,
4091 : (errmsg("invalid xl_info in checkpoint record")));
4092 0 : return NULL;
4093 : }
4094 1034 : if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
4095 : {
4096 0 : ereport(LOG,
4097 : (errmsg("invalid length of checkpoint record")));
4098 0 : return NULL;
4099 : }
4100 1034 : return record;
4101 : }
4102 :
4103 : /*
4104 : * Scan for new timelines that might have appeared in the archive since we
4105 : * started recovery.
4106 : *
4107 : * If there are any, the function changes recovery target TLI to the latest
4108 : * one and returns 'true'.
4109 : */
4110 : static bool
4111 187 : rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
4112 : {
4113 : List *newExpectedTLEs;
4114 : bool found;
4115 : ListCell *cell;
4116 : TimeLineID newtarget;
4117 187 : TimeLineID oldtarget = recoveryTargetTLI;
4118 187 : TimeLineHistoryEntry *currentTle = NULL;
4119 :
4120 187 : newtarget = findNewestTimeLine(recoveryTargetTLI);
4121 187 : if (newtarget == recoveryTargetTLI)
4122 : {
4123 : /* No new timelines found */
4124 180 : return false;
4125 : }
4126 :
4127 : /*
4128 : * Determine the list of expected TLIs for the new TLI
4129 : */
4130 :
4131 7 : newExpectedTLEs = readTimeLineHistory(newtarget);
4132 :
4133 : /*
4134 : * If the current timeline is not part of the history of the new timeline,
4135 : * we cannot proceed to it.
4136 : */
4137 7 : found = false;
4138 14 : foreach(cell, newExpectedTLEs)
4139 : {
4140 14 : currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4141 :
4142 14 : if (currentTle->tli == recoveryTargetTLI)
4143 : {
4144 7 : found = true;
4145 7 : break;
4146 : }
4147 : }
4148 7 : if (!found)
4149 : {
4150 0 : ereport(LOG,
4151 : (errmsg("new timeline %u is not a child of database system timeline %u",
4152 : newtarget,
4153 : replayTLI)));
4154 0 : return false;
4155 : }
4156 :
4157 : /*
4158 : * The current timeline was found in the history file, but check that the
4159 : * next timeline was forked off from it *after* the current recovery
4160 : * location.
4161 : */
4162 7 : if (currentTle->end < replayLSN)
4163 : {
4164 0 : ereport(LOG,
4165 : errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
4166 : newtarget,
4167 : replayTLI,
4168 : LSN_FORMAT_ARGS(replayLSN)));
4169 0 : return false;
4170 : }
4171 :
4172 : /* The new timeline history seems valid. Switch target */
4173 7 : recoveryTargetTLI = newtarget;
4174 7 : list_free_deep(expectedTLEs);
4175 7 : expectedTLEs = newExpectedTLEs;
4176 :
4177 : /*
4178 : * As in StartupXLOG(), try to ensure we have all the history files
4179 : * between the old target and new target in pg_wal.
4180 : */
4181 7 : restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4182 :
4183 7 : ereport(LOG,
4184 : (errmsg("new target timeline is %u",
4185 : recoveryTargetTLI)));
4186 :
4187 7 : return true;
4188 : }
4189 :
4190 :
4191 : /*
4192 : * Open a logfile segment for reading (during recovery).
4193 : *
4194 : * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4195 : * Otherwise, it's assumed to be already available in pg_wal.
4196 : */
4197 : static int
4198 3530 : XLogFileRead(XLogSegNo segno, TimeLineID tli,
4199 : XLogSource source, bool notfoundOk)
4200 : {
4201 : char xlogfname[MAXFNAMELEN];
4202 : char activitymsg[MAXFNAMELEN + 16];
4203 : char path[MAXPGPATH];
4204 : int fd;
4205 :
4206 3530 : XLogFileName(xlogfname, tli, segno, wal_segment_size);
4207 :
4208 3530 : switch (source)
4209 : {
4210 853 : case XLOG_FROM_ARCHIVE:
4211 : /* Report recovery progress in PS display */
4212 853 : snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4213 : xlogfname);
4214 853 : set_ps_display(activitymsg);
4215 :
4216 853 : if (!RestoreArchivedFile(path, xlogfname,
4217 : "RECOVERYXLOG",
4218 : wal_segment_size,
4219 : InRedo))
4220 490 : return -1;
4221 363 : break;
4222 :
4223 2677 : case XLOG_FROM_PG_WAL:
4224 : case XLOG_FROM_STREAM:
4225 2677 : XLogFilePath(path, tli, segno, wal_segment_size);
4226 2677 : break;
4227 :
4228 0 : default:
4229 0 : elog(ERROR, "invalid XLogFileRead source %d", source);
4230 : }
4231 :
4232 : /*
4233 : * If the segment was fetched from archival storage, replace the existing
4234 : * xlog segment (if any) with the archival version.
4235 : */
4236 3040 : if (source == XLOG_FROM_ARCHIVE)
4237 : {
4238 : Assert(!IsInstallXLogFileSegmentActive());
4239 363 : KeepFileRestoredFromArchive(path, xlogfname);
4240 :
4241 : /*
4242 : * Set path to point at the new file in pg_wal.
4243 : */
4244 363 : snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4245 : }
4246 :
4247 3040 : fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4248 3040 : if (fd >= 0)
4249 : {
4250 : /* Success! */
4251 2838 : curFileTLI = tli;
4252 :
4253 : /* Report recovery progress in PS display */
4254 2838 : snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4255 : xlogfname);
4256 2838 : set_ps_display(activitymsg);
4257 :
4258 : /* Track source of data in assorted state variables */
4259 2838 : readSource = source;
4260 2838 : XLogReceiptSource = source;
4261 : /* In FROM_STREAM case, caller tracks receipt time, not me */
4262 2838 : if (source != XLOG_FROM_STREAM)
4263 1679 : XLogReceiptTime = GetCurrentTimestamp();
4264 :
4265 2838 : return fd;
4266 : }
4267 202 : if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4268 0 : ereport(PANIC,
4269 : (errcode_for_file_access(),
4270 : errmsg("could not open file \"%s\": %m", path)));
4271 202 : return -1;
4272 : }
4273 :
4274 : /*
4275 : * Open a logfile segment for reading (during recovery).
4276 : *
4277 : * This version searches for the segment with any TLI listed in expectedTLEs.
4278 : */
4279 : static int
4280 1870 : XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
4281 : {
4282 : char path[MAXPGPATH];
4283 : ListCell *cell;
4284 : int fd;
4285 : List *tles;
4286 :
4287 : /*
4288 : * Loop looking for a suitable timeline ID: we might need to read any of
4289 : * the timelines listed in expectedTLEs.
4290 : *
4291 : * We expect curFileTLI on entry to be the TLI of the preceding file in
4292 : * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4293 : * to go backwards; this prevents us from picking up the wrong file when a
4294 : * parent timeline extends to higher segment numbers than the child we
4295 : * want to read.
4296 : *
4297 : * If we haven't read the timeline history file yet, read it now, so that
4298 : * we know which TLIs to scan. We don't save the list in expectedTLEs,
4299 : * however, unless we actually find a valid segment. That way if there is
4300 : * neither a timeline history file nor a WAL segment in the archive, and
4301 : * streaming replication is set up, we'll read the timeline history file
4302 : * streamed from the primary when we start streaming, instead of
4303 : * recovering with a dummy history generated here.
4304 : */
4305 1870 : if (expectedTLEs)
4306 835 : tles = expectedTLEs;
4307 : else
4308 1035 : tles = readTimeLineHistory(recoveryTargetTLI);
4309 :
4310 2078 : foreach(cell, tles)
4311 : {
4312 1893 : TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
4313 1893 : TimeLineID tli = hent->tli;
4314 :
4315 1893 : if (tli < curFileTLI)
4316 6 : break; /* don't bother looking at too-old TLIs */
4317 :
4318 : /*
4319 : * Skip scanning the timeline ID that the logfile segment to read
4320 : * doesn't belong to
4321 : */
4322 1887 : if (XLogRecPtrIsValid(hent->begin))
4323 : {
4324 76 : XLogSegNo beginseg = 0;
4325 :
4326 76 : XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4327 :
4328 : /*
4329 : * The logfile segment that doesn't belong to the timeline is
4330 : * older or newer than the segment that the timeline started or
4331 : * ended at, respectively. It's sufficient to check only the
4332 : * starting segment of the timeline here. Since the timelines are
4333 : * scanned in descending order in this loop, any segments newer
4334 : * than the ending segment should belong to newer timeline and
4335 : * have already been read before. So it's not necessary to check
4336 : * the ending segment of the timeline here.
4337 : */
4338 76 : if (segno < beginseg)
4339 6 : continue;
4340 : }
4341 :
4342 1881 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
4343 : {
4344 853 : fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4345 853 : if (fd != -1)
4346 : {
4347 363 : elog(DEBUG1, "got WAL segment from archive");
4348 363 : if (!expectedTLEs)
4349 18 : expectedTLEs = tles;
4350 1679 : return fd;
4351 : }
4352 : }
4353 :
4354 1518 : if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
4355 : {
4356 1518 : fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4357 1518 : if (fd != -1)
4358 : {
4359 1316 : if (!expectedTLEs)
4360 1016 : expectedTLEs = tles;
4361 1316 : return fd;
4362 : }
4363 : }
4364 : }
4365 :
4366 : /* Couldn't find it. For simplicity, complain about front timeline */
4367 191 : XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
4368 191 : errno = ENOENT;
4369 191 : ereport(DEBUG2,
4370 : (errcode_for_file_access(),
4371 : errmsg("could not open file \"%s\": %m", path)));
4372 191 : return -1;
4373 : }
4374 :
4375 : /*
4376 : * Set flag to signal the walreceiver to restart. (The startup process calls
4377 : * this on noticing a relevant configuration change.)
4378 : */
4379 : void
4380 11 : StartupRequestWalReceiverRestart(void)
4381 : {
4382 11 : if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
4383 : {
4384 7 : ereport(LOG,
4385 : (errmsg("WAL receiver process shutdown requested")));
4386 :
4387 7 : pendingWalRcvRestart = true;
4388 : }
4389 11 : }
4390 :
4391 :
4392 : /*
4393 : * Has a standby promotion already been triggered?
4394 : *
4395 : * Unlike CheckForStandbyTrigger(), this works in any process
4396 : * that's connected to shared memory.
4397 : */
4398 : bool
4399 72 : PromoteIsTriggered(void)
4400 : {
4401 : /*
4402 : * We check shared state each time only until a standby promotion is
4403 : * triggered. We can't trigger a promotion again, so there's no need to
4404 : * keep checking after the shared variable has once been seen true.
4405 : */
4406 72 : if (LocalPromoteIsTriggered)
4407 51 : return true;
4408 :
4409 21 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4410 21 : LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
4411 21 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4412 :
4413 21 : return LocalPromoteIsTriggered;
4414 : }
4415 :
4416 : static void
4417 48 : SetPromoteIsTriggered(void)
4418 : {
4419 48 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4420 48 : XLogRecoveryCtl->SharedPromoteIsTriggered = true;
4421 48 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4422 :
4423 : /*
4424 : * Mark the recovery pause state as 'not paused' because the paused state
4425 : * ends and promotion continues if a promotion is triggered while recovery
4426 : * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4427 : * return 'paused' while a promotion is ongoing.
4428 : */
4429 48 : SetRecoveryPause(false);
4430 :
4431 48 : LocalPromoteIsTriggered = true;
4432 48 : }
4433 :
4434 : /*
4435 : * Check whether a promote request has arrived.
4436 : */
4437 : static bool
4438 15889 : CheckForStandbyTrigger(void)
4439 : {
4440 15889 : if (LocalPromoteIsTriggered)
4441 55 : return true;
4442 :
4443 15834 : if (IsPromoteSignaled() && CheckPromoteSignal())
4444 : {
4445 48 : ereport(LOG, (errmsg("received promote request")));
4446 48 : RemovePromoteSignalFiles();
4447 48 : ResetPromoteSignaled();
4448 48 : SetPromoteIsTriggered();
4449 48 : return true;
4450 : }
4451 :
4452 15786 : return false;
4453 : }
4454 :
4455 : /*
4456 : * Remove the files signaling a standby promotion request.
4457 : */
4458 : void
4459 1000 : RemovePromoteSignalFiles(void)
4460 : {
4461 1000 : unlink(PROMOTE_SIGNAL_FILE);
4462 1000 : }
4463 :
4464 : /*
4465 : * Check to see if a promote request has arrived.
4466 : */
4467 : bool
4468 686 : CheckPromoteSignal(void)
4469 : {
4470 : struct stat stat_buf;
4471 :
4472 686 : if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4473 96 : return true;
4474 :
4475 590 : return false;
4476 : }
4477 :
4478 : /*
4479 : * Wake up startup process to replay newly arrived WAL, or to notice that
4480 : * failover has been requested.
4481 : */
4482 : void
4483 41439 : WakeupRecovery(void)
4484 : {
4485 41439 : SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4486 41439 : }
4487 :
4488 : /*
4489 : * Schedule a walreceiver wakeup in the main recovery loop.
4490 : */
4491 : void
4492 2 : XLogRequestWalReceiverReply(void)
4493 : {
4494 2 : doRequestWalReceiverReply = true;
4495 2 : }
4496 :
4497 : /*
4498 : * Is HotStandby active yet? This is only important in special backends
4499 : * since normal backends won't ever be able to connect until this returns
4500 : * true. Postmaster knows this by way of signal, not via shared memory.
4501 : *
4502 : * Unlike testing standbyState, this works in any process that's connected to
4503 : * shared memory. (And note that standbyState alone doesn't tell the truth
4504 : * anyway.)
4505 : */
4506 : bool
4507 171 : HotStandbyActive(void)
4508 : {
4509 : /*
4510 : * We check shared state each time only until Hot Standby is active. We
4511 : * can't de-activate Hot Standby, so there's no need to keep checking
4512 : * after the shared variable has once been seen true.
4513 : */
4514 171 : if (LocalHotStandbyActive)
4515 25 : return true;
4516 : else
4517 : {
4518 : /* spinlock is essential on machines with weak memory ordering! */
4519 146 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4520 146 : LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
4521 146 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4522 :
4523 146 : return LocalHotStandbyActive;
4524 : }
4525 : }
4526 :
4527 : /*
4528 : * Like HotStandbyActive(), but to be used only in WAL replay code,
4529 : * where we don't need to ask any other process what the state is.
4530 : */
4531 : static bool
4532 0 : HotStandbyActiveInReplay(void)
4533 : {
4534 : Assert(AmStartupProcess() || !IsPostmasterEnvironment);
4535 0 : return LocalHotStandbyActive;
4536 : }
4537 :
4538 : /*
4539 : * Get latest redo apply position.
4540 : *
4541 : * Exported to allow WALReceiver to read the pointer directly.
4542 : */
4543 : XLogRecPtr
4544 108348 : GetXLogReplayRecPtr(TimeLineID *replayTLI)
4545 : {
4546 : XLogRecPtr recptr;
4547 : TimeLineID tli;
4548 :
4549 108348 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4550 108348 : recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
4551 108348 : tli = XLogRecoveryCtl->lastReplayedTLI;
4552 108348 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4553 :
4554 108348 : if (replayTLI)
4555 3494 : *replayTLI = tli;
4556 108348 : return recptr;
4557 : }
4558 :
4559 :
4560 : /*
4561 : * Get position of last applied, or the record being applied.
4562 : *
4563 : * This is different from GetXLogReplayRecPtr() in that if a WAL
4564 : * record is currently being applied, this includes that record.
4565 : */
4566 : XLogRecPtr
4567 6454 : GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
4568 : {
4569 : XLogRecPtr recptr;
4570 : TimeLineID tli;
4571 :
4572 6454 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4573 6454 : recptr = XLogRecoveryCtl->replayEndRecPtr;
4574 6454 : tli = XLogRecoveryCtl->replayEndTLI;
4575 6454 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4576 :
4577 6454 : if (replayEndTLI)
4578 6454 : *replayEndTLI = tli;
4579 6454 : return recptr;
4580 : }
4581 :
4582 : /*
4583 : * Save timestamp of latest processed commit/abort record.
4584 : *
4585 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4586 : * seen by processes other than the startup process. Note in particular
4587 : * that CreateRestartPoint is executed in the checkpointer.
4588 : */
4589 : static void
4590 22994 : SetLatestXTime(TimestampTz xtime)
4591 : {
4592 22994 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4593 22994 : XLogRecoveryCtl->recoveryLastXTime = xtime;
4594 22994 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4595 22994 : }
4596 :
4597 : /*
4598 : * Fetch timestamp of latest processed commit/abort record.
4599 : */
4600 : TimestampTz
4601 360 : GetLatestXTime(void)
4602 : {
4603 : TimestampTz xtime;
4604 :
4605 360 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4606 360 : xtime = XLogRecoveryCtl->recoveryLastXTime;
4607 360 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4608 :
4609 360 : return xtime;
4610 : }
4611 :
4612 : /*
4613 : * Save timestamp of the next chunk of WAL records to apply.
4614 : *
4615 : * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4616 : * seen by all backends.
4617 : */
4618 : static void
4619 12606 : SetCurrentChunkStartTime(TimestampTz xtime)
4620 : {
4621 12606 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4622 12606 : XLogRecoveryCtl->currentChunkStartTime = xtime;
4623 12606 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4624 12606 : }
4625 :
4626 : /*
4627 : * Fetch timestamp of latest processed commit/abort record.
4628 : * Startup process maintains an accurate local copy in XLogReceiptTime
4629 : */
4630 : TimestampTz
4631 258 : GetCurrentChunkReplayStartTime(void)
4632 : {
4633 : TimestampTz xtime;
4634 :
4635 258 : SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4636 258 : xtime = XLogRecoveryCtl->currentChunkStartTime;
4637 258 : SpinLockRelease(&XLogRecoveryCtl->info_lck);
4638 :
4639 258 : return xtime;
4640 : }
4641 :
4642 : /*
4643 : * Returns time of receipt of current chunk of XLOG data, as well as
4644 : * whether it was received from streaming replication or from archives.
4645 : */
4646 : void
4647 27 : GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4648 : {
4649 : /*
4650 : * This must be executed in the startup process, since we don't export the
4651 : * relevant state to shared memory.
4652 : */
4653 : Assert(InRecovery);
4654 :
4655 27 : *rtime = XLogReceiptTime;
4656 27 : *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4657 27 : }
4658 :
4659 : /*
4660 : * Note that text field supplied is a parameter name and does not require
4661 : * translation
4662 : */
4663 : void
4664 680 : RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4665 : {
4666 680 : if (currValue < minValue)
4667 : {
4668 0 : if (HotStandbyActiveInReplay())
4669 : {
4670 0 : bool warned_for_promote = false;
4671 :
4672 0 : ereport(WARNING,
4673 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4674 : errmsg("hot standby is not possible because of insufficient parameter settings"),
4675 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4676 : param_name,
4677 : currValue,
4678 : minValue)));
4679 :
4680 0 : SetRecoveryPause(true);
4681 :
4682 0 : ereport(LOG,
4683 : (errmsg("recovery has paused"),
4684 : errdetail("If recovery is unpaused, the server will shut down."),
4685 : errhint("You can then restart the server after making the necessary configuration changes.")));
4686 :
4687 0 : while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
4688 : {
4689 0 : ProcessStartupProcInterrupts();
4690 :
4691 0 : if (CheckForStandbyTrigger())
4692 : {
4693 0 : if (!warned_for_promote)
4694 0 : ereport(WARNING,
4695 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4696 : errmsg("promotion is not possible because of insufficient parameter settings"),
4697 :
4698 : /*
4699 : * Repeat the detail from above so it's easy to find
4700 : * in the log.
4701 : */
4702 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4703 : param_name,
4704 : currValue,
4705 : minValue),
4706 : errhint("Restart the server after making the necessary configuration changes.")));
4707 0 : warned_for_promote = true;
4708 : }
4709 :
4710 : /*
4711 : * If recovery pause is requested then set it paused. While
4712 : * we are in the loop, user might resume and pause again so
4713 : * set this every time.
4714 : */
4715 0 : ConfirmRecoveryPaused();
4716 :
4717 : /*
4718 : * We wait on a condition variable that will wake us as soon
4719 : * as the pause ends, but we use a timeout so we can check the
4720 : * above conditions periodically too.
4721 : */
4722 0 : ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
4723 : WAIT_EVENT_RECOVERY_PAUSE);
4724 : }
4725 0 : ConditionVariableCancelSleep();
4726 : }
4727 :
4728 0 : ereport(FATAL,
4729 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4730 : errmsg("recovery aborted because of insufficient parameter settings"),
4731 : /* Repeat the detail from above so it's easy to find in the log. */
4732 : errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4733 : param_name,
4734 : currValue,
4735 : minValue),
4736 : errhint("You can restart the server after making the necessary configuration changes.")));
4737 : }
4738 680 : }
4739 :
4740 :
4741 : /*
4742 : * GUC check_hook for primary_slot_name
4743 : */
4744 : bool
4745 1426 : check_primary_slot_name(char **newval, void **extra, GucSource source)
4746 : {
4747 : int err_code;
4748 1426 : char *err_msg = NULL;
4749 1426 : char *err_hint = NULL;
4750 :
4751 1426 : if (*newval && strcmp(*newval, "") != 0 &&
4752 202 : !ReplicationSlotValidateNameInternal(*newval, false, &err_code,
4753 : &err_msg, &err_hint))
4754 : {
4755 0 : GUC_check_errcode(err_code);
4756 0 : GUC_check_errdetail("%s", err_msg);
4757 0 : if (err_hint != NULL)
4758 0 : GUC_check_errhint("%s", err_hint);
4759 0 : return false;
4760 : }
4761 :
4762 1426 : return true;
4763 : }
4764 :
4765 : /*
4766 : * Recovery target settings: Only one of the several recovery_target* settings
4767 : * may be set. Setting a second one results in an error. The global variable
4768 : * recoveryTarget tracks which kind of recovery target was chosen. Other
4769 : * variables store the actual target value (for example a string or a xid).
4770 : * The assign functions of the parameters check whether a competing parameter
4771 : * was already set. But we want to allow setting the same parameter multiple
4772 : * times. We also want to allow unsetting a parameter and setting a different
4773 : * one, so we unset recoveryTarget when the parameter is set to an empty
4774 : * string.
4775 : *
4776 : * XXX this code is broken by design. Throwing an error from a GUC assign
4777 : * hook breaks fundamental assumptions of guc.c. So long as all the variables
4778 : * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4779 : * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4780 : * that we have odd behaviors such as unexpected GUC ordering dependencies.
4781 : */
4782 :
4783 : pg_noreturn static void
4784 1 : error_multiple_recovery_targets(void)
4785 : {
4786 1 : ereport(ERROR,
4787 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4788 : errmsg("multiple recovery targets specified"),
4789 : errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4790 : }
4791 :
4792 : /*
4793 : * GUC check_hook for recovery_target
4794 : */
4795 : bool
4796 1224 : check_recovery_target(char **newval, void **extra, GucSource source)
4797 : {
4798 1224 : if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4799 : {
4800 0 : GUC_check_errdetail("The only allowed value is \"immediate\".");
4801 0 : return false;
4802 : }
4803 1224 : return true;
4804 : }
4805 :
4806 : /*
4807 : * GUC assign_hook for recovery_target
4808 : */
4809 : void
4810 1224 : assign_recovery_target(const char *newval, void *extra)
4811 : {
4812 1224 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4813 0 : recoveryTarget != RECOVERY_TARGET_IMMEDIATE)
4814 0 : error_multiple_recovery_targets();
4815 :
4816 1224 : if (newval && strcmp(newval, "") != 0)
4817 1 : recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
4818 : else
4819 1223 : recoveryTarget = RECOVERY_TARGET_UNSET;
4820 1224 : }
4821 :
4822 : /*
4823 : * GUC check_hook for recovery_target_lsn
4824 : */
4825 : bool
4826 1230 : check_recovery_target_lsn(char **newval, void **extra, GucSource source)
4827 : {
4828 1230 : if (strcmp(*newval, "") != 0)
4829 : {
4830 : XLogRecPtr lsn;
4831 : XLogRecPtr *myextra;
4832 8 : ErrorSaveContext escontext = {T_ErrorSaveContext};
4833 :
4834 8 : lsn = pg_lsn_in_safe(*newval, (Node *) &escontext);
4835 8 : if (escontext.error_occurred)
4836 0 : return false;
4837 :
4838 8 : myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
4839 8 : if (!myextra)
4840 0 : return false;
4841 8 : *myextra = lsn;
4842 8 : *extra = myextra;
4843 : }
4844 1230 : return true;
4845 : }
4846 :
4847 : /*
4848 : * GUC assign_hook for recovery_target_lsn
4849 : */
4850 : void
4851 1230 : assign_recovery_target_lsn(const char *newval, void *extra)
4852 : {
4853 1230 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4854 0 : recoveryTarget != RECOVERY_TARGET_LSN)
4855 0 : error_multiple_recovery_targets();
4856 :
4857 1230 : if (newval && strcmp(newval, "") != 0)
4858 : {
4859 8 : recoveryTarget = RECOVERY_TARGET_LSN;
4860 8 : recoveryTargetLSN = *((XLogRecPtr *) extra);
4861 : }
4862 : else
4863 1222 : recoveryTarget = RECOVERY_TARGET_UNSET;
4864 1230 : }
4865 :
4866 : /*
4867 : * GUC check_hook for recovery_target_name
4868 : */
4869 : bool
4870 1230 : check_recovery_target_name(char **newval, void **extra, GucSource source)
4871 : {
4872 : /* Use the value of newval directly */
4873 1230 : if (strlen(*newval) >= MAXFNAMELEN)
4874 : {
4875 0 : GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4876 : "recovery_target_name", MAXFNAMELEN - 1);
4877 0 : return false;
4878 : }
4879 1230 : return true;
4880 : }
4881 :
4882 : /*
4883 : * GUC assign_hook for recovery_target_name
4884 : */
4885 : void
4886 1230 : assign_recovery_target_name(const char *newval, void *extra)
4887 : {
4888 1230 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4889 0 : recoveryTarget != RECOVERY_TARGET_NAME)
4890 0 : error_multiple_recovery_targets();
4891 :
4892 1230 : if (newval && strcmp(newval, "") != 0)
4893 : {
4894 6 : recoveryTarget = RECOVERY_TARGET_NAME;
4895 6 : recoveryTargetName = newval;
4896 : }
4897 : else
4898 1224 : recoveryTarget = RECOVERY_TARGET_UNSET;
4899 1230 : }
4900 :
4901 : /*
4902 : * GUC check_hook for recovery_target_time
4903 : *
4904 : * The interpretation of the recovery_target_time string can depend on the
4905 : * time zone setting, so we need to wait until after all GUC processing is
4906 : * done before we can do the final parsing of the string. This check function
4907 : * only does a parsing pass to catch syntax errors, but we store the string
4908 : * and parse it again when we need to use it.
4909 : */
4910 : bool
4911 1226 : check_recovery_target_time(char **newval, void **extra, GucSource source)
4912 : {
4913 1226 : if (strcmp(*newval, "") != 0)
4914 : {
4915 : /* reject some special values */
4916 3 : if (strcmp(*newval, "now") == 0 ||
4917 3 : strcmp(*newval, "today") == 0 ||
4918 3 : strcmp(*newval, "tomorrow") == 0 ||
4919 3 : strcmp(*newval, "yesterday") == 0)
4920 : {
4921 0 : return false;
4922 : }
4923 :
4924 : /*
4925 : * parse timestamp value (see also timestamptz_in())
4926 : */
4927 : {
4928 3 : char *str = *newval;
4929 : fsec_t fsec;
4930 : struct pg_tm tt,
4931 3 : *tm = &tt;
4932 : int tz;
4933 : int dtype;
4934 : int nf;
4935 : int dterr;
4936 : char *field[MAXDATEFIELDS];
4937 : int ftype[MAXDATEFIELDS];
4938 : char workbuf[MAXDATELEN + MAXDATEFIELDS];
4939 : DateTimeErrorExtra dtextra;
4940 : TimestampTz timestamp;
4941 :
4942 3 : dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4943 : field, ftype, MAXDATEFIELDS, &nf);
4944 3 : if (dterr == 0)
4945 3 : dterr = DecodeDateTime(field, ftype, nf,
4946 : &dtype, tm, &fsec, &tz, &dtextra);
4947 3 : if (dterr != 0)
4948 0 : return false;
4949 3 : if (dtype != DTK_DATE)
4950 0 : return false;
4951 :
4952 3 : if (tm2timestamp(tm, fsec, &tz, ×tamp) != 0)
4953 : {
4954 0 : GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
4955 0 : return false;
4956 : }
4957 : }
4958 : }
4959 1226 : return true;
4960 : }
4961 :
4962 : /*
4963 : * GUC assign_hook for recovery_target_time
4964 : */
4965 : void
4966 1226 : assign_recovery_target_time(const char *newval, void *extra)
4967 : {
4968 1226 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4969 1 : recoveryTarget != RECOVERY_TARGET_TIME)
4970 1 : error_multiple_recovery_targets();
4971 :
4972 1225 : if (newval && strcmp(newval, "") != 0)
4973 2 : recoveryTarget = RECOVERY_TARGET_TIME;
4974 : else
4975 1223 : recoveryTarget = RECOVERY_TARGET_UNSET;
4976 1225 : }
4977 :
4978 : /*
4979 : * GUC check_hook for recovery_target_timeline
4980 : */
4981 : bool
4982 1227 : check_recovery_target_timeline(char **newval, void **extra, GucSource source)
4983 : {
4984 : RecoveryTargetTimeLineGoal rttg;
4985 : RecoveryTargetTimeLineGoal *myextra;
4986 :
4987 1227 : if (strcmp(*newval, "current") == 0)
4988 0 : rttg = RECOVERY_TARGET_TIMELINE_CONTROLFILE;
4989 1227 : else if (strcmp(*newval, "latest") == 0)
4990 1224 : rttg = RECOVERY_TARGET_TIMELINE_LATEST;
4991 : else
4992 : {
4993 : char *endp;
4994 : uint64 timeline;
4995 :
4996 3 : rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
4997 :
4998 3 : errno = 0;
4999 3 : timeline = strtou64(*newval, &endp, 0);
5000 :
5001 3 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
5002 : {
5003 1 : GUC_check_errdetail("\"%s\" is not a valid number.",
5004 : "recovery_target_timeline");
5005 3 : return false;
5006 : }
5007 :
5008 2 : if (timeline < 1 || timeline > PG_UINT32_MAX)
5009 : {
5010 2 : GUC_check_errdetail("\"%s\" must be between %u and %u.",
5011 : "recovery_target_timeline", 1, PG_UINT32_MAX);
5012 2 : return false;
5013 : }
5014 : }
5015 :
5016 1224 : myextra = (RecoveryTargetTimeLineGoal *) guc_malloc(LOG, sizeof(RecoveryTargetTimeLineGoal));
5017 1224 : if (!myextra)
5018 0 : return false;
5019 1224 : *myextra = rttg;
5020 1224 : *extra = myextra;
5021 :
5022 1224 : return true;
5023 : }
5024 :
5025 : /*
5026 : * GUC assign_hook for recovery_target_timeline
5027 : */
5028 : void
5029 1224 : assign_recovery_target_timeline(const char *newval, void *extra)
5030 : {
5031 1224 : recoveryTargetTimeLineGoal = *((RecoveryTargetTimeLineGoal *) extra);
5032 1224 : if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
5033 0 : recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5034 : else
5035 1224 : recoveryTargetTLIRequested = 0;
5036 1224 : }
5037 :
5038 : /*
5039 : * GUC check_hook for recovery_target_xid
5040 : */
5041 : bool
5042 1226 : check_recovery_target_xid(char **newval, void **extra, GucSource source)
5043 : {
5044 1226 : if (strcmp(*newval, "") != 0)
5045 : {
5046 : TransactionId xid;
5047 : TransactionId *myextra;
5048 : char *endp;
5049 : char *val;
5050 :
5051 3 : errno = 0;
5052 :
5053 : /*
5054 : * Consume leading whitespace to determine if number is negative
5055 : */
5056 3 : val = *newval;
5057 :
5058 3 : while (isspace((unsigned char) *val))
5059 0 : val++;
5060 :
5061 : /*
5062 : * This cast will remove the epoch, if any
5063 : */
5064 3 : xid = (TransactionId) strtou64(val, &endp, 0);
5065 :
5066 3 : if (*endp != '\0' || errno == EINVAL || errno == ERANGE || *val == '-')
5067 : {
5068 2 : GUC_check_errdetail("\"%s\" is not a valid number.",
5069 : "recovery_target_xid");
5070 2 : return false;
5071 : }
5072 :
5073 1 : if (xid < FirstNormalTransactionId)
5074 : {
5075 0 : GUC_check_errdetail("\"%s\" without epoch must be greater than or equal to %u.",
5076 : "recovery_target_xid",
5077 : FirstNormalTransactionId);
5078 0 : return false;
5079 : }
5080 :
5081 1 : myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId));
5082 1 : if (!myextra)
5083 0 : return false;
5084 1 : *myextra = xid;
5085 1 : *extra = myextra;
5086 : }
5087 1224 : return true;
5088 : }
5089 :
5090 : /*
5091 : * GUC assign_hook for recovery_target_xid
5092 : */
5093 : void
5094 1224 : assign_recovery_target_xid(const char *newval, void *extra)
5095 : {
5096 1224 : if (recoveryTarget != RECOVERY_TARGET_UNSET &&
5097 0 : recoveryTarget != RECOVERY_TARGET_XID)
5098 0 : error_multiple_recovery_targets();
5099 :
5100 1224 : if (newval && strcmp(newval, "") != 0)
5101 : {
5102 1 : recoveryTarget = RECOVERY_TARGET_XID;
5103 1 : recoveryTargetXid = *((TransactionId *) extra);
5104 : }
5105 : else
5106 1223 : recoveryTarget = RECOVERY_TARGET_UNSET;
5107 1224 : }
|