Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * walsummarizer.c
4 : *
5 : * Background process to perform WAL summarization, if it is enabled.
6 : * It continuously scans the write-ahead log and periodically emits a
7 : * summary file which indicates which blocks in which relation forks
8 : * were modified by WAL records in the LSN range covered by the summary
9 : * file. See walsummary.c and blkreftable.c for more details on the
10 : * naming and contents of WAL summary files.
11 : *
12 : * If configured to do so, this background process will also remove WAL
13 : * summary files when the file timestamp is older than a configurable
14 : * threshold (but only if the WAL has been removed first).
15 : *
16 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
17 : *
18 : * IDENTIFICATION
19 : * src/backend/postmaster/walsummarizer.c
20 : *
21 : *-------------------------------------------------------------------------
22 : */
23 : #include "postgres.h"
24 :
25 : #include "access/timeline.h"
26 : #include "access/xlog.h"
27 : #include "access/xlog_internal.h"
28 : #include "access/xlogrecovery.h"
29 : #include "access/xlogutils.h"
30 : #include "backup/walsummary.h"
31 : #include "catalog/storage_xlog.h"
32 : #include "commands/dbcommands_xlog.h"
33 : #include "common/blkreftable.h"
34 : #include "libpq/pqsignal.h"
35 : #include "miscadmin.h"
36 : #include "pgstat.h"
37 : #include "postmaster/auxprocess.h"
38 : #include "postmaster/interrupt.h"
39 : #include "postmaster/walsummarizer.h"
40 : #include "replication/walreceiver.h"
41 : #include "storage/aio_subsys.h"
42 : #include "storage/fd.h"
43 : #include "storage/ipc.h"
44 : #include "storage/latch.h"
45 : #include "storage/lwlock.h"
46 : #include "storage/proc.h"
47 : #include "storage/procsignal.h"
48 : #include "storage/shmem.h"
49 : #include "utils/guc.h"
50 : #include "utils/memutils.h"
51 : #include "utils/wait_event.h"
52 :
53 : /*
54 : * Data in shared memory related to WAL summarization.
55 : */
56 : typedef struct
57 : {
58 : /*
59 : * These fields are protected by WALSummarizerLock.
60 : *
61 : * Until we've discovered what summary files already exist on disk and
62 : * stored that information in shared memory, initialized is false and the
63 : * other fields here contain no meaningful information. After that has
64 : * been done, initialized is true.
65 : *
66 : * summarized_tli and summarized_lsn indicate the last LSN and TLI at
67 : * which the next summary file will start. Normally, these are the LSN and
68 : * TLI at which the last file ended; in such case, lsn_is_exact is true.
69 : * If, however, the LSN is just an approximation, then lsn_is_exact is
70 : * false. This can happen if, for example, there are no existing WAL
71 : * summary files at startup. In that case, we have to derive the position
72 : * at which to start summarizing from the WAL files that exist on disk,
73 : * and so the LSN might point to the start of the next file even though
74 : * that might happen to be in the middle of a WAL record.
75 : *
76 : * summarizer_pgprocno is the proc number of the summarizer process, if
77 : * one is running, or else INVALID_PROC_NUMBER.
78 : *
79 : * pending_lsn is used by the summarizer to advertise the ending LSN of a
80 : * record it has recently read. It shouldn't ever be less than
81 : * summarized_lsn, but might be greater, because the summarizer buffers
82 : * data for a range of LSNs in memory before writing out a new file.
83 : */
84 : bool initialized;
85 : TimeLineID summarized_tli;
86 : XLogRecPtr summarized_lsn;
87 : bool lsn_is_exact;
88 : ProcNumber summarizer_pgprocno;
89 : XLogRecPtr pending_lsn;
90 :
91 : /*
92 : * This field handles its own synchronization.
93 : */
94 : ConditionVariable summary_file_cv;
95 : } WalSummarizerData;
96 :
97 : /*
98 : * Private data for our xlogreader's page read callback.
99 : */
100 : typedef struct
101 : {
102 : TimeLineID tli;
103 : bool historic;
104 : XLogRecPtr read_upto;
105 : bool end_of_wal;
106 : } SummarizerReadLocalXLogPrivate;
107 :
108 : /* Pointer to shared memory state. */
109 : static WalSummarizerData *WalSummarizerCtl;
110 :
111 : /*
112 : * When we reach end of WAL and need to read more, we sleep for a number of
113 : * milliseconds that is an integer multiple of MS_PER_SLEEP_QUANTUM. This is
114 : * the multiplier. It should vary between 1 and MAX_SLEEP_QUANTA, depending
115 : * on system activity. See summarizer_wait_for_wal() for how we adjust this.
116 : */
117 : static long sleep_quanta = 1;
118 :
119 : /*
120 : * The sleep time will always be a multiple of 200ms and will not exceed
121 : * thirty seconds (150 * 200 = 30 * 1000). Note that the timeout here needs
122 : * to be substantially less than the maximum amount of time for which an
123 : * incremental backup will wait for this process to catch up. Otherwise, an
124 : * incremental backup might time out on an idle system just because we sleep
125 : * for too long.
126 : */
127 : #define MAX_SLEEP_QUANTA 150
128 : #define MS_PER_SLEEP_QUANTUM 200
129 :
130 : /*
131 : * This is a count of the number of pages of WAL that we've read since the
132 : * last time we waited for more WAL to appear.
133 : */
134 : static long pages_read_since_last_sleep = 0;
135 :
136 : /*
137 : * Most recent RedoRecPtr value observed by MaybeRemoveOldWalSummaries.
138 : */
139 : static XLogRecPtr redo_pointer_at_last_summary_removal = InvalidXLogRecPtr;
140 :
141 : /*
142 : * GUC parameters
143 : */
144 : bool summarize_wal = false;
145 : int wal_summary_keep_time = 10 * HOURS_PER_DAY * MINS_PER_HOUR;
146 :
147 : static void WalSummarizerShutdown(int code, Datum arg);
148 : static XLogRecPtr GetLatestLSN(TimeLineID *tli);
149 : static void ProcessWalSummarizerInterrupts(void);
150 : static XLogRecPtr SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn,
151 : bool exact, XLogRecPtr switch_lsn,
152 : XLogRecPtr maximum_lsn);
153 : static void SummarizeDbaseRecord(XLogReaderState *xlogreader,
154 : BlockRefTable *brtab);
155 : static void SummarizeSmgrRecord(XLogReaderState *xlogreader,
156 : BlockRefTable *brtab);
157 : static void SummarizeXactRecord(XLogReaderState *xlogreader,
158 : BlockRefTable *brtab);
159 : static bool SummarizeXlogRecord(XLogReaderState *xlogreader,
160 : bool *new_fast_forward);
161 : static int summarizer_read_local_xlog_page(XLogReaderState *state,
162 : XLogRecPtr targetPagePtr,
163 : int reqLen,
164 : XLogRecPtr targetRecPtr,
165 : char *cur_page);
166 : static void summarizer_wait_for_wal(void);
167 : static void MaybeRemoveOldWalSummaries(void);
168 :
169 : /*
170 : * Amount of shared memory required for this module.
171 : */
172 : Size
173 6534 : WalSummarizerShmemSize(void)
174 : {
175 6534 : return sizeof(WalSummarizerData);
176 : }
177 :
178 : /*
179 : * Create or attach to shared memory segment for this module.
180 : */
181 : void
182 2280 : WalSummarizerShmemInit(void)
183 : {
184 : bool found;
185 :
186 2280 : WalSummarizerCtl = (WalSummarizerData *)
187 2280 : ShmemInitStruct("Wal Summarizer Ctl", WalSummarizerShmemSize(),
188 : &found);
189 :
190 2280 : if (!found)
191 : {
192 : /*
193 : * First time through, so initialize.
194 : *
195 : * We're just filling in dummy values here -- the real initialization
196 : * will happen when GetOldestUnsummarizedLSN() is called for the first
197 : * time.
198 : */
199 2280 : WalSummarizerCtl->initialized = false;
200 2280 : WalSummarizerCtl->summarized_tli = 0;
201 2280 : WalSummarizerCtl->summarized_lsn = InvalidXLogRecPtr;
202 2280 : WalSummarizerCtl->lsn_is_exact = false;
203 2280 : WalSummarizerCtl->summarizer_pgprocno = INVALID_PROC_NUMBER;
204 2280 : WalSummarizerCtl->pending_lsn = InvalidXLogRecPtr;
205 2280 : ConditionVariableInit(&WalSummarizerCtl->summary_file_cv);
206 : }
207 2280 : }
208 :
209 : /*
210 : * Entry point for walsummarizer process.
211 : */
212 : void
213 6 : WalSummarizerMain(const void *startup_data, size_t startup_data_len)
214 : {
215 : sigjmp_buf local_sigjmp_buf;
216 : MemoryContext context;
217 :
218 : /*
219 : * Within this function, 'current_lsn' and 'current_tli' refer to the
220 : * point from which the next WAL summary file should start. 'exact' is
221 : * true if 'current_lsn' is known to be the start of a WAL record or WAL
222 : * segment, and false if it might be in the middle of a record someplace.
223 : *
224 : * 'switch_lsn' and 'switch_tli', if set, are the LSN at which we need to
225 : * switch to a new timeline and the timeline to which we need to switch.
226 : * If not set, we either haven't figured out the answers yet or we're
227 : * already on the latest timeline.
228 : */
229 : XLogRecPtr current_lsn;
230 : TimeLineID current_tli;
231 : bool exact;
232 6 : XLogRecPtr switch_lsn = InvalidXLogRecPtr;
233 6 : TimeLineID switch_tli = 0;
234 :
235 : Assert(startup_data_len == 0);
236 :
237 6 : AuxiliaryProcessMainCommon();
238 :
239 6 : ereport(DEBUG1,
240 : (errmsg_internal("WAL summarizer started")));
241 :
242 : /*
243 : * Properly accept or ignore signals the postmaster might send us
244 : *
245 : * We have no particular use for SIGINT at the moment, but it seems
246 : * reasonable to treat it like SIGTERM.
247 : */
248 6 : pqsignal(SIGHUP, SignalHandlerForConfigReload);
249 6 : pqsignal(SIGINT, SignalHandlerForShutdownRequest);
250 6 : pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
251 : /* SIGQUIT handler was already set up by InitPostmasterChild */
252 6 : pqsignal(SIGALRM, SIG_IGN);
253 6 : pqsignal(SIGPIPE, SIG_IGN);
254 6 : pqsignal(SIGUSR1, procsignal_sigusr1_handler);
255 6 : pqsignal(SIGUSR2, SIG_IGN); /* not used */
256 :
257 : /* Advertise ourselves. */
258 6 : on_shmem_exit(WalSummarizerShutdown, (Datum) 0);
259 6 : LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
260 6 : WalSummarizerCtl->summarizer_pgprocno = MyProcNumber;
261 6 : LWLockRelease(WALSummarizerLock);
262 :
263 : /* Create and switch to a memory context that we can reset on error. */
264 6 : context = AllocSetContextCreate(TopMemoryContext,
265 : "Wal Summarizer",
266 : ALLOCSET_DEFAULT_SIZES);
267 6 : MemoryContextSwitchTo(context);
268 :
269 : /*
270 : * Reset some signals that are accepted by postmaster but not here
271 : */
272 6 : pqsignal(SIGCHLD, SIG_DFL);
273 :
274 : /*
275 : * If an exception is encountered, processing resumes here.
276 : */
277 6 : if (sigsetjmp(local_sigjmp_buf, 1) != 0)
278 : {
279 : /* Since not using PG_TRY, must reset error stack by hand */
280 0 : error_context_stack = NULL;
281 :
282 : /* Prevent interrupts while cleaning up */
283 0 : HOLD_INTERRUPTS();
284 :
285 : /* Report the error to the server log */
286 0 : EmitErrorReport();
287 :
288 : /* Release resources we might have acquired. */
289 0 : LWLockReleaseAll();
290 0 : ConditionVariableCancelSleep();
291 0 : pgstat_report_wait_end();
292 0 : pgaio_error_cleanup();
293 0 : ReleaseAuxProcessResources(false);
294 0 : AtEOXact_Files(false);
295 0 : AtEOXact_HashTables(false);
296 :
297 : /*
298 : * Now return to normal top-level context and clear ErrorContext for
299 : * next time.
300 : */
301 0 : MemoryContextSwitchTo(context);
302 0 : FlushErrorState();
303 :
304 : /* Flush any leaked data in the top-level context */
305 0 : MemoryContextReset(context);
306 :
307 : /* Now we can allow interrupts again */
308 0 : RESUME_INTERRUPTS();
309 :
310 : /*
311 : * Sleep for 10 seconds before attempting to resume operations in
312 : * order to avoid excessive logging.
313 : *
314 : * Many of the likely error conditions are things that will repeat
315 : * every time. For example, if the WAL can't be read or the summary
316 : * can't be written, only administrator action will cure the problem.
317 : * So a really fast retry time doesn't seem to be especially
318 : * beneficial, and it will clutter the logs.
319 : */
320 0 : (void) WaitLatch(NULL,
321 : WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
322 : 10000,
323 : WAIT_EVENT_WAL_SUMMARIZER_ERROR);
324 : }
325 :
326 : /* We can now handle ereport(ERROR) */
327 6 : PG_exception_stack = &local_sigjmp_buf;
328 :
329 : /*
330 : * Unblock signals (they were blocked when the postmaster forked us)
331 : */
332 6 : sigprocmask(SIG_SETMASK, &UnBlockSig, NULL);
333 :
334 : /*
335 : * Fetch information about previous progress from shared memory, and ask
336 : * GetOldestUnsummarizedLSN to reset pending_lsn to summarized_lsn. We
337 : * might be recovering from an error, and if so, pending_lsn might have
338 : * advanced past summarized_lsn, but any WAL we read previously has been
339 : * lost and will need to be reread.
340 : *
341 : * If we discover that WAL summarization is not enabled, just exit.
342 : */
343 6 : current_lsn = GetOldestUnsummarizedLSN(&current_tli, &exact);
344 6 : if (!XLogRecPtrIsValid(current_lsn))
345 0 : proc_exit(0);
346 :
347 : /*
348 : * Loop forever
349 : */
350 : for (;;)
351 36 : {
352 : XLogRecPtr latest_lsn;
353 : TimeLineID latest_tli;
354 : XLogRecPtr end_of_summary_lsn;
355 :
356 : /* Flush any leaked data in the top-level context */
357 42 : MemoryContextReset(context);
358 :
359 : /* Process any signals received recently. */
360 42 : ProcessWalSummarizerInterrupts();
361 :
362 : /* If it's time to remove any old WAL summaries, do that now. */
363 42 : MaybeRemoveOldWalSummaries();
364 :
365 : /* Find the LSN and TLI up to which we can safely summarize. */
366 42 : latest_lsn = GetLatestLSN(&latest_tli);
367 :
368 : /*
369 : * If we're summarizing a historic timeline and we haven't yet
370 : * computed the point at which to switch to the next timeline, do that
371 : * now.
372 : *
373 : * Note that if this is a standby, what was previously the current
374 : * timeline could become historic at any time.
375 : *
376 : * We could try to make this more efficient by caching the results of
377 : * readTimeLineHistory when latest_tli has not changed, but since we
378 : * only have to do this once per timeline switch, we probably wouldn't
379 : * save any significant amount of work in practice.
380 : */
381 42 : if (current_tli != latest_tli && !XLogRecPtrIsValid(switch_lsn))
382 : {
383 0 : List *tles = readTimeLineHistory(latest_tli);
384 :
385 0 : switch_lsn = tliSwitchPoint(current_tli, tles, &switch_tli);
386 0 : ereport(DEBUG1,
387 : errmsg_internal("switch point from TLI %u to TLI %u is at %X/%08X",
388 : current_tli, switch_tli, LSN_FORMAT_ARGS(switch_lsn)));
389 : }
390 :
391 : /*
392 : * If we've reached the switch LSN, we can't summarize anything else
393 : * on this timeline. Switch to the next timeline and go around again,
394 : * backing up to the exact switch point if we passed it.
395 : */
396 42 : if (XLogRecPtrIsValid(switch_lsn) && current_lsn >= switch_lsn)
397 : {
398 : /* Restart summarization from switch point. */
399 0 : current_tli = switch_tli;
400 0 : current_lsn = switch_lsn;
401 :
402 : /* Next timeline and switch point, if any, not yet known. */
403 0 : switch_lsn = InvalidXLogRecPtr;
404 0 : switch_tli = 0;
405 :
406 : /* Update (really, rewind, if needed) state in shared memory. */
407 0 : LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
408 0 : WalSummarizerCtl->summarized_lsn = current_lsn;
409 0 : WalSummarizerCtl->summarized_tli = current_tli;
410 0 : WalSummarizerCtl->lsn_is_exact = true;
411 0 : WalSummarizerCtl->pending_lsn = current_lsn;
412 0 : LWLockRelease(WALSummarizerLock);
413 :
414 0 : continue;
415 : }
416 :
417 : /* Summarize WAL. */
418 42 : end_of_summary_lsn = SummarizeWAL(current_tli,
419 : current_lsn, exact,
420 : switch_lsn, latest_lsn);
421 : Assert(XLogRecPtrIsValid(end_of_summary_lsn));
422 : Assert(end_of_summary_lsn >= current_lsn);
423 :
424 : /*
425 : * Update state for next loop iteration.
426 : *
427 : * Next summary file should start from exactly where this one ended.
428 : */
429 36 : current_lsn = end_of_summary_lsn;
430 36 : exact = true;
431 :
432 : /* Update state in shared memory. */
433 36 : LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
434 36 : WalSummarizerCtl->summarized_lsn = end_of_summary_lsn;
435 36 : WalSummarizerCtl->summarized_tli = current_tli;
436 36 : WalSummarizerCtl->lsn_is_exact = true;
437 36 : WalSummarizerCtl->pending_lsn = end_of_summary_lsn;
438 36 : LWLockRelease(WALSummarizerLock);
439 :
440 : /* Wake up anyone waiting for more summary files to be written. */
441 36 : ConditionVariableBroadcast(&WalSummarizerCtl->summary_file_cv);
442 : }
443 : }
444 :
445 : /*
446 : * Get information about the state of the WAL summarizer.
447 : */
448 : void
449 0 : GetWalSummarizerState(TimeLineID *summarized_tli, XLogRecPtr *summarized_lsn,
450 : XLogRecPtr *pending_lsn, int *summarizer_pid)
451 : {
452 0 : LWLockAcquire(WALSummarizerLock, LW_SHARED);
453 0 : if (!WalSummarizerCtl->initialized)
454 : {
455 : /*
456 : * If initialized is false, the rest of the structure contents are
457 : * undefined.
458 : */
459 0 : *summarized_tli = 0;
460 0 : *summarized_lsn = InvalidXLogRecPtr;
461 0 : *pending_lsn = InvalidXLogRecPtr;
462 0 : *summarizer_pid = -1;
463 : }
464 : else
465 : {
466 0 : int summarizer_pgprocno = WalSummarizerCtl->summarizer_pgprocno;
467 :
468 0 : *summarized_tli = WalSummarizerCtl->summarized_tli;
469 0 : *summarized_lsn = WalSummarizerCtl->summarized_lsn;
470 0 : if (summarizer_pgprocno == INVALID_PROC_NUMBER)
471 : {
472 : /*
473 : * If the summarizer has exited, the fact that it had processed
474 : * beyond summarized_lsn is irrelevant now.
475 : */
476 0 : *pending_lsn = WalSummarizerCtl->summarized_lsn;
477 0 : *summarizer_pid = -1;
478 : }
479 : else
480 : {
481 0 : *pending_lsn = WalSummarizerCtl->pending_lsn;
482 :
483 : /*
484 : * We're not fussed about inexact answers here, since they could
485 : * become stale instantly, so we don't bother taking the lock, but
486 : * make sure that invalid PID values are normalized to -1.
487 : */
488 0 : *summarizer_pid = GetPGProcByNumber(summarizer_pgprocno)->pid;
489 0 : if (*summarizer_pid <= 0)
490 0 : *summarizer_pid = -1;
491 : }
492 : }
493 0 : LWLockRelease(WALSummarizerLock);
494 0 : }
495 :
496 : /*
497 : * Get the oldest LSN in this server's timeline history that has not yet been
498 : * summarized, and update shared memory state as appropriate.
499 : *
500 : * If tli != NULL, *tli will be set to the TLI for the LSN that is returned.
501 : *
502 : * If lsn_is_exact != NULL, *lsn_is_exact will be set to true if the returned
503 : * LSN is necessarily the start of a WAL record and false if it's just the
504 : * beginning of a WAL segment.
505 : */
506 : XLogRecPtr
507 4766 : GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact)
508 : {
509 : TimeLineID latest_tli;
510 : int n;
511 : List *tles;
512 4766 : XLogRecPtr unsummarized_lsn = InvalidXLogRecPtr;
513 4766 : TimeLineID unsummarized_tli = 0;
514 4766 : bool should_make_exact = false;
515 : List *existing_summaries;
516 : ListCell *lc;
517 4766 : bool am_wal_summarizer = AmWalSummarizerProcess();
518 :
519 : /* If not summarizing WAL, do nothing. */
520 4766 : if (!summarize_wal)
521 4746 : return InvalidXLogRecPtr;
522 :
523 : /*
524 : * If we are not the WAL summarizer process, then we normally just want to
525 : * read the values from shared memory. However, as an exception, if shared
526 : * memory hasn't been initialized yet, then we need to do that so that we
527 : * can read legal values and not remove any WAL too early.
528 : */
529 20 : if (!am_wal_summarizer)
530 : {
531 14 : LWLockAcquire(WALSummarizerLock, LW_SHARED);
532 :
533 14 : if (WalSummarizerCtl->initialized)
534 : {
535 12 : unsummarized_lsn = WalSummarizerCtl->summarized_lsn;
536 12 : if (tli != NULL)
537 0 : *tli = WalSummarizerCtl->summarized_tli;
538 12 : if (lsn_is_exact != NULL)
539 0 : *lsn_is_exact = WalSummarizerCtl->lsn_is_exact;
540 12 : LWLockRelease(WALSummarizerLock);
541 12 : return unsummarized_lsn;
542 : }
543 :
544 2 : LWLockRelease(WALSummarizerLock);
545 : }
546 :
547 : /*
548 : * Find the oldest timeline on which WAL still exists, and the earliest
549 : * segment for which it exists.
550 : *
551 : * Note that we do this every time the WAL summarizer process restarts or
552 : * recovers from an error, in case the contents of pg_wal have changed
553 : * under us e.g. if some files were removed, either manually - which
554 : * shouldn't really happen, but might - or by postgres itself, if
555 : * summarize_wal was turned off and then back on again.
556 : */
557 8 : (void) GetLatestLSN(&latest_tli);
558 8 : tles = readTimeLineHistory(latest_tli);
559 8 : for (n = list_length(tles) - 1; n >= 0; --n)
560 : {
561 8 : TimeLineHistoryEntry *tle = list_nth(tles, n);
562 : XLogSegNo oldest_segno;
563 :
564 8 : oldest_segno = XLogGetOldestSegno(tle->tli);
565 8 : if (oldest_segno != 0)
566 : {
567 : /* Compute oldest LSN that still exists on disk. */
568 8 : XLogSegNoOffsetToRecPtr(oldest_segno, 0, wal_segment_size,
569 : unsummarized_lsn);
570 :
571 8 : unsummarized_tli = tle->tli;
572 8 : break;
573 : }
574 : }
575 :
576 : /*
577 : * Don't try to summarize anything older than the end LSN of the newest
578 : * summary file that exists for this timeline.
579 : */
580 : existing_summaries =
581 8 : GetWalSummaries(unsummarized_tli,
582 : InvalidXLogRecPtr, InvalidXLogRecPtr);
583 8 : foreach(lc, existing_summaries)
584 : {
585 0 : WalSummaryFile *ws = lfirst(lc);
586 :
587 0 : if (ws->end_lsn > unsummarized_lsn)
588 : {
589 0 : unsummarized_lsn = ws->end_lsn;
590 0 : should_make_exact = true;
591 : }
592 : }
593 :
594 : /* It really should not be possible for us to find no WAL. */
595 8 : if (unsummarized_tli == 0)
596 0 : ereport(ERROR,
597 : errcode(ERRCODE_INTERNAL_ERROR),
598 : errmsg_internal("no WAL found on timeline %u", latest_tli));
599 :
600 : /*
601 : * If we're the WAL summarizer, we always want to store the values we just
602 : * computed into shared memory, because those are the values we're going
603 : * to use to drive our operation, and so they are the authoritative
604 : * values. Otherwise, we only store values into shared memory if shared
605 : * memory is uninitialized. Our values are not canonical in such a case,
606 : * but it's better to have something than nothing, to guide WAL retention.
607 : */
608 8 : LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
609 8 : if (am_wal_summarizer || !WalSummarizerCtl->initialized)
610 : {
611 8 : WalSummarizerCtl->initialized = true;
612 8 : WalSummarizerCtl->summarized_lsn = unsummarized_lsn;
613 8 : WalSummarizerCtl->summarized_tli = unsummarized_tli;
614 8 : WalSummarizerCtl->lsn_is_exact = should_make_exact;
615 8 : WalSummarizerCtl->pending_lsn = unsummarized_lsn;
616 : }
617 : else
618 0 : unsummarized_lsn = WalSummarizerCtl->summarized_lsn;
619 :
620 : /* Also return the values to the caller as required. */
621 8 : if (tli != NULL)
622 6 : *tli = WalSummarizerCtl->summarized_tli;
623 8 : if (lsn_is_exact != NULL)
624 6 : *lsn_is_exact = WalSummarizerCtl->lsn_is_exact;
625 8 : LWLockRelease(WALSummarizerLock);
626 :
627 8 : return unsummarized_lsn;
628 : }
629 :
630 : /*
631 : * Wake up the WAL summarizer process.
632 : *
633 : * This might not work, because there's no guarantee that the WAL summarizer
634 : * process was successfully started, and it also might have started but
635 : * subsequently terminated. So, under normal circumstances, this will get the
636 : * latch set, but there's no guarantee.
637 : */
638 : void
639 3178 : WakeupWalSummarizer(void)
640 : {
641 : ProcNumber pgprocno;
642 :
643 3178 : if (WalSummarizerCtl == NULL)
644 0 : return;
645 :
646 3178 : LWLockAcquire(WALSummarizerLock, LW_SHARED);
647 3178 : pgprocno = WalSummarizerCtl->summarizer_pgprocno;
648 3178 : LWLockRelease(WALSummarizerLock);
649 :
650 3178 : if (pgprocno != INVALID_PROC_NUMBER)
651 6 : SetLatch(&GetPGProcByNumber(pgprocno)->procLatch);
652 : }
653 :
654 : /*
655 : * Wait until WAL summarization reaches the given LSN, but time out with an
656 : * error if the summarizer seems to be stuck.
657 : *
658 : * Returns immediately if summarize_wal is turned off while we wait. Caller
659 : * is expected to handle this case, if necessary.
660 : */
661 : void
662 24 : WaitForWalSummarization(XLogRecPtr lsn)
663 : {
664 : TimestampTz initial_time,
665 : cycle_time,
666 : current_time;
667 24 : XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr;
668 24 : int deadcycles = 0;
669 :
670 24 : initial_time = cycle_time = GetCurrentTimestamp();
671 :
672 : while (1)
673 8 : {
674 32 : long timeout_in_ms = 10000;
675 : XLogRecPtr summarized_lsn;
676 : XLogRecPtr pending_lsn;
677 :
678 32 : CHECK_FOR_INTERRUPTS();
679 :
680 : /* If WAL summarization is disabled while we're waiting, give up. */
681 32 : if (!summarize_wal)
682 0 : return;
683 :
684 : /*
685 : * If the LSN summarized on disk has reached the target value, stop.
686 : */
687 32 : LWLockAcquire(WALSummarizerLock, LW_SHARED);
688 32 : summarized_lsn = WalSummarizerCtl->summarized_lsn;
689 32 : pending_lsn = WalSummarizerCtl->pending_lsn;
690 32 : LWLockRelease(WALSummarizerLock);
691 :
692 : /* If WAL summarization has progressed sufficiently, stop waiting. */
693 32 : if (summarized_lsn >= lsn)
694 24 : break;
695 :
696 : /* Recheck current time. */
697 8 : current_time = GetCurrentTimestamp();
698 :
699 : /* Have we finished the current cycle of waiting? */
700 8 : if (TimestampDifferenceMilliseconds(cycle_time,
701 : current_time) >= timeout_in_ms)
702 : {
703 : long elapsed_seconds;
704 :
705 : /* Begin new wait cycle. */
706 0 : cycle_time = TimestampTzPlusMilliseconds(cycle_time,
707 : timeout_in_ms);
708 :
709 : /*
710 : * Keep track of the number of cycles during which there has been
711 : * no progression of pending_lsn. If pending_lsn is not advancing,
712 : * that means that not only are no new files appearing on disk,
713 : * but we're not even incorporating new records into the in-memory
714 : * state.
715 : */
716 0 : if (pending_lsn > prior_pending_lsn)
717 : {
718 0 : prior_pending_lsn = pending_lsn;
719 0 : deadcycles = 0;
720 : }
721 : else
722 0 : ++deadcycles;
723 :
724 : /*
725 : * If we've managed to wait for an entire minute without the WAL
726 : * summarizer absorbing a single WAL record, error out; probably
727 : * something is wrong.
728 : *
729 : * We could consider also erroring out if the summarizer is taking
730 : * too long to catch up, but it's not clear what rate of progress
731 : * would be acceptable and what would be too slow. So instead, we
732 : * just try to error out in the case where there's no progress at
733 : * all. That seems likely to catch a reasonable number of the
734 : * things that can go wrong in practice (e.g. the summarizer
735 : * process is completely hung, say because somebody hooked up a
736 : * debugger to it or something) without giving up too quickly when
737 : * the system is just slow.
738 : */
739 0 : if (deadcycles >= 6)
740 0 : ereport(ERROR,
741 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
742 : errmsg("WAL summarization is not progressing"),
743 : errdetail("Summarization is needed through %X/%08X, but is stuck at %X/%08X on disk and %X/%08X in memory.",
744 : LSN_FORMAT_ARGS(lsn),
745 : LSN_FORMAT_ARGS(summarized_lsn),
746 : LSN_FORMAT_ARGS(pending_lsn))));
747 :
748 :
749 : /*
750 : * Otherwise, just let the user know what's happening.
751 : */
752 0 : elapsed_seconds =
753 0 : TimestampDifferenceMilliseconds(initial_time,
754 : current_time) / 1000;
755 0 : ereport(WARNING,
756 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
757 : errmsg_plural("still waiting for WAL summarization through %X/%08X after %ld second",
758 : "still waiting for WAL summarization through %X/%08X after %ld seconds",
759 : elapsed_seconds,
760 : LSN_FORMAT_ARGS(lsn),
761 : elapsed_seconds),
762 : errdetail("Summarization has reached %X/%08X on disk and %X/%08X in memory.",
763 : LSN_FORMAT_ARGS(summarized_lsn),
764 : LSN_FORMAT_ARGS(pending_lsn))));
765 : }
766 :
767 : /*
768 : * Align the wait time to prevent drift. This doesn't really matter,
769 : * but we'd like the warnings about how long we've been waiting to say
770 : * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
771 : * drifting to something that is not a multiple of ten.
772 : */
773 8 : timeout_in_ms -=
774 8 : TimestampDifferenceMilliseconds(cycle_time, current_time);
775 :
776 : /* Wait and see. */
777 8 : ConditionVariableTimedSleep(&WalSummarizerCtl->summary_file_cv,
778 : timeout_in_ms,
779 : WAIT_EVENT_WAL_SUMMARY_READY);
780 : }
781 :
782 24 : ConditionVariableCancelSleep();
783 : }
784 :
785 : /*
786 : * On exit, update shared memory to make it clear that we're no longer
787 : * running.
788 : */
789 : static void
790 6 : WalSummarizerShutdown(int code, Datum arg)
791 : {
792 6 : LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
793 6 : WalSummarizerCtl->summarizer_pgprocno = INVALID_PROC_NUMBER;
794 6 : LWLockRelease(WALSummarizerLock);
795 6 : }
796 :
797 : /*
798 : * Get the latest LSN that is eligible to be summarized, and set *tli to the
799 : * corresponding timeline.
800 : */
801 : static XLogRecPtr
802 90 : GetLatestLSN(TimeLineID *tli)
803 : {
804 90 : if (!RecoveryInProgress())
805 : {
806 : /* Don't summarize WAL before it's flushed. */
807 88 : return GetFlushRecPtr(tli);
808 : }
809 : else
810 : {
811 : XLogRecPtr flush_lsn;
812 : TimeLineID flush_tli;
813 : XLogRecPtr replay_lsn;
814 : TimeLineID replay_tli;
815 : TimeLineID insert_tli;
816 :
817 : /*
818 : * After the insert TLI has been set and before the control file has
819 : * been updated to show the DB in production, RecoveryInProgress()
820 : * will return true, because it's not yet safe for all backends to
821 : * begin writing WAL. However, replay has already ceased, so from our
822 : * point of view, recovery is already over. We should summarize up to
823 : * where replay stopped and then prepare to resume at the start of the
824 : * insert timeline.
825 : */
826 2 : if ((insert_tli = GetWALInsertionTimeLineIfSet()) != 0)
827 : {
828 2 : *tli = insert_tli;
829 2 : return GetXLogReplayRecPtr(NULL);
830 : }
831 :
832 : /*
833 : * What we really want to know is how much WAL has been flushed to
834 : * disk, but the only flush position available is the one provided by
835 : * the walreceiver, which may not be running, because this could be
836 : * crash recovery or recovery via restore_command. So use either the
837 : * WAL receiver's flush position or the replay position, whichever is
838 : * further ahead, on the theory that if the WAL has been replayed then
839 : * it must also have been flushed to disk.
840 : */
841 0 : flush_lsn = GetWalRcvFlushRecPtr(NULL, &flush_tli);
842 0 : replay_lsn = GetXLogReplayRecPtr(&replay_tli);
843 0 : if (flush_lsn > replay_lsn)
844 : {
845 0 : *tli = flush_tli;
846 0 : return flush_lsn;
847 : }
848 : else
849 : {
850 0 : *tli = replay_tli;
851 0 : return replay_lsn;
852 : }
853 : }
854 : }
855 :
856 : /*
857 : * Interrupt handler for main loop of WAL summarizer process.
858 : */
859 : static void
860 103020 : ProcessWalSummarizerInterrupts(void)
861 : {
862 103020 : if (ProcSignalBarrierPending)
863 0 : ProcessProcSignalBarrier();
864 :
865 103020 : if (ConfigReloadPending)
866 : {
867 0 : ConfigReloadPending = false;
868 0 : ProcessConfigFile(PGC_SIGHUP);
869 : }
870 :
871 103020 : if (ShutdownRequestPending || !summarize_wal)
872 : {
873 6 : ereport(DEBUG1,
874 : errmsg_internal("WAL summarizer shutting down"));
875 6 : proc_exit(0);
876 : }
877 :
878 : /* Perform logging of memory contexts of this process */
879 103014 : if (LogMemoryContextPending)
880 0 : ProcessLogMemoryContextInterrupt();
881 103014 : }
882 :
883 : /*
884 : * Summarize a range of WAL records on a single timeline.
885 : *
886 : * 'tli' is the timeline to be summarized.
887 : *
888 : * 'start_lsn' is the point at which we should start summarizing. If this
889 : * value comes from the end LSN of the previous record as returned by the
890 : * xlogreader machinery, 'exact' should be true; otherwise, 'exact' should
891 : * be false, and this function will search forward for the start of a valid
892 : * WAL record.
893 : *
894 : * 'switch_lsn' is the point at which we should switch to a later timeline,
895 : * if we're summarizing a historic timeline.
896 : *
897 : * 'maximum_lsn' identifies the point beyond which we can't count on being
898 : * able to read any more WAL. It should be the switch point when reading a
899 : * historic timeline, or the most-recently-measured end of WAL when reading
900 : * the current timeline.
901 : *
902 : * The return value is the LSN at which the WAL summary actually ends. Most
903 : * often, a summary file ends because we notice that a checkpoint has
904 : * occurred and reach the redo pointer of that checkpoint, but sometimes
905 : * we stop for other reasons, such as a timeline switch.
 *
 * Side effects: each time a record has been processed,
 * WalSummarizerCtl->pending_lsn is set to the end of that record while
 * holding WALSummarizerLock. If the summarized range is non-empty and we
 * are not in fast-forward mode, the block reference table is written to a
 * temporary file under XLOGDIR/summaries and then durably renamed to a
 * name built from the TLI and the start and end LSNs. Interrupts are
 * serviced once per record, so this function may exit the process instead
 * of returning.
906 : */
907 : static XLogRecPtr
908 42 : SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
909 : XLogRecPtr switch_lsn, XLogRecPtr maximum_lsn)
910 : {
911 : SummarizerReadLocalXLogPrivate *private_data;
912 : XLogReaderState *xlogreader;
913 : XLogRecPtr summary_start_lsn;
/* Initial value only; reassigned on every path that leaves the main loop. */
914 42 : XLogRecPtr summary_end_lsn = switch_lsn;
915 : char temp_path[MAXPGPATH];
916 : char final_path[MAXPGPATH];
917 : WalSummaryIO io;
918 42 : BlockRefTable *brtab = CreateEmptyBlockRefTable();
/*
 * Stays true unless SummarizeXlogRecord() reports a stop record that begins
 * at or before summary_start_lsn; see the RM_XLOG_ID handling in the main loop.
 */
919 42 : bool fast_forward = true;
920 :
921 : /* Initialize private data for xlogreader. */
922 42 : private_data = palloc0_object(SummarizerReadLocalXLogPrivate);
923 42 : private_data->tli = tli;
924 42 : private_data->historic = XLogRecPtrIsValid(switch_lsn);
925 42 : private_data->read_upto = maximum_lsn;
926 :
927 : /* Create xlogreader. */
928 42 : xlogreader = XLogReaderAllocate(wal_segment_size, NULL,
929 42 : XL_ROUTINE(.page_read = &summarizer_read_local_xlog_page,
930 : .segment_open = &wal_segment_open,
931 : .segment_close = &wal_segment_close),
932 : private_data);
933 42 : if (xlogreader == NULL)
934 0 : ereport(ERROR,
935 : (errcode(ERRCODE_OUT_OF_MEMORY),
936 : errmsg("out of memory"),
937 : errdetail("Failed while allocating a WAL reading processor.")));
938 :
939 : /*
940 : * When exact = false, we're starting from an arbitrary point in the WAL
941 : * and must search forward for the start of the next record.
942 : *
943 : * When exact = true, start_lsn should be either the LSN where a record
944 : * begins, or the LSN of a page where the page header is immediately
945 : * followed by the start of a new record. XLogBeginRead should tolerate
946 : * either case.
947 : *
948 : * We need to allow for both cases because the behavior of xlogreader
949 : * varies. When a record spans two or more xlog pages, the ending LSN
950 : * reported by xlogreader will be the starting LSN of the following
951 : * record, but when an xlog page boundary falls between two records, the
952 : * end LSN for the first will be reported as the first byte of the
953 : * following page. We can't know until we read that page how large the
954 : * header will be, but we'll have to skip over it to find the next record.
955 : */
956 42 : if (exact)
957 : {
958 : /*
959 : * Even if start_lsn is the beginning of a page rather than the
960 : * beginning of the first record on that page, we should still use it
961 : * as the start LSN for the summary file. That's because we detect
962 : * missing summary files by looking for cases where the end LSN of one
963 : * file is less than the start LSN of the next file. When only a page
964 : * header is skipped, nothing has been missed.
965 : */
966 36 : XLogBeginRead(xlogreader, start_lsn);
967 36 : summary_start_lsn = start_lsn;
968 : }
969 : else
970 : {
971 6 : summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn);
972 6 : if (!XLogRecPtrIsValid(summary_start_lsn))
973 : {
974 : /*
975 : * If we hit end-of-WAL while trying to find the next valid
976 : * record, we must be on a historic timeline that has no valid
977 : * records that begin after start_lsn and before end of WAL.
978 : */
979 0 : if (private_data->end_of_wal)
980 : {
981 0 : ereport(DEBUG1,
982 : errmsg_internal("could not read WAL from timeline %u at %X/%08X: end of WAL at %X/%08X",
983 : tli,
984 : LSN_FORMAT_ARGS(start_lsn),
985 : LSN_FORMAT_ARGS(private_data->read_upto)));
986 :
987 : /*
988 : * The timeline ends at or after start_lsn, without containing
989 : * any records. Thus, we must make sure the main loop does not
990 : * iterate. If start_lsn is the end of the timeline, then we
991 : * won't actually emit an empty summary file, but otherwise,
992 : * we must, to capture the fact that the LSN range in question
993 : * contains no interesting WAL records.
994 : */
995 0 : summary_start_lsn = start_lsn;
996 0 : summary_end_lsn = private_data->read_upto;
997 0 : switch_lsn = xlogreader->EndRecPtr;
998 : }
999 : else
1000 0 : ereport(ERROR,
1001 : errmsg("could not find a valid record after %X/%08X",
1002 : LSN_FORMAT_ARGS(start_lsn)));
1003 : }
1004 :
1005 : /* We shouldn't go backward. */
1006 : Assert(summary_start_lsn >= start_lsn);
1007 : }
1008 :
1009 : /*
1010 : * Main loop: read xlog records one by one.
1011 : */
1012 : while (1)
1013 98874 : {
1014 : int block_id;
1015 : char *errormsg;
1016 : XLogRecord *record;
1017 : uint8 rmid;
1018 :
1019 98916 : ProcessWalSummarizerInterrupts();
1020 :
1021 : /* We shouldn't go backward. */
1022 : Assert(summary_start_lsn <= xlogreader->EndRecPtr);
1023 :
1024 : /* Now read the next record. */
1025 98914 : record = XLogReadRecord(xlogreader, &errormsg);
1026 98910 : if (record == NULL)
1027 : {
1028 0 : if (private_data->end_of_wal)
1029 : {
1030 : /*
1031 : * This timeline must be historic and must end before we were
1032 : * able to read a complete record.
1033 : */
1034 0 : ereport(DEBUG1,
1035 : errmsg_internal("could not read WAL from timeline %u at %X/%08X: end of WAL at %X/%08X",
1036 : tli,
1037 : LSN_FORMAT_ARGS(xlogreader->EndRecPtr),
1038 : LSN_FORMAT_ARGS(private_data->read_upto)));
1039 : /* Summary ends at end of WAL. */
1040 0 : summary_end_lsn = private_data->read_upto;
1041 0 : break;
1042 : }
1043 0 : if (errormsg)
1044 0 : ereport(ERROR,
1045 : (errcode_for_file_access(),
1046 : errmsg("could not read WAL from timeline %u at %X/%08X: %s",
1047 : tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr),
1048 : errormsg)));
1049 : else
1050 0 : ereport(ERROR,
1051 : (errcode_for_file_access(),
1052 : errmsg("could not read WAL from timeline %u at %X/%08X",
1053 : tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr))));
1054 : }
1055 :
1056 : /* We shouldn't go backward. */
1057 : Assert(summary_start_lsn <= xlogreader->EndRecPtr);
1058 :
1059 98910 : if (XLogRecPtrIsValid(switch_lsn) &&
1060 0 : xlogreader->ReadRecPtr >= switch_lsn)
1061 : {
1062 : /*
1063 : * Whoops! We've read a record that *starts* after the switch LSN,
1064 : * contrary to our goal of reading only until we hit the first
1065 : * record that ends at or after the switch LSN. Pretend we didn't
1066 : * read it after all by bailing out of this loop right here,
1067 : * before we do anything with this record.
1068 : *
1069 : * This can happen because the last record before the switch LSN
1070 : * might be continued across multiple pages, and then we might
1071 : * come to a page with XLP_FIRST_IS_OVERWRITE_CONTRECORD set. In
1072 : * that case, the record that was continued across multiple pages
1073 : * is incomplete and will be disregarded, and the read will
1074 : * restart from the beginning of the page that is flagged
1075 : * XLP_FIRST_IS_OVERWRITE_CONTRECORD.
1076 : *
1077 : * If this case occurs, we can fairly say that the current summary
1078 : * file ends at the switch LSN exactly. The first record on the
1079 : * page marked XLP_FIRST_IS_OVERWRITE_CONTRECORD will be
1080 : * discovered when generating the next summary file.
1081 : */
1082 0 : summary_end_lsn = switch_lsn;
1083 0 : break;
1084 : }
1085 :
1086 : /*
1087 : * Certain types of records require special handling. Redo points and
1088 : * shutdown checkpoints trigger creation of new summary files and can
1089 : * also cause us to enter or exit "fast forward" mode. Other types of
1090 : * records can require special updates to the block reference table.
1091 : */
1092 98910 : rmid = XLogRecGetRmid(xlogreader);
1093 98910 : if (rmid == RM_XLOG_ID)
1094 : {
1095 : bool new_fast_forward;
1096 :
1097 : /*
1098 : * If we've already processed some WAL records when we hit a redo
1099 : * point or shutdown checkpoint, then we stop summarization before
1100 : * including this record in the current file, so that it will be
1101 : * the first record in the next file.
1102 : *
1103 : * When we hit one of those record types as the first record in a
1104 : * file, we adjust our notion of whether we're fast-forwarding.
1105 : * Any WAL generated with wal_level=minimal must be skipped
1106 : * without actually generating any summary file, because an
1107 : * incremental backup that crosses such WAL would be unsafe.
1108 : */
1109 1382 : if (SummarizeXlogRecord(xlogreader, &new_fast_forward))
1110 : {
1111 78 : if (xlogreader->ReadRecPtr > summary_start_lsn)
1112 : {
1113 36 : summary_end_lsn = xlogreader->ReadRecPtr;
1114 36 : break;
1115 : }
1116 : else
1117 42 : fast_forward = new_fast_forward;
1118 : }
1119 : }
1120 97528 : else if (!fast_forward)
1121 : {
1122 : /*
1123 : * This switch handles record types that require extra updates to
1124 : * the contents of the block reference table.
1125 : */
1126 97528 : switch (rmid)
1127 : {
1128 8 : case RM_DBASE_ID:
1129 8 : SummarizeDbaseRecord(xlogreader, brtab);
1130 8 : break;
1131 58 : case RM_SMGR_ID:
1132 58 : SummarizeSmgrRecord(xlogreader, brtab);
1133 58 : break;
1134 3072 : case RM_XACT_ID:
1135 3072 : SummarizeXactRecord(xlogreader, brtab);
1136 3072 : break;
1137 : }
1138 : }
1139 :
1140 : /*
1141 : * If we're in fast-forward mode, we don't really need to do anything.
1142 : * Otherwise, feed block references from xlog record to block
1143 : * reference table.
1144 : */
1145 98874 : if (!fast_forward)
1146 : {
1147 196906 : for (block_id = 0; block_id <= XLogRecMaxBlockId(xlogreader);
1148 98032 : block_id++)
1149 : {
1150 : RelFileLocator rlocator;
1151 : ForkNumber forknum;
1152 : BlockNumber blocknum;
1153 :
/* Skip block IDs for which no block tag can be fetched. */
1154 98032 : if (!XLogRecGetBlockTagExtended(xlogreader, block_id, &rlocator,
1155 : &forknum, &blocknum, NULL))
1156 80 : continue;
1157 :
1158 : /*
1159 : * As we do elsewhere, ignore the FSM fork, because it's not
1160 : * fully WAL-logged.
1161 : */
1162 97952 : if (forknum != FSM_FORKNUM)
1163 97378 : BlockRefTableMarkBlockModified(brtab, &rlocator, forknum,
1164 : blocknum);
1165 : }
1166 : }
1167 :
1168 : /* Update our notion of where this summary file ends. */
1169 98874 : summary_end_lsn = xlogreader->EndRecPtr;
1170 :
1171 : /* Also update shared memory. */
1172 98874 : LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
1173 : Assert(summary_end_lsn >= WalSummarizerCtl->summarized_lsn);
1174 98874 : WalSummarizerCtl->pending_lsn = summary_end_lsn;
1175 98874 : LWLockRelease(WALSummarizerLock);
1176 :
1177 : /*
1178 : * If we have a switch LSN and have reached it, stop before reading
1179 : * the next record.
1180 : */
1181 98874 : if (XLogRecPtrIsValid(switch_lsn) &&
1182 0 : xlogreader->EndRecPtr >= switch_lsn)
1183 0 : break;
1184 : }
1185 :
1186 : /* Destroy xlogreader. */
1187 36 : pfree(xlogreader->private_data);
1188 36 : XLogReaderFree(xlogreader);
1189 :
1190 : /*
1191 : * If a timeline switch occurs, we may fail to make any progress at all
1192 : * before exiting the loop above. If that happens, we don't write a WAL
1193 : * summary file at all. We can also skip writing a file if we're in
1194 : * fast-forward mode.
1195 : */
1196 36 : if (summary_end_lsn > summary_start_lsn && !fast_forward)
1197 : {
1198 : /* Generate temporary and final path name. */
1199 36 : snprintf(temp_path, MAXPGPATH,
1200 : XLOGDIR "/summaries/temp.summary");
1201 36 : snprintf(final_path, MAXPGPATH,
1202 : XLOGDIR "/summaries/%08X%08X%08X%08X%08X.summary",
1203 : tli,
1204 36 : LSN_FORMAT_ARGS(summary_start_lsn),
1205 36 : LSN_FORMAT_ARGS(summary_end_lsn));
1206 :
1207 : /* Open the temporary file for writing. */
1208 36 : io.filepos = 0;
1209 36 : io.file = PathNameOpenFile(temp_path, O_WRONLY | O_CREAT | O_TRUNC);
1210 36 : if (io.file < 0)
1211 0 : ereport(ERROR,
1212 : (errcode_for_file_access(),
1213 : errmsg("could not create file \"%s\": %m", temp_path)));
1214 :
1215 : /* Write the data. */
1216 36 : WriteBlockRefTable(brtab, WriteWalSummary, &io);
1217 :
1218 : /* Close temporary file (the xlogreader was already freed above). */
1219 36 : FileClose(io.file);
1220 :
1221 : /* Tell the user what we did. */
1222 36 : ereport(DEBUG1,
1223 : errmsg_internal("summarized WAL on TLI %u from %X/%08X to %X/%08X",
1224 : tli,
1225 : LSN_FORMAT_ARGS(summary_start_lsn),
1226 : LSN_FORMAT_ARGS(summary_end_lsn)));
1227 :
1228 : /* Durably rename the new summary into place. */
1229 36 : durable_rename(temp_path, final_path, ERROR);
1230 : }
1231 :
1232 : /* If we skipped a non-zero amount of WAL, log a debug message. */
1233 36 : if (summary_end_lsn > summary_start_lsn && fast_forward)
1234 0 : ereport(DEBUG1,
1235 : errmsg_internal("skipped summarizing WAL on TLI %u from %X/%08X to %X/%08X",
1236 : tli,
1237 : LSN_FORMAT_ARGS(summary_start_lsn),
1238 : LSN_FORMAT_ARGS(summary_end_lsn)));
1239 :
1240 36 : return summary_end_lsn;
1241 : }
1242 :
1243 : /*
1244 : * Special handling for WAL records with RM_DBASE_ID.
1245 : */
1246 : static void
1247 8 : SummarizeDbaseRecord(XLogReaderState *xlogreader, BlockRefTable *brtab)
1248 : {
1249 8 : uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1250 :
1251 : /*
1252 : * We use relfilenode zero for a given database OID and tablespace OID to
1253 : * indicate that all relations with that pair of IDs have been recreated
1254 : * if they exist at all. Effectively, we're setting a limit block of 0 for
1255 : * all such relfilenodes.
1256 : *
1257 : * Technically, this special handling is only needed in the case of
1258 : * XLOG_DBASE_CREATE_FILE_COPY, because that can create a whole bunch of
1259 : * relation files in a directory without logging anything specific to each
1260 : * one. If we didn't mark the whole DB OID/TS OID combination in some way,
1261 : * then a tablespace that was dropped after the reference backup and
1262 : * recreated using the FILE_COPY method prior to the incremental backup
1263 : * would look just like one that was never touched at all, which would be
1264 : * catastrophic.
1265 : *
1266 : * But it seems best to adopt this treatment for all records that drop or
1267 : * create a DB OID/TS OID combination. That's similar to how we treat the
1268 : * limit block for individual relations, and it's an extra layer of safety
1269 : * here. We can never lose data by marking more stuff as needing to be
1270 : * backed up in full.
1271 : */
1272 8 : if (info == XLOG_DBASE_CREATE_FILE_COPY)
1273 : {
1274 : xl_dbase_create_file_copy_rec *xlrec;
1275 : RelFileLocator rlocator;
1276 :
1277 8 : xlrec =
1278 8 : (xl_dbase_create_file_copy_rec *) XLogRecGetData(xlogreader);
1279 8 : rlocator.spcOid = xlrec->tablespace_id;
1280 8 : rlocator.dbOid = xlrec->db_id;
1281 8 : rlocator.relNumber = 0;
1282 8 : BlockRefTableSetLimitBlock(brtab, &rlocator, MAIN_FORKNUM, 0);
1283 : }
1284 0 : else if (info == XLOG_DBASE_CREATE_WAL_LOG)
1285 : {
1286 : xl_dbase_create_wal_log_rec *xlrec;
1287 : RelFileLocator rlocator;
1288 :
1289 0 : xlrec = (xl_dbase_create_wal_log_rec *) XLogRecGetData(xlogreader);
1290 0 : rlocator.spcOid = xlrec->tablespace_id;
1291 0 : rlocator.dbOid = xlrec->db_id;
1292 0 : rlocator.relNumber = 0;
1293 0 : BlockRefTableSetLimitBlock(brtab, &rlocator, MAIN_FORKNUM, 0);
1294 : }
1295 0 : else if (info == XLOG_DBASE_DROP)
1296 : {
1297 : xl_dbase_drop_rec *xlrec;
1298 : RelFileLocator rlocator;
1299 : int i;
1300 :
1301 0 : xlrec = (xl_dbase_drop_rec *) XLogRecGetData(xlogreader);
1302 0 : rlocator.dbOid = xlrec->db_id;
1303 0 : rlocator.relNumber = 0;
1304 0 : for (i = 0; i < xlrec->ntablespaces; ++i)
1305 : {
1306 0 : rlocator.spcOid = xlrec->tablespace_ids[i];
1307 0 : BlockRefTableSetLimitBlock(brtab, &rlocator, MAIN_FORKNUM, 0);
1308 : }
1309 : }
1310 8 : }
1311 :
1312 : /*
1313 : * Special handling for WAL records with RM_SMGR_ID.
1314 : */
1315 : static void
1316 58 : SummarizeSmgrRecord(XLogReaderState *xlogreader, BlockRefTable *brtab)
1317 : {
1318 58 : uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1319 :
1320 58 : if (info == XLOG_SMGR_CREATE)
1321 : {
1322 : xl_smgr_create *xlrec;
1323 :
1324 : /*
1325 : * If a new relation fork is created on disk, there is no point
1326 : * tracking anything about which blocks have been modified, because
1327 : * the whole thing will be new. Hence, set the limit block for this
1328 : * fork to 0.
1329 : *
1330 : * Ignore the FSM fork, which is not fully WAL-logged.
1331 : */
1332 56 : xlrec = (xl_smgr_create *) XLogRecGetData(xlogreader);
1333 :
1334 56 : if (xlrec->forkNum != FSM_FORKNUM)
1335 56 : BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator,
1336 : xlrec->forkNum, 0);
1337 : }
1338 2 : else if (info == XLOG_SMGR_TRUNCATE)
1339 : {
1340 : xl_smgr_truncate *xlrec;
1341 :
1342 2 : xlrec = (xl_smgr_truncate *) XLogRecGetData(xlogreader);
1343 :
1344 : /*
1345 : * If a relation fork is truncated on disk, there is no point in
1346 : * tracking anything about block modifications beyond the truncation
1347 : * point.
1348 : *
1349 : * We ignore SMGR_TRUNCATE_FSM here because the FSM isn't fully
1350 : * WAL-logged and thus we can't track modified blocks for it anyway.
1351 : */
1352 2 : if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
1353 2 : BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator,
1354 : MAIN_FORKNUM, xlrec->blkno);
1355 2 : if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0)
1356 2 : BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator,
1357 : VISIBILITYMAP_FORKNUM, xlrec->blkno);
1358 : }
1359 58 : }
1360 :
1361 : /*
1362 : * Special handling for WAL records with RM_XACT_ID.
1363 : */
1364 : static void
1365 3072 : SummarizeXactRecord(XLogReaderState *xlogreader, BlockRefTable *brtab)
1366 : {
1367 3072 : uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1368 3072 : uint8 xact_info = info & XLOG_XACT_OPMASK;
1369 :
1370 3072 : if (xact_info == XLOG_XACT_COMMIT ||
1371 : xact_info == XLOG_XACT_COMMIT_PREPARED)
1372 3072 : {
1373 3072 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(xlogreader);
1374 : xl_xact_parsed_commit parsed;
1375 : int i;
1376 :
1377 : /*
1378 : * Don't track modified blocks for any relations that were removed on
1379 : * commit.
1380 : */
1381 3072 : ParseCommitRecord(XLogRecGetInfo(xlogreader), xlrec, &parsed);
1382 3072 : for (i = 0; i < parsed.nrels; ++i)
1383 : {
1384 : ForkNumber forknum;
1385 :
1386 0 : for (forknum = 0; forknum <= MAX_FORKNUM; ++forknum)
1387 0 : if (forknum != FSM_FORKNUM)
1388 0 : BlockRefTableSetLimitBlock(brtab, &parsed.xlocators[i],
1389 : forknum, 0);
1390 : }
1391 : }
1392 0 : else if (xact_info == XLOG_XACT_ABORT ||
1393 : xact_info == XLOG_XACT_ABORT_PREPARED)
1394 : {
1395 0 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(xlogreader);
1396 : xl_xact_parsed_abort parsed;
1397 : int i;
1398 :
1399 : /*
1400 : * Don't track modified blocks for any relations that were removed on
1401 : * abort.
1402 : */
1403 0 : ParseAbortRecord(XLogRecGetInfo(xlogreader), xlrec, &parsed);
1404 0 : for (i = 0; i < parsed.nrels; ++i)
1405 : {
1406 : ForkNumber forknum;
1407 :
1408 0 : for (forknum = 0; forknum <= MAX_FORKNUM; ++forknum)
1409 0 : if (forknum != FSM_FORKNUM)
1410 0 : BlockRefTableSetLimitBlock(brtab, &parsed.xlocators[i],
1411 : forknum, 0);
1412 : }
1413 : }
1414 3072 : }
1415 :
1416 : /*
1417 : * Special handling for WAL records with RM_XLOG_ID.
1418 : *
1419 : * The return value is true if WAL summarization should stop before this
1420 : * record and false otherwise. When the return value is true,
1421 : * *new_fast_forward indicates whether future processing should be done
1422 : * in fast forward mode (i.e. read WAL without emitting summaries) or not.
1423 : */
1424 : static bool
1425 1382 : SummarizeXlogRecord(XLogReaderState *xlogreader, bool *new_fast_forward)
1426 : {
1427 1382 : uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1428 : int record_wal_level;
1429 :
1430 1382 : if (info == XLOG_CHECKPOINT_REDO)
1431 : {
1432 : /* Payload is wal_level at the time record was written. */
1433 46 : memcpy(&record_wal_level, XLogRecGetData(xlogreader), sizeof(int));
1434 : }
1435 1336 : else if (info == XLOG_CHECKPOINT_SHUTDOWN)
1436 : {
1437 : CheckPoint rec_ckpt;
1438 :
1439 : /* Extract wal_level at time record was written from payload. */
1440 24 : memcpy(&rec_ckpt, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1441 24 : record_wal_level = rec_ckpt.wal_level;
1442 : }
1443 1312 : else if (info == XLOG_PARAMETER_CHANGE)
1444 : {
1445 : xl_parameter_change xlrec;
1446 :
1447 : /* Extract wal_level at time record was written from payload. */
1448 8 : memcpy(&xlrec, XLogRecGetData(xlogreader),
1449 : sizeof(xl_parameter_change));
1450 8 : record_wal_level = xlrec.wal_level;
1451 : }
1452 1304 : else if (info == XLOG_END_OF_RECOVERY)
1453 : {
1454 : xl_end_of_recovery xlrec;
1455 :
1456 : /* Extract wal_level at time record was written from payload. */
1457 0 : memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1458 0 : record_wal_level = xlrec.wal_level;
1459 : }
1460 : else
1461 : {
1462 : /* No special handling required. Return false. */
1463 1304 : return false;
1464 : }
1465 :
1466 : /*
1467 : * Redo can only begin at an XLOG_CHECKPOINT_REDO or
1468 : * XLOG_CHECKPOINT_SHUTDOWN record, so we want WAL summarization to begin
1469 : * at those points. Hence, when those records are encountered, return
1470 : * true, so that we stop just before summarizing either of those records.
1471 : *
1472 : * We also reach here if we just saw XLOG_END_OF_RECOVERY or
1473 : * XLOG_PARAMETER_CHANGE. These are not places where recovery can start,
1474 : * but they're still relevant here. A new timeline can begin with
1475 : * XLOG_END_OF_RECOVERY, so we need to confirm the WAL level at that
1476 : * point; and a restart can provoke XLOG_PARAMETER_CHANGE after an
1477 : * intervening change to postgresql.conf, which might force us to stop
1478 : * summarizing.
1479 : */
1480 78 : *new_fast_forward = (record_wal_level == WAL_LEVEL_MINIMAL);
1481 78 : return true;
1482 : }
1483 :
1484 : /*
1485 : * Similar to read_local_xlog_page, but limited to read from one particular
1486 : * timeline. If the end of WAL is reached, it will wait for more if reading
1487 : * from the current timeline, or give up if reading from a historic timeline.
1488 : * In the latter case, it will also set private_data->end_of_wal = true.
1489 : *
1490 : * Caller must set private_data->tli to the TLI of interest,
1491 : * private_data->read_upto to the lowest LSN that is not known to be safe
1492 : * to read on that timeline, and private_data->historic to true if and only
1493 : * if the timeline is not the current timeline. This function will update
1494 : * private_data->read_upto and private_data->historic if more WAL appears
1495 : * on the current timeline or if the current timeline becomes historic.
1496 : */
1497 : static int
1498 3962 : summarizer_read_local_xlog_page(XLogReaderState *state,
1499 : XLogRecPtr targetPagePtr, int reqLen,
1500 : XLogRecPtr targetRecPtr, char *cur_page)
1501 : {
1502 : int count;
1503 : WALReadError errinfo;
1504 : SummarizerReadLocalXLogPrivate *private_data;
1505 :
1506 3962 : ProcessWalSummarizerInterrupts();
1507 :
1508 3962 : private_data = (SummarizerReadLocalXLogPrivate *)
1509 : state->private_data;
1510 :
1511 : while (1)
1512 : {
1513 4002 : if (targetPagePtr + XLOG_BLCKSZ <= private_data->read_upto)
1514 : {
1515 : /*
1516 : * more than one block available; read only that block, have
1517 : * caller come back if they need more.
1518 : */
1519 3916 : count = XLOG_BLCKSZ;
1520 3916 : break;
1521 : }
1522 86 : else if (targetPagePtr + reqLen > private_data->read_upto)
1523 : {
1524 : /* We don't seem to have enough data. */
1525 44 : if (private_data->historic)
1526 : {
1527 : /*
1528 : * This is a historic timeline, so there will never be any
1529 : * more data than we have currently.
1530 : */
1531 0 : private_data->end_of_wal = true;
1532 0 : return -1;
1533 : }
1534 : else
1535 : {
1536 : XLogRecPtr latest_lsn;
1537 : TimeLineID latest_tli;
1538 :
1539 : /*
1540 : * This is - or at least was up until very recently - the
1541 : * current timeline, so more data might show up. Delay here
1542 : * so we don't tight-loop.
1543 : */
1544 44 : ProcessWalSummarizerInterrupts();
1545 40 : summarizer_wait_for_wal();
1546 :
1547 : /* Recheck end-of-WAL. */
1548 40 : latest_lsn = GetLatestLSN(&latest_tli);
1549 40 : if (private_data->tli == latest_tli)
1550 : {
1551 : /* Still the current timeline, update max LSN. */
1552 : Assert(latest_lsn >= private_data->read_upto);
1553 40 : private_data->read_upto = latest_lsn;
1554 : }
1555 : else
1556 : {
1557 0 : List *tles = readTimeLineHistory(latest_tli);
1558 : XLogRecPtr switchpoint;
1559 :
1560 : /*
1561 : * The timeline we're scanning is no longer the latest
1562 : * one. Figure out when it ended.
1563 : */
1564 0 : private_data->historic = true;
1565 0 : switchpoint = tliSwitchPoint(private_data->tli, tles,
1566 : NULL);
1567 :
1568 : /*
1569 : * Allow reads up to exactly the switch point.
1570 : *
1571 : * It's possible that this will cause read_upto to move
1572 : * backwards, because we might have been promoted before
1573 : * reaching the end of the previous timeline. In that
1574 : * case, the next loop iteration will likely conclude that
1575 : * we've reached end of WAL.
1576 : */
1577 0 : private_data->read_upto = switchpoint;
1578 :
1579 : /* Debugging output. */
1580 0 : ereport(DEBUG1,
1581 : errmsg_internal("timeline %u became historic, can read up to %X/%08X",
1582 : private_data->tli, LSN_FORMAT_ARGS(private_data->read_upto)));
1583 : }
1584 :
1585 : /* Go around and try again. */
1586 : }
1587 : }
1588 : else
1589 : {
1590 : /* enough bytes available to satisfy the request */
1591 42 : count = private_data->read_upto - targetPagePtr;
1592 42 : break;
1593 : }
1594 : }
1595 :
1596 3958 : if (!WALRead(state, cur_page, targetPagePtr, count,
1597 : private_data->tli, &errinfo))
1598 0 : WALReadRaiseError(&errinfo);
1599 :
1600 : /* Track that we read a page, for sleep time calculation. */
1601 3958 : ++pages_read_since_last_sleep;
1602 :
1603 : /* number of valid bytes in the buffer */
1604 3958 : return count;
1605 : }
1606 :
1607 : /*
1608 : * Sleep for long enough that we believe it's likely that more WAL will
1609 : * be available afterwards.
1610 : */
1611 : static void
1612 40 : summarizer_wait_for_wal(void)
1613 : {
1614 40 : if (pages_read_since_last_sleep == 0)
1615 : {
1616 : /*
1617 : * No pages were read since the last sleep, so double the sleep time,
1618 : * but not beyond the maximum allowable value.
1619 : */
1620 20 : sleep_quanta = Min(sleep_quanta * 2, MAX_SLEEP_QUANTA);
1621 : }
1622 20 : else if (pages_read_since_last_sleep > 1)
1623 : {
1624 : /*
1625 : * Multiple pages were read since the last sleep, so reduce the sleep
1626 : * time.
1627 : *
1628 : * A large burst of activity should be able to quickly reduce the
1629 : * sleep time to the minimum, but we don't want a handful of extra WAL
1630 : * records to provoke a strong reaction. We choose to reduce the sleep
1631 : * time by 1 quantum for each page read beyond the first, which is a
1632 : * fairly arbitrary way of trying to be reactive without overreacting.
1633 : */
1634 16 : if (pages_read_since_last_sleep > sleep_quanta - 1)
1635 16 : sleep_quanta = 1;
1636 : else
1637 0 : sleep_quanta -= pages_read_since_last_sleep;
1638 : }
1639 :
1640 : /* Report pending statistics to the cumulative stats system. */
1641 40 : pgstat_report_wal(false);
1642 :
1643 : /* OK, now sleep. */
1644 40 : (void) WaitLatch(MyLatch,
1645 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
1646 : sleep_quanta * MS_PER_SLEEP_QUANTUM,
1647 : WAIT_EVENT_WAL_SUMMARIZER_WAL);
1648 40 : ResetLatch(MyLatch);
1649 :
1650 : /* Reset count of pages read. */
1651 40 : pages_read_since_last_sleep = 0;
1652 40 : }
1653 :
1654 : /*
1655 : * Remove WAL summaries whose mtimes are older than wal_summary_keep_time.
1656 : */
1657 : static void
1658 42 : MaybeRemoveOldWalSummaries(void)
1659 : {
1660 42 : XLogRecPtr redo_pointer = GetRedoRecPtr();
1661 : List *wslist;
1662 : time_t cutoff_time;
1663 :
1664 : /* If WAL summary removal is disabled, don't do anything. */
1665 42 : if (wal_summary_keep_time == 0)
1666 0 : return;
1667 :
1668 : /*
1669 : * If the redo pointer has not advanced, don't do anything.
1670 : *
1671 : * This has the effect that we only try to remove old WAL summary files
1672 : * once per checkpoint cycle.
1673 : */
1674 42 : if (redo_pointer == redo_pointer_at_last_summary_removal)
1675 30 : return;
1676 12 : redo_pointer_at_last_summary_removal = redo_pointer;
1677 :
1678 : /*
1679 : * Files should only be removed if the last modification time precedes the
1680 : * cutoff time we compute here.
1681 : */
1682 12 : cutoff_time = time(NULL) - wal_summary_keep_time * SECS_PER_MINUTE;
1683 :
1684 : /* Get all the summaries that currently exist. */
1685 12 : wslist = GetWalSummaries(0, InvalidXLogRecPtr, InvalidXLogRecPtr);
1686 :
1687 : /* Loop until all summaries have been considered for removal. */
1688 18 : while (wslist != NIL)
1689 : {
1690 : ListCell *lc;
1691 : XLogSegNo oldest_segno;
1692 6 : XLogRecPtr oldest_lsn = InvalidXLogRecPtr;
1693 : TimeLineID selected_tli;
1694 :
1695 6 : ProcessWalSummarizerInterrupts();
1696 :
1697 : /*
1698 : * Pick a timeline for which some summary files still exist on disk,
1699 : * and find the oldest LSN that still exists on disk for that
1700 : * timeline.
1701 : */
1702 6 : selected_tli = ((WalSummaryFile *) linitial(wslist))->tli;
1703 6 : oldest_segno = XLogGetOldestSegno(selected_tli);
1704 6 : if (oldest_segno != 0)
1705 6 : XLogSegNoOffsetToRecPtr(oldest_segno, 0, wal_segment_size,
1706 : oldest_lsn);
1707 :
1708 :
1709 : /* Consider each WAL file on the selected timeline in turn. */
1710 56 : foreach(lc, wslist)
1711 : {
1712 50 : WalSummaryFile *ws = lfirst(lc);
1713 :
1714 50 : ProcessWalSummarizerInterrupts();
1715 :
1716 : /* If it's not on this timeline, it's not time to consider it. */
1717 50 : if (selected_tli != ws->tli)
1718 0 : continue;
1719 :
1720 : /*
1721 : * If the WAL doesn't exist any more, we can remove it if the file
1722 : * modification time is old enough.
1723 : */
1724 50 : if (!XLogRecPtrIsValid(oldest_lsn) || ws->end_lsn <= oldest_lsn)
1725 0 : RemoveWalSummaryIfOlderThan(ws, cutoff_time);
1726 :
1727 : /*
1728 : * Whether we removed the file or not, we need not consider it
1729 : * again.
1730 : */
1731 50 : wslist = foreach_delete_current(wslist, lc);
1732 50 : pfree(ws);
1733 : }
1734 : }
1735 : }
|