Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * walsummarizer.c
4 : *
5 : * Background process to perform WAL summarization, if it is enabled.
6 : * It continuously scans the write-ahead log and periodically emits a
7 : * summary file which indicates which blocks in which relation forks
8 : * were modified by WAL records in the LSN range covered by the summary
9 : * file. See walsummary.c and blkreftable.c for more details on the
10 : * naming and contents of WAL summary files.
11 : *
12 : * If configured to do so, this background process will also remove WAL
13 : * summary files when the file timestamp is older than a configurable
14 : * threshold (but only if the WAL has been removed first).
15 : *
16 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
17 : *
18 : * IDENTIFICATION
19 : * src/backend/postmaster/walsummarizer.c
20 : *
21 : *-------------------------------------------------------------------------
22 : */
23 : #include "postgres.h"
24 :
25 : #include "access/timeline.h"
26 : #include "access/xlog.h"
27 : #include "access/xlog_internal.h"
28 : #include "access/xlogrecovery.h"
29 : #include "access/xlogutils.h"
30 : #include "backup/walsummary.h"
31 : #include "catalog/storage_xlog.h"
32 : #include "commands/dbcommands_xlog.h"
33 : #include "common/blkreftable.h"
34 : #include "libpq/pqsignal.h"
35 : #include "miscadmin.h"
36 : #include "postmaster/auxprocess.h"
37 : #include "postmaster/interrupt.h"
38 : #include "postmaster/walsummarizer.h"
39 : #include "replication/walreceiver.h"
40 : #include "storage/fd.h"
41 : #include "storage/ipc.h"
42 : #include "storage/latch.h"
43 : #include "storage/lwlock.h"
44 : #include "storage/proc.h"
45 : #include "storage/procsignal.h"
46 : #include "storage/shmem.h"
47 : #include "utils/guc.h"
48 : #include "utils/memutils.h"
49 : #include "utils/wait_event.h"
50 :
51 : /*
52 : * Shared-memory state for WAL summarization; reached via WalSummarizerCtl.
53 : */
54 : typedef struct
55 : {
56 : /*
57 : * These fields are protected by WALSummarizerLock.
58 : *
59 : * Until we've discovered what summary files already exist on disk and
60 : * stored that information in shared memory, initialized is false and the
61 : * other fields here contain no meaningful information. After that has
62 : * been done, initialized is true.
63 : *
64 : * summarized_tli and summarized_lsn indicate the TLI and LSN at which
65 : * the next summary file will start. Normally, these are the LSN and TLI
66 : * at which the last file ended; in that case, lsn_is_exact is true.
67 : * If, however, the LSN is just an approximation, then lsn_is_exact is
68 : * false. This can happen if, for example, there are no existing WAL
69 : * summary files at startup. In that case, we have to derive the position
70 : * at which to start summarizing from the WAL files that exist on disk,
71 : * and so the LSN might point to the start of the next file even though
72 : * that might happen to be in the middle of a WAL record.
73 : *
74 : * summarizer_pgprocno is the proc number of the summarizer process, if
75 : * one is running, or else INVALID_PROC_NUMBER.
76 : *
77 : * pending_lsn is used by the summarizer to advertise the ending LSN of a
78 : * record it has recently read. It shouldn't ever be less than
79 : * summarized_lsn, but might be greater, because the summarizer buffers
80 : * data for a range of LSNs in memory before writing out a new file.
81 : */
82 : bool initialized;
83 : TimeLineID summarized_tli;
84 : XLogRecPtr summarized_lsn;
85 : bool lsn_is_exact;
86 : ProcNumber summarizer_pgprocno;
87 : XLogRecPtr pending_lsn;
88 :
89 : /*
90 : * This field handles its own synchronization. It is broadcast each time
91 : */
92 : ConditionVariable summary_file_cv; /* ... the summarizer finishes a summary; WaitForWalSummarization sleeps on it */
93 : } WalSummarizerData;
94 :
95 : /*
96 : * Private data for our xlogreader's page read callback; filled in by SummarizeWAL().
97 : */
98 : typedef struct
99 : {
100 : TimeLineID tli; /* timeline being summarized */
101 : bool historic; /* true when a switch LSN is known, i.e. tli is historic */
102 : XLogRecPtr read_upto; /* LSN beyond which we can't count on reading WAL */
103 : bool end_of_wal; /* starts out false (palloc0); presumably set by the read callback -- confirm */
104 : } SummarizerReadLocalXLogPrivate;
105 :
106 : /* Pointer to shared memory state; assigned in WalSummarizerShmemInit(). */
107 : static WalSummarizerData *WalSummarizerCtl;
108 :
109 : /*
110 : * When we reach end of WAL and need to read more, we sleep for a number of
111 : * milliseconds that is an integer multiple of MS_PER_SLEEP_QUANTUM. This is
112 : * the multiplier. It should vary between 1 and MAX_SLEEP_QUANTA, depending
113 : * on system activity. See summarizer_wait_for_wal() for how we adjust this.
114 : */
115 : static long sleep_quanta = 1;
116 :
117 : /*
118 : * The sleep time will always be a multiple of 200ms and will not exceed
119 : * thirty seconds (150 * 200 = 30 * 1000). Note that the timeout here needs
120 : * to be substantially less than the maximum amount of time for which an
121 : * incremental backup will wait for this process to catch up. Otherwise, an
122 : * incremental backup might time out on an idle system just because we sleep
123 : * for too long.
124 : */
125 : #define MAX_SLEEP_QUANTA 150
126 : #define MS_PER_SLEEP_QUANTUM 200
127 :
128 : /*
129 : * This is a count of the number of pages of WAL that we've read since the
130 : * last time we waited for more WAL to appear.
131 : */
132 : static long pages_read_since_last_sleep = 0;
133 :
134 : /*
135 : * Most recent RedoRecPtr value observed by MaybeRemoveOldWalSummaries.
136 : */
137 : static XLogRecPtr redo_pointer_at_last_summary_removal = InvalidXLogRecPtr;
138 :
139 : /*
140 : * GUC parameters
141 : */
142 : bool summarize_wal = false; /* when false, the summarizer exits and waiters give up */
143 : int wal_summary_keep_time = 10 * HOURS_PER_DAY * MINS_PER_HOUR; /* default works out to ten days' worth of minutes */
144 :
145 : static void WalSummarizerShutdown(int code, Datum arg);
146 : static XLogRecPtr GetLatestLSN(TimeLineID *tli);
147 : static void HandleWalSummarizerInterrupts(void);
148 : static XLogRecPtr SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn,
149 : bool exact, XLogRecPtr switch_lsn,
150 : XLogRecPtr maximum_lsn);
151 : static void SummarizeDbaseRecord(XLogReaderState *xlogreader,
152 : BlockRefTable *brtab);
153 : static void SummarizeSmgrRecord(XLogReaderState *xlogreader,
154 : BlockRefTable *brtab);
155 : static void SummarizeXactRecord(XLogReaderState *xlogreader,
156 : BlockRefTable *brtab);
157 : static bool SummarizeXlogRecord(XLogReaderState *xlogreader,
158 : bool *new_fast_forward);
159 : static int summarizer_read_local_xlog_page(XLogReaderState *state,
160 : XLogRecPtr targetPagePtr,
161 : int reqLen,
162 : XLogRecPtr targetRecPtr,
163 : char *cur_page);
164 : static void summarizer_wait_for_wal(void);
165 : static void MaybeRemoveOldWalSummaries(void);
166 :
167 : /*
168 : * Amount of shared memory required for this module: one WalSummarizerData.
169 : */
170 : Size
171 5484 : WalSummarizerShmemSize(void)
172 : {
173 5484 : return sizeof(WalSummarizerData);
174 : }
175 :
176 : /*
177 : * Create or attach to shared memory for this module, filling in placeholder values if it is newly created.
178 : */
179 : void
180 1918 : WalSummarizerShmemInit(void)
181 : {
182 : bool found;
183 :
184 1918 : WalSummarizerCtl = (WalSummarizerData *)
185 1918 : ShmemInitStruct("Wal Summarizer Ctl", WalSummarizerShmemSize(),
186 : &found);
187 :
188 1918 : if (!found)
189 : {
190 : /*
191 : * First time through, so initialize.
192 : *
193 : * We're just filling in dummy values here -- the real initialization
194 : * happens the first time GetOldestUnsummarizedLSN() computes values
195 : * and sets initialized to true.
196 : */
197 1918 : WalSummarizerCtl->initialized = false;
198 1918 : WalSummarizerCtl->summarized_tli = 0;
199 1918 : WalSummarizerCtl->summarized_lsn = InvalidXLogRecPtr;
200 1918 : WalSummarizerCtl->lsn_is_exact = false;
201 1918 : WalSummarizerCtl->summarizer_pgprocno = INVALID_PROC_NUMBER;
202 1918 : WalSummarizerCtl->pending_lsn = InvalidXLogRecPtr;
203 1918 : ConditionVariableInit(&WalSummarizerCtl->summary_file_cv);
204 : }
205 1918 : }
206 :
207 : /*
208 : * Entry point for walsummarizer process. Loops until the process exits; startup_data must be empty.
209 : */
210 : void
211 2 : WalSummarizerMain(char *startup_data, size_t startup_data_len)
212 : {
213 : sigjmp_buf local_sigjmp_buf;
214 : MemoryContext context;
215 :
216 : /*
217 : * Within this function, 'current_lsn' and 'current_tli' refer to the
218 : * point from which the next WAL summary file should start. 'exact' is
219 : * true if 'current_lsn' is known to be the start of a WAL record or WAL
220 : * segment, and false if it might be in the middle of a record someplace.
221 : *
222 : * 'switch_lsn' and 'switch_tli', if set, are the LSN at which we need to
223 : * switch to a new timeline and the timeline to which we need to switch.
224 : * If not set, we either haven't figured out the answers yet or we're
225 : * already on the latest timeline.
226 : */
227 : XLogRecPtr current_lsn;
228 : TimeLineID current_tli;
229 : bool exact;
230 2 : XLogRecPtr switch_lsn = InvalidXLogRecPtr;
231 2 : TimeLineID switch_tli = 0;
232 :
233 : Assert(startup_data_len == 0);
234 :
235 2 : MyBackendType = B_WAL_SUMMARIZER;
236 2 : AuxiliaryProcessMainCommon();
237 :
238 2 : ereport(DEBUG1,
239 : (errmsg_internal("WAL summarizer started")));
240 :
241 : /*
242 : * Properly accept or ignore signals the postmaster might send us
243 : *
244 : * We have no particular use for SIGINT at the moment, but it seems
245 : * reasonable to treat it like SIGTERM.
246 : */
247 2 : pqsignal(SIGHUP, SignalHandlerForConfigReload);
248 2 : pqsignal(SIGINT, SignalHandlerForShutdownRequest);
249 2 : pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
250 : /* SIGQUIT handler was already set up by InitPostmasterChild */
251 2 : pqsignal(SIGALRM, SIG_IGN);
252 2 : pqsignal(SIGPIPE, SIG_IGN);
253 2 : pqsignal(SIGUSR1, procsignal_sigusr1_handler);
254 2 : pqsignal(SIGUSR2, SIG_IGN); /* not used */
255 :
256 : /* Advertise ourselves; WalSummarizerShutdown clears this again on exit. */
257 2 : on_shmem_exit(WalSummarizerShutdown, (Datum) 0);
258 2 : LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
259 2 : WalSummarizerCtl->summarizer_pgprocno = MyProcNumber;
260 2 : LWLockRelease(WALSummarizerLock);
261 :
262 : /* Create and switch to a memory context that we can reset on error. */
263 2 : context = AllocSetContextCreate(TopMemoryContext,
264 : "Wal Summarizer",
265 : ALLOCSET_DEFAULT_SIZES);
266 2 : MemoryContextSwitchTo(context);
267 :
268 : /*
269 : * Reset some signals that are accepted by postmaster but not here
270 : */
271 2 : pqsignal(SIGCHLD, SIG_DFL);
272 :
273 : /*
274 : * If an exception is encountered, processing resumes here.
275 : */
276 2 : if (sigsetjmp(local_sigjmp_buf, 1) != 0)
277 : {
278 : /* Since not using PG_TRY, must reset error stack by hand */
279 0 : error_context_stack = NULL;
280 :
281 : /* Prevent interrupts while cleaning up */
282 0 : HOLD_INTERRUPTS();
283 :
284 : /* Report the error to the server log */
285 0 : EmitErrorReport();
286 :
287 : /* Release resources we might have acquired. */
288 0 : LWLockReleaseAll();
289 0 : ConditionVariableCancelSleep();
290 0 : pgstat_report_wait_end();
291 0 : ReleaseAuxProcessResources(false);
292 0 : AtEOXact_Files(false);
293 0 : AtEOXact_HashTables(false);
294 :
295 : /*
296 : * Now return to normal top-level context and clear ErrorContext for
297 : * next time.
298 : */
299 0 : MemoryContextSwitchTo(context);
300 0 : FlushErrorState();
301 :
302 : /* Flush any leaked data in the top-level context */
303 0 : MemoryContextReset(context);
304 :
305 : /* Now we can allow interrupts again */
306 0 : RESUME_INTERRUPTS();
307 :
308 : /*
309 : * Sleep for 10 seconds before attempting to resume operations in
310 : * order to avoid excessive logging.
311 : *
312 : * Many of the likely error conditions are things that will repeat
313 : * every time. For example, if the WAL can't be read or the summary
314 : * can't be written, only administrator action will cure the problem.
315 : * So a really fast retry time doesn't seem to be especially
316 : * beneficial, and it will clutter the logs.
317 : */
318 0 : (void) WaitLatch(NULL,
319 : WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
320 : 10000,
321 : WAIT_EVENT_WAL_SUMMARIZER_ERROR);
322 : }
323 :
324 : /* We can now handle ereport(ERROR) */
325 2 : PG_exception_stack = &local_sigjmp_buf;
326 :
327 : /*
328 : * Unblock signals (they were blocked when the postmaster forked us)
329 : */
330 2 : sigprocmask(SIG_SETMASK, &UnBlockSig, NULL);
331 :
332 : /*
333 : * Fetch information about previous progress from shared memory, and ask
334 : * GetOldestUnsummarizedLSN to reset pending_lsn to summarized_lsn. We
335 : * might be recovering from an error, and if so, pending_lsn might have
336 : * advanced past summarized_lsn, but any WAL we read previously has been
337 : * lost and will need to be reread.
338 : *
339 : * If we discover that WAL summarization is not enabled, just exit.
340 : */
341 2 : current_lsn = GetOldestUnsummarizedLSN(&current_tli, &exact);
342 2 : if (XLogRecPtrIsInvalid(current_lsn))
343 0 : proc_exit(0);
344 :
345 : /*
346 : * Loop forever
347 : */
348 : for (;;)
349 16 : {
350 : XLogRecPtr latest_lsn;
351 : TimeLineID latest_tli;
352 : XLogRecPtr end_of_summary_lsn;
353 :
354 : /* Flush any leaked data in the top-level context */
355 18 : MemoryContextReset(context);
356 :
357 : /* Process any signals received recently. */
358 18 : HandleWalSummarizerInterrupts();
359 :
360 : /* If it's time to remove any old WAL summaries, do that now. */
361 18 : MaybeRemoveOldWalSummaries();
362 :
363 : /* Find the LSN and TLI up to which we can safely summarize. */
364 18 : latest_lsn = GetLatestLSN(&latest_tli);
365 :
366 : /*
367 : * If we're summarizing a historic timeline and we haven't yet
368 : * computed the point at which to switch to the next timeline, do that
369 : * now.
370 : *
371 : * Note that if this is a standby, what was previously the current
372 : * timeline could become historic at any time.
373 : *
374 : * We could try to make this more efficient by caching the results of
375 : * readTimeLineHistory when latest_tli has not changed, but since we
376 : * only have to do this once per timeline switch, we probably wouldn't
377 : * save any significant amount of work in practice.
378 : */
379 18 : if (current_tli != latest_tli && XLogRecPtrIsInvalid(switch_lsn))
380 : {
381 0 : List *tles = readTimeLineHistory(latest_tli);
382 :
383 0 : switch_lsn = tliSwitchPoint(current_tli, tles, &switch_tli);
384 0 : ereport(DEBUG1,
385 : errmsg_internal("switch point from TLI %u to TLI %u is at %X/%X",
386 : current_tli, switch_tli, LSN_FORMAT_ARGS(switch_lsn)));
387 : }
388 :
389 : /*
390 : * If we've reached the switch LSN, we can't summarize anything else
391 : * on this timeline. Switch to the next timeline and go around again,
392 : * backing up to the exact switch point if we passed it.
393 : */
394 18 : if (!XLogRecPtrIsInvalid(switch_lsn) && current_lsn >= switch_lsn)
395 : {
396 : /* Restart summarization from switch point. */
397 0 : current_tli = switch_tli;
398 0 : current_lsn = switch_lsn;
399 :
400 : /* Next timeline and switch point, if any, not yet known. */
401 0 : switch_lsn = InvalidXLogRecPtr;
402 0 : switch_tli = 0;
403 :
404 : /* Update (really, rewind, if needed) state in shared memory. */
405 0 : LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
406 0 : WalSummarizerCtl->summarized_lsn = current_lsn;
407 0 : WalSummarizerCtl->summarized_tli = current_tli;
408 0 : WalSummarizerCtl->lsn_is_exact = true;
409 0 : WalSummarizerCtl->pending_lsn = current_lsn;
410 0 : LWLockRelease(WALSummarizerLock);
411 :
412 0 : continue;
413 : }
414 :
415 : /* Summarize WAL. */
416 18 : end_of_summary_lsn = SummarizeWAL(current_tli,
417 : current_lsn, exact,
418 : switch_lsn, latest_lsn);
419 : Assert(!XLogRecPtrIsInvalid(end_of_summary_lsn));
420 : Assert(end_of_summary_lsn >= current_lsn);
421 :
422 : /*
423 : * Update state for next loop iteration.
424 : *
425 : * Next summary file should start from exactly where this one ended.
426 : */
427 16 : current_lsn = end_of_summary_lsn;
428 16 : exact = true;
429 :
430 : /* Update state in shared memory. */
431 16 : LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
432 16 : WalSummarizerCtl->summarized_lsn = end_of_summary_lsn;
433 16 : WalSummarizerCtl->summarized_tli = current_tli;
434 16 : WalSummarizerCtl->lsn_is_exact = true;
435 16 : WalSummarizerCtl->pending_lsn = end_of_summary_lsn;
436 16 : LWLockRelease(WALSummarizerLock);
437 :
438 : /* Wake up anyone waiting for more summary files to be written. */
439 16 : ConditionVariableBroadcast(&WalSummarizerCtl->summary_file_cv);
440 : }
441 : }
442 :
443 : /*
444 : * Get information about the state of the WAL summarizer; unknown values are reported as 0, InvalidXLogRecPtr, or -1.
445 : */
446 : void
447 0 : GetWalSummarizerState(TimeLineID *summarized_tli, XLogRecPtr *summarized_lsn,
448 : XLogRecPtr *pending_lsn, int *summarizer_pid)
449 : {
450 0 : LWLockAcquire(WALSummarizerLock, LW_SHARED);
451 0 : if (!WalSummarizerCtl->initialized)
452 : {
453 : /*
454 : * If initialized is false, the rest of the structure contents are
455 : * undefined, so report placeholder values for everything.
456 : */
457 0 : *summarized_tli = 0;
458 0 : *summarized_lsn = InvalidXLogRecPtr;
459 0 : *pending_lsn = InvalidXLogRecPtr;
460 0 : *summarizer_pid = -1;
461 : }
462 : else
463 : {
464 0 : int summarizer_pgprocno = WalSummarizerCtl->summarizer_pgprocno;
465 :
466 0 : *summarized_tli = WalSummarizerCtl->summarized_tli;
467 0 : *summarized_lsn = WalSummarizerCtl->summarized_lsn;
468 0 : if (summarizer_pgprocno == INVALID_PROC_NUMBER)
469 : {
470 : /*
471 : * If the summarizer has exited, the fact that it had processed
472 : * beyond summarized_lsn is irrelevant now.
473 : */
474 0 : *pending_lsn = WalSummarizerCtl->summarized_lsn;
475 0 : *summarizer_pid = -1;
476 : }
477 : else
478 : {
479 0 : *pending_lsn = WalSummarizerCtl->pending_lsn;
480 :
481 : /*
482 : * We're not fussed about inexact answers here, since they could
483 : * become stale instantly, so we read the PGPROC's pid without any
484 : * further locking, but normalize invalid PID values to -1.
485 : */
486 0 : *summarizer_pid = GetPGProcByNumber(summarizer_pgprocno)->pid;
487 0 : if (*summarizer_pid <= 0)
488 0 : *summarizer_pid = -1;
489 : }
490 : }
491 0 : LWLockRelease(WALSummarizerLock);
492 0 : }
493 :
494 : /*
495 : * Get the oldest LSN in this server's timeline history that has not yet been
496 : * summarized, and update shared memory state as appropriate. If summarize_wal is off, returns InvalidXLogRecPtr and leaves the output arguments untouched.
497 : *
498 : * If tli != NULL, *tli will be set to the TLI for the LSN that is returned.
499 : *
500 : * If lsn_is_exact != NULL, *lsn_is_exact will be set to true if the returned
501 : * LSN is necessarily the start of a WAL record and false if it's just the
502 : * beginning of a WAL segment.
503 : */
504 : XLogRecPtr
505 3304 : GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact)
506 : {
507 : TimeLineID latest_tli;
508 : int n;
509 : List *tles;
510 3304 : XLogRecPtr unsummarized_lsn = InvalidXLogRecPtr;
511 3304 : TimeLineID unsummarized_tli = 0;
512 3304 : bool should_make_exact = false;
513 : List *existing_summaries;
514 : ListCell *lc;
515 3304 : bool am_wal_summarizer = AmWalSummarizerProcess();
516 :
517 : /* If not summarizing WAL, do nothing. */
518 3304 : if (!summarize_wal)
519 3298 : return InvalidXLogRecPtr;
520 :
521 : /*
522 : * If we are not the WAL summarizer process, then we normally just want to
523 : * read the values from shared memory. However, as an exception, if shared
524 : * memory hasn't been initialized yet, then we need to do that so that we
525 : * can read legal values and not remove any WAL too early.
526 : */
527 6 : if (!am_wal_summarizer)
528 : {
529 4 : LWLockAcquire(WALSummarizerLock, LW_SHARED);
530 :
531 4 : if (WalSummarizerCtl->initialized)
532 : {
533 4 : unsummarized_lsn = WalSummarizerCtl->summarized_lsn;
534 4 : if (tli != NULL)
535 0 : *tli = WalSummarizerCtl->summarized_tli;
536 4 : if (lsn_is_exact != NULL)
537 0 : *lsn_is_exact = WalSummarizerCtl->lsn_is_exact;
538 4 : LWLockRelease(WALSummarizerLock);
539 4 : return unsummarized_lsn;
540 : }
541 :
542 0 : LWLockRelease(WALSummarizerLock);
543 : }
544 :
545 : /*
546 : * Find the oldest timeline on which WAL still exists, and the earliest
547 : * segment for which it exists.
548 : *
549 : * Note that we do this every time the WAL summarizer process restarts or
550 : * recovers from an error, in case the contents of pg_wal have changed
551 : * under us e.g. if some files were removed, either manually - which
552 : * shouldn't really happen, but might - or by postgres itself, if
553 : * summarize_wal was turned off and then back on again.
554 : */
555 2 : (void) GetLatestLSN(&latest_tli);
556 2 : tles = readTimeLineHistory(latest_tli);
557 2 : for (n = list_length(tles) - 1; n >= 0; --n)
558 : {
559 2 : TimeLineHistoryEntry *tle = list_nth(tles, n);
560 : XLogSegNo oldest_segno;
561 :
562 2 : oldest_segno = XLogGetOldestSegno(tle->tli);
563 2 : if (oldest_segno != 0)
564 : {
565 : /* Compute oldest LSN that still exists on disk. */
566 2 : XLogSegNoOffsetToRecPtr(oldest_segno, 0, wal_segment_size,
567 : unsummarized_lsn);
568 :
569 2 : unsummarized_tli = tle->tli;
570 2 : break;
571 : }
572 : }
573 :
574 : /*
575 : * Don't try to summarize anything older than the end LSN of the newest
576 : * summary file that exists for this timeline.
577 : */
578 : existing_summaries =
579 2 : GetWalSummaries(unsummarized_tli,
580 : InvalidXLogRecPtr, InvalidXLogRecPtr);
581 2 : foreach(lc, existing_summaries)
582 : {
583 0 : WalSummaryFile *ws = lfirst(lc);
584 :
585 0 : if (ws->end_lsn > unsummarized_lsn)
586 : {
587 0 : unsummarized_lsn = ws->end_lsn;
588 0 : should_make_exact = true;
589 : }
590 : }
591 :
592 : /* It really should not be possible for us to find no WAL. (Note this is checked only after the summary lookup above.) */
593 2 : if (unsummarized_tli == 0)
594 0 : ereport(ERROR,
595 : errcode(ERRCODE_INTERNAL_ERROR),
596 : errmsg_internal("no WAL found on timeline %u", latest_tli));
597 :
598 : /*
599 : * If we're the WAL summarizer, we always want to store the values we just
600 : * computed into shared memory, because those are the values we're going
601 : * to use to drive our operation, and so they are the authoritative
602 : * values. Otherwise, we only store values into shared memory if shared
603 : * memory is uninitialized. Our values are not canonical in such a case,
604 : * but it's better to have something than nothing, to guide WAL retention.
605 : */
606 2 : LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
607 2 : if (am_wal_summarizer || !WalSummarizerCtl->initialized)
608 : {
609 2 : WalSummarizerCtl->initialized = true;
610 2 : WalSummarizerCtl->summarized_lsn = unsummarized_lsn;
611 2 : WalSummarizerCtl->summarized_tli = unsummarized_tli;
612 2 : WalSummarizerCtl->lsn_is_exact = should_make_exact;
613 2 : WalSummarizerCtl->pending_lsn = unsummarized_lsn;
614 : }
615 : else
616 0 : unsummarized_lsn = WalSummarizerCtl->summarized_lsn;
617 :
618 : /* Also return the shared-memory values to the caller as required. */
619 2 : if (tli != NULL)
620 2 : *tli = WalSummarizerCtl->summarized_tli;
621 2 : if (lsn_is_exact != NULL)
622 2 : *lsn_is_exact = WalSummarizerCtl->lsn_is_exact;
623 2 : LWLockRelease(WALSummarizerLock);
624 :
625 2 : return unsummarized_lsn;
626 : }
627 :
628 : /*
629 : * Wake up the WAL summarizer process by setting its latch, if it has advertised itself.
630 : *
631 : * This might not work, because there's no guarantee that the WAL summarizer
632 : * process was successfully started, and it also might have started but
633 : * subsequently terminated. So, under normal circumstances, this will get the
634 : * latch set, but there's no guarantee.
635 : */
636 : void
637 2112 : WakeupWalSummarizer(void)
638 : {
639 : ProcNumber pgprocno;
640 :
641 2112 : if (WalSummarizerCtl == NULL) /* shared memory pointer not set up in this process */
642 0 : return;
643 :
644 2112 : LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); /* NOTE(review): this section only reads; LW_SHARED may suffice -- confirm */
645 2112 : pgprocno = WalSummarizerCtl->summarizer_pgprocno;
646 2112 : LWLockRelease(WALSummarizerLock);
647 :
648 2112 : if (pgprocno != INVALID_PROC_NUMBER)
649 2 : SetLatch(&ProcGlobal->allProcs[pgprocno].procLatch);
650 : }
651 :
652 : /*
653 : * Wait until WAL summarization reaches the given LSN, but time out with an
654 : * error if the summarizer seems to be stuck.
655 : *
656 : * Returns immediately if summarize_wal is turned off while we wait. Caller
657 : * is expected to handle this case, if necessary.
658 : */
659 : void
660 22 : WaitForWalSummarization(XLogRecPtr lsn)
661 : {
662 : TimestampTz initial_time,
663 : cycle_time,
664 : current_time;
665 22 : XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr;
666 22 : int deadcycles = 0;
667 :
668 22 : initial_time = cycle_time = GetCurrentTimestamp();
669 :
670 : while (1)
671 8 : {
672 30 : long timeout_in_ms = 10000;
673 : XLogRecPtr summarized_lsn;
674 : XLogRecPtr pending_lsn;
675 :
676 30 : CHECK_FOR_INTERRUPTS();
677 :
678 : /* If WAL summarization is disabled while we're waiting, give up. */
679 30 : if (!summarize_wal)
680 0 : return;
681 :
682 : /*
683 : * Snapshot the on-disk and in-memory progress from shared memory.
684 : */
685 30 : LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
686 30 : summarized_lsn = WalSummarizerCtl->summarized_lsn;
687 30 : pending_lsn = WalSummarizerCtl->pending_lsn;
688 30 : LWLockRelease(WALSummarizerLock);
689 :
690 : /* If the LSN summarized on disk has reached the target, stop waiting. */
691 30 : if (summarized_lsn >= lsn)
692 22 : break;
693 :
694 : /* Recheck current time. */
695 8 : current_time = GetCurrentTimestamp();
696 :
697 : /* Have we finished the current cycle of waiting? */
698 8 : if (TimestampDifferenceMilliseconds(cycle_time,
699 : current_time) >= timeout_in_ms)
700 : {
701 : long elapsed_seconds;
702 :
703 : /* Begin new wait cycle. */
704 0 : cycle_time = TimestampTzPlusMilliseconds(cycle_time,
705 : timeout_in_ms);
706 :
707 : /*
708 : * Keep track of the number of cycles during which there has been
709 : * no progression of pending_lsn. If pending_lsn is not advancing,
710 : * that means that not only are no new files appearing on disk,
711 : * but we're not even incorporating new records into the in-memory
712 : * state.
713 : */
714 0 : if (pending_lsn > prior_pending_lsn)
715 : {
716 0 : prior_pending_lsn = pending_lsn;
717 0 : deadcycles = 0;
718 : }
719 : else
720 0 : ++deadcycles;
721 :
722 : /*
723 : * If we've managed to wait for an entire minute (six ten-second
724 : * cycles) without the WAL summarizer absorbing a single WAL
725 : * record, error out; probably something is wrong.
726 : *
727 : * We could consider also erroring out if the summarizer is taking
728 : * too long to catch up, but it's not clear what rate of progress
729 : * would be acceptable and what would be too slow. So instead, we
730 : * just try to error out in the case where there's no progress at
731 : * all. That seems likely to catch a reasonable number of the
732 : * things that can go wrong in practice (e.g. the summarizer
733 : * process is completely hung, say because somebody hooked up a
734 : * debugger to it or something) without giving up too quickly when
735 : * the system is just slow.
736 : */
737 0 : if (deadcycles >= 6)
738 0 : ereport(ERROR,
739 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
740 : errmsg("WAL summarization is not progressing"),
741 : errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
742 : LSN_FORMAT_ARGS(lsn),
743 : LSN_FORMAT_ARGS(summarized_lsn),
744 : LSN_FORMAT_ARGS(pending_lsn))));
745 :
746 :
747 : /*
748 : * Otherwise, just let the user know what's happening.
749 : */
750 0 : elapsed_seconds =
751 0 : TimestampDifferenceMilliseconds(initial_time,
752 : current_time) / 1000;
753 0 : ereport(WARNING,
754 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
755 : errmsg_plural("still waiting for WAL summarization through %X/%X after %ld second",
756 : "still waiting for WAL summarization through %X/%X after %ld seconds",
757 : elapsed_seconds,
758 : LSN_FORMAT_ARGS(lsn),
759 : elapsed_seconds),
760 : errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
761 : LSN_FORMAT_ARGS(summarized_lsn),
762 : LSN_FORMAT_ARGS(pending_lsn))));
763 : }
764 :
765 : /*
766 : * Align the wait time to prevent drift, so that the warnings say 10
767 : * seconds, 20 seconds, 30 seconds ... and never a non-multiple of ten.
768 : * NOTE(review): cycle_time advances by at most one cycle per pass, so
769 : * a long stall could make this non-positive -- confirm that is OK.
770 : */
771 8 : timeout_in_ms -=
772 8 : TimestampDifferenceMilliseconds(cycle_time, current_time);
773 :
774 : /* Wait and see. */
775 8 : ConditionVariableTimedSleep(&WalSummarizerCtl->summary_file_cv,
776 : timeout_in_ms,
777 : WAIT_EVENT_WAL_SUMMARY_READY);
778 : }
779 :
780 22 : ConditionVariableCancelSleep();
781 : }
782 :
783 : /*
784 : * On exit (registered via on_shmem_exit in WalSummarizerMain), update shared
785 : * memory to make it clear that we're no longer running. Arguments are unused.
786 : */
787 : static void
788 2 : WalSummarizerShutdown(int code, Datum arg)
789 : {
790 2 : LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
791 2 : WalSummarizerCtl->summarizer_pgprocno = INVALID_PROC_NUMBER;
792 2 : LWLockRelease(WALSummarizerLock);
793 2 : }
794 :
795 : /*
796 : * Get the latest LSN that is eligible to be summarized, and set *tli to the
797 : * corresponding timeline. tli must not be NULL; every path stores through it.
798 : */
799 : static XLogRecPtr
800 28 : GetLatestLSN(TimeLineID *tli)
801 : {
802 28 : if (!RecoveryInProgress())
803 : {
804 : /* Not in recovery: don't summarize WAL before it's flushed. */
805 28 : return GetFlushRecPtr(tli);
806 : }
807 : else
808 : {
809 : XLogRecPtr flush_lsn;
810 : TimeLineID flush_tli;
811 : XLogRecPtr replay_lsn;
812 : TimeLineID replay_tli;
813 : TimeLineID insert_tli;
814 :
815 : /*
816 : * After the insert TLI has been set and before the control file has
817 : * been updated to show the DB in production, RecoveryInProgress()
818 : * will return true, because it's not yet safe for all backends to
819 : * begin writing WAL. However, replay has already ceased, so from our
820 : * point of view, recovery is already over. We should summarize up to
821 : * where replay stopped and then prepare to resume at the start of the
822 : * insert timeline.
823 : */
824 0 : if ((insert_tli = GetWALInsertionTimeLineIfSet()) != 0)
825 : {
826 0 : *tli = insert_tli;
827 0 : return GetXLogReplayRecPtr(NULL);
828 : }
829 :
830 : /*
831 : * What we really want to know is how much WAL has been flushed to
832 : * disk, but the only flush position available is the one provided by
833 : * the walreceiver, which may not be running, because this could be
834 : * crash recovery or recovery via restore_command. So use either the
835 : * WAL receiver's flush position or the replay position, whichever is
836 : * further ahead, on the theory that if the WAL has been replayed then
837 : * it must also have been flushed to disk. Ties go to the replay position.
838 : */
839 0 : flush_lsn = GetWalRcvFlushRecPtr(NULL, &flush_tli);
840 0 : replay_lsn = GetXLogReplayRecPtr(&replay_tli);
841 0 : if (flush_lsn > replay_lsn)
842 : {
843 0 : *tli = flush_tli;
844 0 : return flush_lsn;
845 : }
846 : else
847 : {
848 0 : *tli = replay_tli;
849 0 : return replay_lsn;
850 : }
851 : }
852 : }
853 :
854 : /*
855 : * Interrupt handler for main loop of WAL summarizer process: barriers, config reload, shutdown, memory-context logging.
856 : */
857 : static void
858 50312 : HandleWalSummarizerInterrupts(void)
859 : {
860 50312 : if (ProcSignalBarrierPending)
861 0 : ProcessProcSignalBarrier();
862 :
863 50312 : if (ConfigReloadPending)
864 : {
865 0 : ConfigReloadPending = false;
866 0 : ProcessConfigFile(PGC_SIGHUP);
867 : }
868 :
869 50312 : if (ShutdownRequestPending || !summarize_wal) /* also exit once summarize_wal is off, e.g. after the reload above */
870 : {
871 2 : ereport(DEBUG1,
872 : errmsg_internal("WAL summarizer shutting down"));
873 2 : proc_exit(0);
874 : }
875 :
876 : /* Perform logging of memory contexts of this process */
877 50310 : if (LogMemoryContextPending)
878 0 : ProcessLogMemoryContextInterrupt();
879 50310 : }
880 :
881 : /*
882 : * Summarize a range of WAL records on a single timeline.
883 : *
884 : * 'tli' is the timeline to be summarized.
885 : *
886 : * 'start_lsn' is the point at which we should start summarizing. If this
887 : * value comes from the end LSN of the previous record as returned by the
888 : * xlogreader machinery, 'exact' should be true; otherwise, 'exact' should
889 : * be false, and this function will search forward for the start of a valid
890 : * WAL record.
891 : *
892 : * 'switch_lsn' is the point at which we should switch to a later timeline,
893 : * if we're summarizing a historic timeline.
894 : *
895 : * 'maximum_lsn' identifies the point beyond which we can't count on being
896 : * able to read any more WAL. It should be the switch point when reading a
897 : * historic timeline, or the most-recently-measured end of WAL when reading
898 : * the current timeline.
899 : *
900 : * The return value is the LSN at which the WAL summary actually ends. Most
901 : * often, a summary file ends because we notice that a checkpoint has
902 : * occurred and reach the redo pointer of that checkpoint, but sometimes
903 : * we stop for other reasons, such as a timeline switch.
904 : */
905 : static XLogRecPtr
906 18 : SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
907 : XLogRecPtr switch_lsn, XLogRecPtr maximum_lsn)
908 : {
909 : SummarizerReadLocalXLogPrivate *private_data;
910 : XLogReaderState *xlogreader;
911 : XLogRecPtr summary_start_lsn;
912 18 : XLogRecPtr summary_end_lsn = switch_lsn;
913 : char temp_path[MAXPGPATH];
914 : char final_path[MAXPGPATH];
915 : WalSummaryIO io;
916 18 : BlockRefTable *brtab = CreateEmptyBlockRefTable();
917 18 : bool fast_forward = true;
918 :
919 : /* Initialize private data for xlogreader. */
920 : private_data = (SummarizerReadLocalXLogPrivate *)
921 18 : palloc0(sizeof(SummarizerReadLocalXLogPrivate));
922 18 : private_data->tli = tli;
923 18 : private_data->historic = !XLogRecPtrIsInvalid(switch_lsn);
924 18 : private_data->read_upto = maximum_lsn;
925 :
926 : /* Create xlogreader. */
927 18 : xlogreader = XLogReaderAllocate(wal_segment_size, NULL,
928 18 : XL_ROUTINE(.page_read = &summarizer_read_local_xlog_page,
929 : .segment_open = &wal_segment_open,
930 : .segment_close = &wal_segment_close),
931 : private_data);
932 18 : if (xlogreader == NULL)
933 0 : ereport(ERROR,
934 : (errcode(ERRCODE_OUT_OF_MEMORY),
935 : errmsg("out of memory"),
936 : errdetail("Failed while allocating a WAL reading processor.")));
937 :
938 : /*
939 : * When exact = false, we're starting from an arbitrary point in the WAL
940 : * and must search forward for the start of the next record.
941 : *
942 : * When exact = true, start_lsn should be either the LSN where a record
943 : * begins, or the LSN of a page where the page header is immediately
944 : * followed by the start of a new record. XLogBeginRead should tolerate
945 : * either case.
946 : *
947 : * We need to allow for both cases because the behavior of xlogreader
948 : * varies. When a record spans two or more xlog pages, the ending LSN
949 : * reported by xlogreader will be the starting LSN of the following
950 : * record, but when an xlog page boundary falls between two records, the
951 : * end LSN for the first will be reported as the first byte of the
952 : * following page. We can't know until we read that page how large the
953 : * header will be, but we'll have to skip over it to find the next record.
954 : */
955 18 : if (exact)
956 : {
957 : /*
958 : * Even if start_lsn is the beginning of a page rather than the
959 : * beginning of the first record on that page, we should still use it
960 : * as the start LSN for the summary file. That's because we detect
961 : * missing summary files by looking for cases where the end LSN of one
962 : * file is less than the start LSN of the next file. When only a page
963 : * header is skipped, nothing has been missed.
964 : */
965 16 : XLogBeginRead(xlogreader, start_lsn);
966 16 : summary_start_lsn = start_lsn;
967 : }
968 : else
969 : {
970 2 : summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn);
971 2 : if (XLogRecPtrIsInvalid(summary_start_lsn))
972 : {
973 : /*
974 : * If we hit end-of-WAL while trying to find the next valid
975 : * record, we must be on a historic timeline that has no valid
976 : * records that begin after start_lsn and before end of WAL.
977 : */
978 0 : if (private_data->end_of_wal)
979 : {
980 0 : ereport(DEBUG1,
981 : errmsg_internal("could not read WAL from timeline %u at %X/%X: end of WAL at %X/%X",
982 : tli,
983 : LSN_FORMAT_ARGS(start_lsn),
984 : LSN_FORMAT_ARGS(private_data->read_upto)));
985 :
986 : /*
987 : * The timeline ends at or after start_lsn, without containing
988 : * any records. Thus, we must make sure the main loop does not
989 : * iterate. If start_lsn is the end of the timeline, then we
990 : * won't actually emit an empty summary file, but otherwise,
991 : * we must, to capture the fact that the LSN range in question
992 : * contains no interesting WAL records.
993 : */
994 0 : summary_start_lsn = start_lsn;
995 0 : summary_end_lsn = private_data->read_upto;
996 0 : switch_lsn = xlogreader->EndRecPtr;
997 : }
998 : else
999 0 : ereport(ERROR,
1000 : (errmsg("could not find a valid record after %X/%X",
1001 : LSN_FORMAT_ARGS(start_lsn))));
1002 : }
1003 :
1004 : /* We shouldn't go backward. */
1005 : Assert(summary_start_lsn >= start_lsn);
1006 : }
1007 :
1008 : /*
1009 : * Main loop: read xlog records one by one.
1010 : */
1011 : while (1)
1012 48316 : {
1013 : int block_id;
1014 : char *errormsg;
1015 : XLogRecord *record;
1016 : uint8 rmid;
1017 :
1018 48334 : HandleWalSummarizerInterrupts();
1019 :
1020 : /* We shouldn't go backward. */
1021 : Assert(summary_start_lsn <= xlogreader->EndRecPtr);
1022 :
1023 : /* Now read the next record. */
1024 48332 : record = XLogReadRecord(xlogreader, &errormsg);
1025 48332 : if (record == NULL)
1026 : {
1027 0 : if (private_data->end_of_wal)
1028 : {
1029 : /*
1030 : * This timeline must be historic and must end before we were
1031 : * able to read a complete record.
1032 : */
1033 0 : ereport(DEBUG1,
1034 : errmsg_internal("could not read WAL from timeline %u at %X/%X: end of WAL at %X/%X",
1035 : tli,
1036 : LSN_FORMAT_ARGS(xlogreader->EndRecPtr),
1037 : LSN_FORMAT_ARGS(private_data->read_upto)));
1038 : /* Summary ends at end of WAL. */
1039 0 : summary_end_lsn = private_data->read_upto;
1040 0 : break;
1041 : }
1042 0 : if (errormsg)
1043 0 : ereport(ERROR,
1044 : (errcode_for_file_access(),
1045 : errmsg("could not read WAL from timeline %u at %X/%X: %s",
1046 : tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr),
1047 : errormsg)));
1048 : else
1049 0 : ereport(ERROR,
1050 : (errcode_for_file_access(),
1051 : errmsg("could not read WAL from timeline %u at %X/%X",
1052 : tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr))));
1053 : }
1054 :
1055 : /* We shouldn't go backward. */
1056 : Assert(summary_start_lsn <= xlogreader->EndRecPtr);
1057 :
1058 48332 : if (!XLogRecPtrIsInvalid(switch_lsn) &&
1059 0 : xlogreader->ReadRecPtr >= switch_lsn)
1060 : {
1061 : /*
1062 : * Whoops! We've read a record that *starts* after the switch LSN,
1063 : * contrary to our goal of reading only until we hit the first
1064 : * record that ends at or after the switch LSN. Pretend we didn't
1065 : * read it after all by bailing out of this loop right here,
1066 : * before we do anything with this record.
1067 : *
1068 : * This can happen because the last record before the switch LSN
1069 : * might be continued across multiple pages, and then we might
1070 : * come to a page with XLP_FIRST_IS_OVERWRITE_CONTRECORD set. In
1071 : * that case, the record that was continued across multiple pages
1072 : * is incomplete and will be disregarded, and the read will
1073 : * restart from the beginning of the page that is flagged
1074 : * XLP_FIRST_IS_OVERWRITE_CONTRECORD.
1075 : *
1076 : * If this case occurs, we can fairly say that the current summary
1077 : * file ends at the switch LSN exactly. The first record on the
1078 : * page marked XLP_FIRST_IS_OVERWRITE_CONTRECORD will be
1079 : * discovered when generating the next summary file.
1080 : */
1081 0 : summary_end_lsn = switch_lsn;
1082 0 : break;
1083 : }
1084 :
1085 : /*
1086 : * Certain types of records require special handling. Redo points and
1087 : * shutdown checkpoints trigger creation of new summary files and can
1088 : * also cause us to enter or exit "fast forward" mode. Other types of
1089 : * records can require special updates to the block reference table.
1090 : */
1091 48332 : rmid = XLogRecGetRmid(xlogreader);
1092 48332 : if (rmid == RM_XLOG_ID)
1093 : {
1094 : bool new_fast_forward;
1095 :
1096 : /*
1097 : * If we've already processed some WAL records when we hit a redo
1098 : * point or shutdown checkpoint, then we stop summarization before
1099 : * including this record in the current file, so that it will be
1100 : * the first record in the next file.
1101 : *
1102 : * When we hit one of those record types as the first record in a
1103 : * file, we adjust our notion of whether we're fast-forwarding.
1104 : * Any WAL generated with wal_level=minimal must be skipped
1105 : * without actually generating any summary file, because an
1106 : * incremental backup that crosses such WAL would be unsafe.
1107 : */
1108 676 : if (SummarizeXlogRecord(xlogreader, &new_fast_forward))
1109 : {
1110 34 : if (xlogreader->ReadRecPtr > summary_start_lsn)
1111 : {
1112 16 : summary_end_lsn = xlogreader->ReadRecPtr;
1113 16 : break;
1114 : }
1115 : else
1116 18 : fast_forward = new_fast_forward;
1117 : }
1118 : }
1119 47656 : else if (!fast_forward)
1120 : {
1121 : /*
1122 : * This switch handles record types that require extra updates to
1123 : * the contents of the block reference table.
1124 : */
1125 47656 : switch (rmid)
1126 : {
1127 4 : case RM_DBASE_ID:
1128 4 : SummarizeDbaseRecord(xlogreader, brtab);
1129 4 : break;
1130 30 : case RM_SMGR_ID:
1131 30 : SummarizeSmgrRecord(xlogreader, brtab);
1132 30 : break;
1133 1484 : case RM_XACT_ID:
1134 1484 : SummarizeXactRecord(xlogreader, brtab);
1135 1484 : break;
1136 : }
1137 0 : }
1138 :
1139 : /*
1140 : * If we're in fast-forward mode, we don't really need to do anything.
1141 : * Otherwise, feed block references from xlog record to block
1142 : * reference table.
1143 : */
1144 48316 : if (!fast_forward)
1145 : {
1146 96242 : for (block_id = 0; block_id <= XLogRecMaxBlockId(xlogreader);
1147 47926 : block_id++)
1148 : {
1149 : RelFileLocator rlocator;
1150 : ForkNumber forknum;
1151 : BlockNumber blocknum;
1152 :
1153 47926 : if (!XLogRecGetBlockTagExtended(xlogreader, block_id, &rlocator,
1154 : &forknum, &blocknum, NULL))
1155 40 : continue;
1156 :
1157 : /*
1158 : * As we do elsewhere, ignore the FSM fork, because it's not
1159 : * fully WAL-logged.
1160 : */
1161 47886 : if (forknum != FSM_FORKNUM)
1162 47602 : BlockRefTableMarkBlockModified(brtab, &rlocator, forknum,
1163 : blocknum);
1164 : }
1165 : }
1166 :
1167 : /* Update our notion of where this summary file ends. */
1168 48316 : summary_end_lsn = xlogreader->EndRecPtr;
1169 :
1170 : /* Also update shared memory. */
1171 48316 : LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
1172 : Assert(summary_end_lsn >= WalSummarizerCtl->summarized_lsn);
1173 48316 : WalSummarizerCtl->pending_lsn = summary_end_lsn;
1174 48316 : LWLockRelease(WALSummarizerLock);
1175 :
1176 : /*
1177 : * If we have a switch LSN and have reached it, stop before reading
1178 : * the next record.
1179 : */
1180 48316 : if (!XLogRecPtrIsInvalid(switch_lsn) &&
1181 0 : xlogreader->EndRecPtr >= switch_lsn)
1182 0 : break;
1183 : }
1184 :
1185 : /* Destroy xlogreader. */
1186 16 : pfree(xlogreader->private_data);
1187 16 : XLogReaderFree(xlogreader);
1188 :
1189 : /*
1190 : * If a timeline switch occurs, we may fail to make any progress at all
1191 : * before exiting the loop above. If that happens, we don't write a WAL
1192 : * summary file at all. We can also skip writing a file if we're in
1193 : * fast-forward mode.
1194 : */
1195 16 : if (summary_end_lsn > summary_start_lsn && !fast_forward)
1196 : {
1197 : /* Generate temporary and final path name. */
1198 16 : snprintf(temp_path, MAXPGPATH,
1199 : XLOGDIR "/summaries/temp.summary");
1200 16 : snprintf(final_path, MAXPGPATH,
1201 : XLOGDIR "/summaries/%08X%08X%08X%08X%08X.summary",
1202 : tli,
1203 16 : LSN_FORMAT_ARGS(summary_start_lsn),
1204 16 : LSN_FORMAT_ARGS(summary_end_lsn));
1205 :
1206 : /* Open the temporary file for writing. */
1207 16 : io.filepos = 0;
1208 16 : io.file = PathNameOpenFile(temp_path, O_WRONLY | O_CREAT | O_TRUNC);
1209 16 : if (io.file < 0)
1210 0 : ereport(ERROR,
1211 : (errcode_for_file_access(),
1212 : errmsg("could not create file \"%s\": %m", temp_path)));
1213 :
1214 : /* Write the data. */
1215 16 : WriteBlockRefTable(brtab, WriteWalSummary, &io);
1216 :
1217 : /* Close temporary file and shut down xlogreader. */
1218 16 : FileClose(io.file);
1219 :
1220 : /* Tell the user what we did. */
1221 16 : ereport(DEBUG1,
1222 : errmsg_internal("summarized WAL on TLI %u from %X/%X to %X/%X",
1223 : tli,
1224 : LSN_FORMAT_ARGS(summary_start_lsn),
1225 : LSN_FORMAT_ARGS(summary_end_lsn)));
1226 :
1227 : /* Durably rename the new summary into place. */
1228 16 : durable_rename(temp_path, final_path, ERROR);
1229 : }
1230 :
1231 : /* If we skipped a non-zero amount of WAL, log a debug message. */
1232 16 : if (summary_end_lsn > summary_start_lsn && fast_forward)
1233 0 : ereport(DEBUG1,
1234 : errmsg_internal("skipped summarizing WAL on TLI %u from %X/%X to %X/%X",
1235 : tli,
1236 : LSN_FORMAT_ARGS(summary_start_lsn),
1237 : LSN_FORMAT_ARGS(summary_end_lsn)));
1238 :
1239 16 : return summary_end_lsn;
1240 : }
1241 :
1242 : /*
1243 : * Special handling for WAL records with RM_DBASE_ID.
1244 : */
1245 : static void
1246 4 : SummarizeDbaseRecord(XLogReaderState *xlogreader, BlockRefTable *brtab)
1247 : {
1248 4 : uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1249 :
1250 : /*
1251 : * We use relfilenode zero for a given database OID and tablespace OID to
1252 : * indicate that all relations with that pair of IDs have been recreated
1253 : * if they exist at all. Effectively, we're setting a limit block of 0 for
1254 : * all such relfilenodes.
1255 : *
1256 : * Technically, this special handling is only needed in the case of
1257 : * XLOG_DBASE_CREATE_FILE_COPY, because that can create a whole bunch of
1258 : * relation files in a directory without logging anything specific to each
1259 : * one. If we didn't mark the whole DB OID/TS OID combination in some way,
1260 : * then a tablespace that was dropped after the reference backup and
1261 : * recreated using the FILE_COPY method prior to the incremental backup
1262 : * would look just like one that was never touched at all, which would be
1263 : * catastrophic.
1264 : *
1265 : * But it seems best to adopt this treatment for all records that drop or
1266 : * create a DB OID/TS OID combination. That's similar to how we treat the
1267 : * limit block for individual relations, and it's an extra layer of safety
1268 : * here. We can never lose data by marking more stuff as needing to be
1269 : * backed up in full.
1270 : */
1271 4 : if (info == XLOG_DBASE_CREATE_FILE_COPY)
1272 : {
1273 : xl_dbase_create_file_copy_rec *xlrec;
1274 : RelFileLocator rlocator;
1275 :
1276 4 : xlrec =
1277 4 : (xl_dbase_create_file_copy_rec *) XLogRecGetData(xlogreader);
1278 4 : rlocator.spcOid = xlrec->tablespace_id;
1279 4 : rlocator.dbOid = xlrec->db_id;
1280 4 : rlocator.relNumber = 0;
1281 4 : BlockRefTableSetLimitBlock(brtab, &rlocator, MAIN_FORKNUM, 0);
1282 : }
1283 0 : else if (info == XLOG_DBASE_CREATE_WAL_LOG)
1284 : {
1285 : xl_dbase_create_wal_log_rec *xlrec;
1286 : RelFileLocator rlocator;
1287 :
1288 0 : xlrec = (xl_dbase_create_wal_log_rec *) XLogRecGetData(xlogreader);
1289 0 : rlocator.spcOid = xlrec->tablespace_id;
1290 0 : rlocator.dbOid = xlrec->db_id;
1291 0 : rlocator.relNumber = 0;
1292 0 : BlockRefTableSetLimitBlock(brtab, &rlocator, MAIN_FORKNUM, 0);
1293 : }
1294 0 : else if (info == XLOG_DBASE_DROP)
1295 : {
1296 : xl_dbase_drop_rec *xlrec;
1297 : RelFileLocator rlocator;
1298 : int i;
1299 :
1300 0 : xlrec = (xl_dbase_drop_rec *) XLogRecGetData(xlogreader);
1301 0 : rlocator.dbOid = xlrec->db_id;
1302 0 : rlocator.relNumber = 0;
1303 0 : for (i = 0; i < xlrec->ntablespaces; ++i)
1304 : {
1305 0 : rlocator.spcOid = xlrec->tablespace_ids[i];
1306 0 : BlockRefTableSetLimitBlock(brtab, &rlocator, MAIN_FORKNUM, 0);
1307 : }
1308 : }
1309 4 : }
1310 :
1311 : /*
1312 : * Special handling for WAL records with RM_SMGR_ID.
1313 : */
1314 : static void
1315 30 : SummarizeSmgrRecord(XLogReaderState *xlogreader, BlockRefTable *brtab)
1316 : {
1317 30 : uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1318 :
1319 30 : if (info == XLOG_SMGR_CREATE)
1320 : {
1321 : xl_smgr_create *xlrec;
1322 :
1323 : /*
1324 : * If a new relation fork is created on disk, there is no point
1325 : * tracking anything about which blocks have been modified, because
1326 : * the whole thing will be new. Hence, set the limit block for this
1327 : * fork to 0.
1328 : *
1329 : * Ignore the FSM fork, which is not fully WAL-logged.
1330 : */
1331 30 : xlrec = (xl_smgr_create *) XLogRecGetData(xlogreader);
1332 :
1333 30 : if (xlrec->forkNum != FSM_FORKNUM)
1334 30 : BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator,
1335 : xlrec->forkNum, 0);
1336 : }
1337 0 : else if (info == XLOG_SMGR_TRUNCATE)
1338 : {
1339 : xl_smgr_truncate *xlrec;
1340 :
1341 0 : xlrec = (xl_smgr_truncate *) XLogRecGetData(xlogreader);
1342 :
1343 : /*
1344 : * If a relation fork is truncated on disk, there is no point in
1345 : * tracking anything about block modifications beyond the truncation
1346 : * point.
1347 : *
1348 : * We ignore SMGR_TRUNCATE_FSM here because the FSM isn't fully
1349 : * WAL-logged and thus we can't track modified blocks for it anyway.
1350 : */
1351 0 : if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
1352 0 : BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator,
1353 : MAIN_FORKNUM, xlrec->blkno);
1354 0 : if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0)
1355 0 : BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator,
1356 : VISIBILITYMAP_FORKNUM, xlrec->blkno);
1357 : }
1358 30 : }
1359 :
1360 : /*
1361 : * Special handling for WAL records with RM_XACT_ID.
1362 : */
1363 : static void
1364 1484 : SummarizeXactRecord(XLogReaderState *xlogreader, BlockRefTable *brtab)
1365 : {
1366 1484 : uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1367 1484 : uint8 xact_info = info & XLOG_XACT_OPMASK;
1368 :
1369 1484 : if (xact_info == XLOG_XACT_COMMIT ||
1370 : xact_info == XLOG_XACT_COMMIT_PREPARED)
1371 1484 : {
1372 1484 : xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(xlogreader);
1373 : xl_xact_parsed_commit parsed;
1374 : int i;
1375 :
1376 : /*
1377 : * Don't track modified blocks for any relations that were removed on
1378 : * commit.
1379 : */
1380 1484 : ParseCommitRecord(XLogRecGetInfo(xlogreader), xlrec, &parsed);
1381 1484 : for (i = 0; i < parsed.nrels; ++i)
1382 : {
1383 : ForkNumber forknum;
1384 :
1385 0 : for (forknum = 0; forknum <= MAX_FORKNUM; ++forknum)
1386 0 : if (forknum != FSM_FORKNUM)
1387 0 : BlockRefTableSetLimitBlock(brtab, &parsed.xlocators[i],
1388 : forknum, 0);
1389 : }
1390 : }
1391 0 : else if (xact_info == XLOG_XACT_ABORT ||
1392 : xact_info == XLOG_XACT_ABORT_PREPARED)
1393 : {
1394 0 : xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(xlogreader);
1395 : xl_xact_parsed_abort parsed;
1396 : int i;
1397 :
1398 : /*
1399 : * Don't track modified blocks for any relations that were removed on
1400 : * abort.
1401 : */
1402 0 : ParseAbortRecord(XLogRecGetInfo(xlogreader), xlrec, &parsed);
1403 0 : for (i = 0; i < parsed.nrels; ++i)
1404 : {
1405 : ForkNumber forknum;
1406 :
1407 0 : for (forknum = 0; forknum <= MAX_FORKNUM; ++forknum)
1408 0 : if (forknum != FSM_FORKNUM)
1409 0 : BlockRefTableSetLimitBlock(brtab, &parsed.xlocators[i],
1410 : forknum, 0);
1411 : }
1412 : }
1413 1484 : }
1414 :
1415 : /*
1416 : * Special handling for WAL records with RM_XLOG_ID.
1417 : *
1418 : * The return value is true if WAL summarization should stop before this
1419 : * record and false otherwise. When the return value is true,
1420 : * *new_fast_forward indicates whether future processing should be done
1421 : * in fast forward mode (i.e. read WAL without emitting summaries) or not.
1422 : */
1423 : static bool
1424 676 : SummarizeXlogRecord(XLogReaderState *xlogreader, bool *new_fast_forward)
1425 : {
1426 676 : uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1427 : int record_wal_level;
1428 :
1429 676 : if (info == XLOG_CHECKPOINT_REDO)
1430 : {
1431 : /* Payload is wal_level at the time record was written. */
1432 20 : memcpy(&record_wal_level, XLogRecGetData(xlogreader), sizeof(int));
1433 : }
1434 656 : else if (info == XLOG_CHECKPOINT_SHUTDOWN)
1435 : {
1436 : CheckPoint rec_ckpt;
1437 :
1438 : /* Extract wal_level at time record was written from payload. */
1439 10 : memcpy(&rec_ckpt, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1440 10 : record_wal_level = rec_ckpt.wal_level;
1441 : }
1442 646 : else if (info == XLOG_PARAMETER_CHANGE)
1443 : {
1444 : xl_parameter_change xlrec;
1445 :
1446 : /* Extract wal_level at time record was written from payload. */
1447 4 : memcpy(&xlrec, XLogRecGetData(xlogreader),
1448 : sizeof(xl_parameter_change));
1449 4 : record_wal_level = xlrec.wal_level;
1450 : }
1451 642 : else if (info == XLOG_END_OF_RECOVERY)
1452 : {
1453 : xl_end_of_recovery xlrec;
1454 :
1455 : /* Extract wal_level at time record was written from payload. */
1456 0 : memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1457 0 : record_wal_level = xlrec.wal_level;
1458 : }
1459 : else
1460 : {
1461 : /* No special handling required. Return false. */
1462 642 : return false;
1463 : }
1464 :
1465 : /*
1466 : * Redo can only begin at an XLOG_CHECKPOINT_REDO or
1467 : * XLOG_CHECKPOINT_SHUTDOWN record, so we want WAL summarization to begin
1468 : * at those points. Hence, when those records are encountered, return
1469 : * true, so that we stop just before summarizing either of those records.
1470 : *
1471 : * We also reach here if we just saw XLOG_END_OF_RECOVERY or
1472 : * XLOG_PARAMETER_CHANGE. These are not places where recovery can start,
1473 : * but they're still relevant here. A new timeline can begin with
1474 : * XLOG_END_OF_RECOVERY, so we need to confirm the WAL level at that
1475 : * point; and a restart can provoke XLOG_PARAMETER_CHANGE after an
1476 : * intervening change to postgresql.conf, which might force us to stop
1477 : * summarizing.
1478 : */
1479 34 : *new_fast_forward = (record_wal_level == WAL_LEVEL_MINIMAL);
1480 34 : return true;
1481 : }
1482 :
1483 : /*
1484 : * Similar to read_local_xlog_page, but limited to read from one particular
1485 : * timeline. If the end of WAL is reached, it will wait for more if reading
1486 : * from the current timeline, or give up if reading from a historic timeline.
1487 : * In the latter case, it will also set private_data->end_of_wal = true.
1488 : *
1489 : * Caller must set private_data->tli to the TLI of interest,
1490 : * private_data->read_upto to the lowest LSN that is not known to be safe
1491 : * to read on that timeline, and private_data->historic to true if and only
1492 : * if the timeline is not the current timeline. This function will update
1493 : * private_data->read_upto and private_data->historic if more WAL appears
1494 : * on the current timeline or if the current timeline becomes historic.
1495 : */
1496 : static int
1497 1934 : summarizer_read_local_xlog_page(XLogReaderState *state,
1498 : XLogRecPtr targetPagePtr, int reqLen,
1499 : XLogRecPtr targetRecPtr, char *cur_page)
1500 : {
1501 : int count;
1502 : WALReadError errinfo;
1503 : SummarizerReadLocalXLogPrivate *private_data;
1504 :
1505 1934 : HandleWalSummarizerInterrupts();
1506 :
1507 1934 : private_data = (SummarizerReadLocalXLogPrivate *)
1508 : state->private_data;
1509 :
1510 : while (1)
1511 : {
1512 1942 : if (targetPagePtr + XLOG_BLCKSZ <= private_data->read_upto)
1513 : {
1514 : /*
1515 : * more than one block available; read only that block, have
1516 : * caller come back if they need more.
1517 : */
1518 1920 : count = XLOG_BLCKSZ;
1519 1920 : break;
1520 : }
1521 22 : else if (targetPagePtr + reqLen > private_data->read_upto)
1522 : {
1523 : /* We don't seem to have enough data. */
1524 8 : if (private_data->historic)
1525 : {
1526 : /*
1527 : * This is a historic timeline, so there will never be any
1528 : * more data than we have currently.
1529 : */
1530 0 : private_data->end_of_wal = true;
1531 0 : return -1;
1532 : }
1533 : else
1534 : {
1535 : XLogRecPtr latest_lsn;
1536 : TimeLineID latest_tli;
1537 :
1538 : /*
1539 : * This is - or at least was up until very recently - the
1540 : * current timeline, so more data might show up. Delay here
1541 : * so we don't tight-loop.
1542 : */
1543 8 : HandleWalSummarizerInterrupts();
1544 8 : summarizer_wait_for_wal();
1545 :
1546 : /* Recheck end-of-WAL. */
1547 8 : latest_lsn = GetLatestLSN(&latest_tli);
1548 8 : if (private_data->tli == latest_tli)
1549 : {
1550 : /* Still the current timeline, update max LSN. */
1551 : Assert(latest_lsn >= private_data->read_upto);
1552 8 : private_data->read_upto = latest_lsn;
1553 : }
1554 : else
1555 : {
1556 0 : List *tles = readTimeLineHistory(latest_tli);
1557 : XLogRecPtr switchpoint;
1558 :
1559 : /*
1560 : * The timeline we're scanning is no longer the latest
1561 : * one. Figure out when it ended.
1562 : */
1563 0 : private_data->historic = true;
1564 0 : switchpoint = tliSwitchPoint(private_data->tli, tles,
1565 : NULL);
1566 :
1567 : /*
1568 : * Allow reads up to exactly the switch point.
1569 : *
1570 : * It's possible that this will cause read_upto to move
1571 : * backwards, because we might have been promoted before
1572 : * reaching the end of the previous timeline. In that
1573 : * case, the next loop iteration will likely conclude that
1574 : * we've reached end of WAL.
1575 : */
1576 0 : private_data->read_upto = switchpoint;
1577 :
1578 : /* Debugging output. */
1579 0 : ereport(DEBUG1,
1580 : errmsg_internal("timeline %u became historic, can read up to %X/%X",
1581 : private_data->tli, LSN_FORMAT_ARGS(private_data->read_upto)));
1582 : }
1583 :
1584 : /* Go around and try again. */
1585 : }
1586 : }
1587 : else
1588 : {
1589 : /* enough bytes available to satisfy the request */
1590 14 : count = private_data->read_upto - targetPagePtr;
1591 14 : break;
1592 : }
1593 : }
1594 :
1595 1934 : if (!WALRead(state, cur_page, targetPagePtr, count,
1596 : private_data->tli, &errinfo))
1597 0 : WALReadRaiseError(&errinfo);
1598 :
1599 : /* Track that we read a page, for sleep time calculation. */
1600 1934 : ++pages_read_since_last_sleep;
1601 :
1602 : /* number of valid bytes in the buffer */
1603 1934 : return count;
1604 : }
1605 :
1606 : /*
1607 : * Sleep for long enough that we believe it's likely that more WAL will
1608 : * be available afterwards.
1609 : */
1610 : static void
1611 8 : summarizer_wait_for_wal(void)
1612 : {
1613 8 : if (pages_read_since_last_sleep == 0)
1614 : {
1615 : /*
1616 : * No pages were read since the last sleep, so double the sleep time,
1617 : * but not beyond the maximum allowable value.
1618 : */
1619 4 : sleep_quanta = Min(sleep_quanta * 2, MAX_SLEEP_QUANTA);
1620 : }
1621 4 : else if (pages_read_since_last_sleep > 1)
1622 : {
1623 : /*
1624 : * Multiple pages were read since the last sleep, so reduce the sleep
1625 : * time.
1626 : *
1627 : * A large burst of activity should be able to quickly reduce the
1628 : * sleep time to the minimum, but we don't want a handful of extra WAL
1629 : * records to provoke a strong reaction. We choose to reduce the sleep
1630 : * time by 1 quantum for each page read beyond the first, which is a
1631 : * fairly arbitrary way of trying to be reactive without overreacting.
1632 : */
1633 4 : if (pages_read_since_last_sleep > sleep_quanta - 1)
1634 4 : sleep_quanta = 1;
1635 : else
1636 0 : sleep_quanta -= pages_read_since_last_sleep;
1637 : }
1638 :
1639 : /* OK, now sleep. */
1640 8 : (void) WaitLatch(MyLatch,
1641 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
1642 : sleep_quanta * MS_PER_SLEEP_QUANTUM,
1643 : WAIT_EVENT_WAL_SUMMARIZER_WAL);
1644 8 : ResetLatch(MyLatch);
1645 :
1646 : /* Reset count of pages read. */
1647 8 : pages_read_since_last_sleep = 0;
1648 8 : }
1649 :
1650 : /*
1651 : * Remove WAL summaries whose mtimes are older than wal_summary_keep_time.
1652 : */
1653 : static void
1654 18 : MaybeRemoveOldWalSummaries(void)
1655 : {
1656 18 : XLogRecPtr redo_pointer = GetRedoRecPtr();
1657 : List *wslist;
1658 : time_t cutoff_time;
1659 :
1660 : /* If WAL summary removal is disabled, don't do anything. */
1661 18 : if (wal_summary_keep_time == 0)
1662 0 : return;
1663 :
1664 : /*
1665 : * If the redo pointer has not advanced, don't do anything.
1666 : *
1667 : * This has the effect that we only try to remove old WAL summary files
1668 : * once per checkpoint cycle.
1669 : */
1670 18 : if (redo_pointer == redo_pointer_at_last_summary_removal)
1671 14 : return;
1672 4 : redo_pointer_at_last_summary_removal = redo_pointer;
1673 :
1674 : /*
1675 : * Files should only be removed if the last modification time precedes the
1676 : * cutoff time we compute here.
1677 : */
1678 4 : cutoff_time = time(NULL) - wal_summary_keep_time * SECS_PER_MINUTE;
1679 :
1680 : /* Get all the summaries that currently exist. */
1681 4 : wslist = GetWalSummaries(0, InvalidXLogRecPtr, InvalidXLogRecPtr);
1682 :
1683 : /* Loop until all summaries have been considered for removal. */
1684 6 : while (wslist != NIL)
1685 : {
1686 : ListCell *lc;
1687 : XLogSegNo oldest_segno;
1688 2 : XLogRecPtr oldest_lsn = InvalidXLogRecPtr;
1689 : TimeLineID selected_tli;
1690 :
1691 2 : HandleWalSummarizerInterrupts();
1692 :
1693 : /*
1694 : * Pick a timeline for which some summary files still exist on disk,
1695 : * and find the oldest LSN that still exists on disk for that
1696 : * timeline.
1697 : */
1698 2 : selected_tli = ((WalSummaryFile *) linitial(wslist))->tli;
1699 2 : oldest_segno = XLogGetOldestSegno(selected_tli);
1700 2 : if (oldest_segno != 0)
1701 2 : XLogSegNoOffsetToRecPtr(oldest_segno, 0, wal_segment_size,
1702 : oldest_lsn);
1703 :
1704 :
1705 : /* Consider each WAL file on the selected timeline in turn. */
1706 18 : foreach(lc, wslist)
1707 : {
1708 16 : WalSummaryFile *ws = lfirst(lc);
1709 :
1710 16 : HandleWalSummarizerInterrupts();
1711 :
1712 : /* If it's not on this timeline, it's not time to consider it. */
1713 16 : if (selected_tli != ws->tli)
1714 0 : continue;
1715 :
1716 : /*
1717 : * If the WAL doesn't exist any more, we can remove it if the file
1718 : * modification time is old enough.
1719 : */
1720 16 : if (XLogRecPtrIsInvalid(oldest_lsn) || ws->end_lsn <= oldest_lsn)
1721 0 : RemoveWalSummaryIfOlderThan(ws, cutoff_time);
1722 :
1723 : /*
1724 : * Whether we removed the file or not, we need not consider it
1725 : * again.
1726 : */
1727 16 : wslist = foreach_delete_current(wslist, lc);
1728 16 : pfree(ws);
1729 : }
1730 : }
1731 : }
|