Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * basebackup_incremental.c
4 : * code for incremental backup support
5 : *
6 : * This code isn't actually in charge of taking an incremental backup;
7 : * the actual construction of the incremental backup happens in
8 : * basebackup.c. Here, we're concerned with providing the necessary
9 : * supports for that operation. In particular, we need to parse the
10 : * backup manifest supplied by the user taking the incremental backup
11 : * and extract the required information from it.
12 : *
13 : * Portions Copyright (c) 2010-2024, PostgreSQL Global Development Group
14 : *
15 : * IDENTIFICATION
16 : * src/backend/backup/basebackup_incremental.c
17 : *
18 : *-------------------------------------------------------------------------
19 : */
20 : #include "postgres.h"
21 :
22 : #include "access/timeline.h"
23 : #include "access/xlog.h"
24 : #include "backup/basebackup_incremental.h"
25 : #include "backup/walsummary.h"
26 : #include "common/blkreftable.h"
27 : #include "common/hashfn.h"
28 : #include "common/int.h"
29 : #include "common/parse_manifest.h"
30 : #include "datatype/timestamp.h"
31 : #include "postmaster/walsummarizer.h"
32 : #include "utils/timestamp.h"
33 :
34 : #define BLOCKS_PER_READ 512
35 :
36 : /*
37 : * We expect to find the last lines of the manifest, including the checksum,
38 : * in the last MIN_CHUNK bytes of the manifest. We trigger an incremental
39 : * parse step if we are about to overflow MAX_CHUNK bytes.
40 : */
41 : #define MIN_CHUNK 1024
42 : #define MAX_CHUNK (128 * 1024)
43 :
44 : /*
45 : * Details extracted from the WAL ranges present in the supplied backup manifest.
46 : */
47 : typedef struct
48 : {
49 : TimeLineID tli;
50 : XLogRecPtr start_lsn;
51 : XLogRecPtr end_lsn;
52 : } backup_wal_range;
53 :
54 : /*
55 : * Details extracted from the file list present in the supplied backup manifest.
56 : */
57 : typedef struct
58 : {
59 : uint32 status;
60 : const char *path;
61 : size_t size;
62 : } backup_file_entry;
63 :
64 : static uint32 hash_string_pointer(const char *s);
65 : #define SH_PREFIX backup_file
66 : #define SH_ELEMENT_TYPE backup_file_entry
67 : #define SH_KEY_TYPE const char *
68 : #define SH_KEY path
69 : #define SH_HASH_KEY(tb, key) hash_string_pointer(key)
70 : #define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0)
71 : #define SH_SCOPE static inline
72 : #define SH_DECLARE
73 : #define SH_DEFINE
74 : #include "lib/simplehash.h"
75 :
76 : struct IncrementalBackupInfo
77 : {
78 : /* Memory context for this object and its subsidiary objects. */
79 : MemoryContext mcxt;
80 :
81 : /* Temporary buffer for storing the manifest while parsing it. */
82 : StringInfoData buf;
83 :
84 : /* WAL ranges extracted from the backup manifest. */
85 : List *manifest_wal_ranges;
86 :
87 : /*
88 : * Files extracted from the backup manifest.
89 : *
90 : * We don't really need this information, because we use WAL summaries to
91 : * figure out what's changed. It would be unsafe to just rely on the list
92 : * of files that existed before, because it's possible for a file to be
93 : * removed and a new one created with the same name and different
94 : * contents. In such cases, the whole file must still be sent. We can tell
95 : * from the WAL summaries whether that happened, but not from the file
96 : * list.
97 : *
98 : * Nonetheless, this data is useful for sanity checking. If a file that we
99 : * think we shouldn't need to send is not present in the manifest for the
100 : * prior backup, something has gone terribly wrong. We retain the file
101 : * names and sizes, but not the checksums or last modified times, for
102 : * which we have no use.
103 : *
104 : * One significant downside of storing this data is that it consumes
105 : * memory. If that turns out to be a problem, we might have to decide not
106 : * to retain this information, or to make it optional.
107 : */
108 : backup_file_hash *manifest_files;
109 :
110 : /*
111 : * Block-reference table for the incremental backup.
112 : *
113 : * It's possible that storing the entire block-reference table in memory
114 : * will be a problem for some users. The in-memory format that we're using
115 : * here is pretty efficient, converging to little more than 1 bit per
116 : * block for relation forks with large numbers of modified blocks. It's
117 : * possible, however, that if you try to perform an incremental backup of
118 : * a database with a sufficiently large number of relations on a
119 : * sufficiently small machine, you could run out of memory here. If that
120 : * turns out to be a problem in practice, we'll need to be more clever.
121 : */
122 : BlockRefTable *brtab;
123 :
124 : /*
125 : * State object for incremental JSON parsing
126 : */
127 : JsonManifestParseIncrementalState *inc_state;
128 : };
129 :
130 : static void manifest_process_version(JsonManifestParseContext *context,
131 : int manifest_version);
132 : static void manifest_process_system_identifier(JsonManifestParseContext *context,
133 : uint64 manifest_system_identifier);
134 : static void manifest_process_file(JsonManifestParseContext *context,
135 : char *pathname,
136 : size_t size,
137 : pg_checksum_type checksum_type,
138 : int checksum_length,
139 : uint8 *checksum_payload);
140 : static void manifest_process_wal_range(JsonManifestParseContext *context,
141 : TimeLineID tli,
142 : XLogRecPtr start_lsn,
143 : XLogRecPtr end_lsn);
144 : static void manifest_report_error(JsonManifestParseContext *ib,
145 : const char *fmt,...)
146 : pg_attribute_printf(2, 3) pg_attribute_noreturn();
147 : static int compare_block_numbers(const void *a, const void *b);
148 :
149 : /*
150 : * Create a new object for storing information extracted from the manifest
151 : * supplied when creating an incremental backup.
152 : */
153 : IncrementalBackupInfo *
154 16 : CreateIncrementalBackupInfo(MemoryContext mcxt)
155 : {
156 : IncrementalBackupInfo *ib;
157 : MemoryContext oldcontext;
158 : JsonManifestParseContext *context;
159 :
160 16 : oldcontext = MemoryContextSwitchTo(mcxt);
161 :
162 16 : ib = palloc0(sizeof(IncrementalBackupInfo));
163 16 : ib->mcxt = mcxt;
164 16 : initStringInfo(&ib->buf);
165 :
166 : /*
167 : * It's hard to guess how many files a "typical" installation will have in
168 : * the data directory, but a fresh initdb creates almost 1000 files as of
169 : * this writing, so it seems to make sense for our estimate to
170 : * substantially higher.
171 : */
172 16 : ib->manifest_files = backup_file_create(mcxt, 10000, NULL);
173 :
174 16 : context = palloc0(sizeof(JsonManifestParseContext));
175 : /* Parse the manifest. */
176 16 : context->private_data = ib;
177 16 : context->version_cb = manifest_process_version;
178 16 : context->system_identifier_cb = manifest_process_system_identifier;
179 16 : context->per_file_cb = manifest_process_file;
180 16 : context->per_wal_range_cb = manifest_process_wal_range;
181 16 : context->error_cb = manifest_report_error;
182 :
183 16 : ib->inc_state = json_parse_manifest_incremental_init(context);
184 :
185 16 : MemoryContextSwitchTo(oldcontext);
186 :
187 16 : return ib;
188 : }
189 :
190 : /*
191 : * Before taking an incremental backup, the caller must supply the backup
192 : * manifest from a prior backup. Each chunk of manifest data received
193 : * from the client should be passed to this function.
194 : */
195 : void
196 48 : AppendIncrementalManifestData(IncrementalBackupInfo *ib, const char *data,
197 : int len)
198 : {
199 : MemoryContext oldcontext;
200 :
201 : /* Switch to our memory context. */
202 48 : oldcontext = MemoryContextSwitchTo(ib->mcxt);
203 :
204 48 : if (ib->buf.len > MIN_CHUNK && ib->buf.len + len > MAX_CHUNK)
205 : {
206 : /*
207 : * time for an incremental parse. We'll do all but the last MIN_CHUNK
208 : * so that we have enough left for the final piece.
209 : */
210 16 : json_parse_manifest_incremental_chunk(
211 16 : ib->inc_state, ib->buf.data, ib->buf.len - MIN_CHUNK, false);
212 : /* now remove what we just parsed */
213 14 : memmove(ib->buf.data, ib->buf.data + (ib->buf.len - MIN_CHUNK),
214 : MIN_CHUNK + 1);
215 14 : ib->buf.len = MIN_CHUNK;
216 : }
217 :
218 46 : appendBinaryStringInfo(&ib->buf, data, len);
219 :
220 : /* Switch back to previous memory context. */
221 46 : MemoryContextSwitchTo(oldcontext);
222 46 : }
223 :
224 : /*
225 : * Finalize an IncrementalBackupInfo object after all manifest data has
226 : * been supplied via calls to AppendIncrementalManifestData.
227 : */
228 : void
229 14 : FinalizeIncrementalManifest(IncrementalBackupInfo *ib)
230 : {
231 : MemoryContext oldcontext;
232 :
233 : /* Switch to our memory context. */
234 14 : oldcontext = MemoryContextSwitchTo(ib->mcxt);
235 :
236 : /* Parse the last chunk of the manifest */
237 14 : json_parse_manifest_incremental_chunk(
238 : ib->inc_state, ib->buf.data, ib->buf.len, true);
239 :
240 : /* Done with the buffer, so release memory. */
241 14 : pfree(ib->buf.data);
242 14 : ib->buf.data = NULL;
243 :
244 : /* Done with inc_state, so release that memory too */
245 14 : json_parse_manifest_incremental_shutdown(ib->inc_state);
246 :
247 : /* Switch back to previous memory context. */
248 14 : MemoryContextSwitchTo(oldcontext);
249 14 : }
250 :
251 : /*
252 : * Prepare to take an incremental backup.
253 : *
254 : * Before this function is called, AppendIncrementalManifestData and
255 : * FinalizeIncrementalManifest should have already been called to pass all
256 : * the manifest data to this object.
257 : *
258 : * This function performs sanity checks on the data extracted from the
259 : * manifest and figures out for which WAL ranges we need summaries, and
260 : * whether those summaries are available. Then, it reads and combines the
261 : * data from those summary files. It also updates the backup_state with the
262 : * reference TLI and LSN for the prior backup.
263 : */
264 : void
265 14 : PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
266 : BackupState *backup_state)
267 : {
268 : MemoryContext oldcontext;
269 : List *expectedTLEs;
270 : List *all_wslist,
271 14 : *required_wslist = NIL;
272 : ListCell *lc;
273 : TimeLineHistoryEntry **tlep;
274 : int num_wal_ranges;
275 : int i;
276 14 : bool found_backup_start_tli = false;
277 14 : TimeLineID earliest_wal_range_tli = 0;
278 14 : XLogRecPtr earliest_wal_range_start_lsn = InvalidXLogRecPtr;
279 14 : TimeLineID latest_wal_range_tli = 0;
280 : XLogRecPtr summarized_lsn;
281 : XLogRecPtr pending_lsn;
282 14 : XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr;
283 14 : int deadcycles = 0;
284 : TimestampTz initial_time,
285 : current_time;
286 :
287 : Assert(ib->buf.data == NULL);
288 :
289 : /* Switch to our memory context. */
290 14 : oldcontext = MemoryContextSwitchTo(ib->mcxt);
291 :
292 : /*
293 : * A valid backup manifest must always contain at least one WAL range
294 : * (usually exactly one, unless the backup spanned a timeline switch).
295 : */
296 14 : num_wal_ranges = list_length(ib->manifest_wal_ranges);
297 14 : if (num_wal_ranges == 0)
298 0 : ereport(ERROR,
299 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
300 : errmsg("manifest contains no required WAL ranges")));
301 :
302 : /*
303 : * Match up the TLIs that appear in the WAL ranges of the backup manifest
304 : * with those that appear in this server's timeline history. We expect
305 : * every backup_wal_range to match to a TimeLineHistoryEntry; if it does
306 : * not, that's an error.
307 : *
308 : * This loop also decides which of the WAL ranges is the manifest is most
309 : * ancient and which one is the newest, according to the timeline history
310 : * of this server, and stores TLIs of those WAL ranges into
311 : * earliest_wal_range_tli and latest_wal_range_tli. It also updates
312 : * earliest_wal_range_start_lsn to the start LSN of the WAL range for
313 : * earliest_wal_range_tli.
314 : *
315 : * Note that the return value of readTimeLineHistory puts the latest
316 : * timeline at the beginning of the list, not the end. Hence, the earliest
317 : * TLI is the one that occurs nearest the end of the list returned by
318 : * readTimeLineHistory, and the latest TLI is the one that occurs closest
319 : * to the beginning.
320 : */
321 14 : expectedTLEs = readTimeLineHistory(backup_state->starttli);
322 14 : tlep = palloc0(num_wal_ranges * sizeof(TimeLineHistoryEntry *));
323 28 : for (i = 0; i < num_wal_ranges; ++i)
324 : {
325 14 : backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i);
326 14 : bool saw_earliest_wal_range_tli = false;
327 14 : bool saw_latest_wal_range_tli = false;
328 :
329 : /* Search this server's history for this WAL range's TLI. */
330 14 : foreach(lc, expectedTLEs)
331 : {
332 14 : TimeLineHistoryEntry *tle = lfirst(lc);
333 :
334 14 : if (tle->tli == range->tli)
335 : {
336 14 : tlep[i] = tle;
337 14 : break;
338 : }
339 :
340 0 : if (tle->tli == earliest_wal_range_tli)
341 0 : saw_earliest_wal_range_tli = true;
342 0 : if (tle->tli == latest_wal_range_tli)
343 0 : saw_latest_wal_range_tli = true;
344 : }
345 :
346 : /*
347 : * An incremental backup can only be taken relative to a backup that
348 : * represents a previous state of this server. If the backup requires
349 : * WAL from a timeline that's not in our history, that definitely
350 : * isn't the case.
351 : */
352 14 : if (tlep[i] == NULL)
353 0 : ereport(ERROR,
354 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
355 : errmsg("timeline %u found in manifest, but not in this server's history",
356 : range->tli)));
357 :
358 : /*
359 : * If we found this TLI in the server's history before encountering
360 : * the latest TLI seen so far in the server's history, then this TLI
361 : * is the latest one seen so far.
362 : *
363 : * If on the other hand we saw the earliest TLI seen so far before
364 : * finding this TLI, this TLI is earlier than the earliest one seen so
365 : * far. And if this is the first TLI for which we've searched, it's
366 : * also the earliest one seen so far.
367 : *
368 : * On the first loop iteration, both things should necessarily be
369 : * true.
370 : */
371 14 : if (!saw_latest_wal_range_tli)
372 14 : latest_wal_range_tli = range->tli;
373 14 : if (earliest_wal_range_tli == 0 || saw_earliest_wal_range_tli)
374 : {
375 14 : earliest_wal_range_tli = range->tli;
376 14 : earliest_wal_range_start_lsn = range->start_lsn;
377 : }
378 : }
379 :
380 : /*
381 : * Propagate information about the prior backup into the backup_label that
382 : * will be generated for this backup.
383 : */
384 14 : backup_state->istartpoint = earliest_wal_range_start_lsn;
385 14 : backup_state->istarttli = earliest_wal_range_tli;
386 :
387 : /*
388 : * Sanity check start and end LSNs for the WAL ranges in the manifest.
389 : *
390 : * Commonly, there won't be any timeline switches during the prior backup
391 : * at all, but if there are, they should happen at the same LSNs that this
392 : * server switched timelines.
393 : *
394 : * Whether there are any timeline switches during the prior backup or not,
395 : * the prior backup shouldn't require any WAL from a timeline prior to the
396 : * start of that timeline. It also shouldn't require any WAL from later
397 : * than the start of this backup.
398 : *
399 : * If any of these sanity checks fail, one possible explanation is that
400 : * the user has generated WAL on the same timeline with the same LSNs more
401 : * than once. For instance, if two standbys running on timeline 1 were
402 : * both promoted and (due to a broken archiving setup) both selected new
403 : * timeline ID 2, then it's possible that one of these checks might trip.
404 : *
405 : * Note that there are lots of ways for the user to do something very bad
406 : * without tripping any of these checks, and they are not intended to be
407 : * comprehensive. It's pretty hard to see how we could be certain of
408 : * anything here. However, if there's a problem staring us right in the
409 : * face, it's best to report it, so we do.
410 : */
411 28 : for (i = 0; i < num_wal_ranges; ++i)
412 : {
413 14 : backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i);
414 :
415 14 : if (range->tli == earliest_wal_range_tli)
416 : {
417 14 : if (range->start_lsn < tlep[i]->begin)
418 0 : ereport(ERROR,
419 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
420 : errmsg("manifest requires WAL from initial timeline %u starting at %X/%X, but that timeline begins at %X/%X",
421 : range->tli,
422 : LSN_FORMAT_ARGS(range->start_lsn),
423 : LSN_FORMAT_ARGS(tlep[i]->begin))));
424 : }
425 : else
426 : {
427 0 : if (range->start_lsn != tlep[i]->begin)
428 0 : ereport(ERROR,
429 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
430 : errmsg("manifest requires WAL from continuation timeline %u starting at %X/%X, but that timeline begins at %X/%X",
431 : range->tli,
432 : LSN_FORMAT_ARGS(range->start_lsn),
433 : LSN_FORMAT_ARGS(tlep[i]->begin))));
434 : }
435 :
436 14 : if (range->tli == latest_wal_range_tli)
437 : {
438 14 : if (range->end_lsn > backup_state->startpoint)
439 0 : ereport(ERROR,
440 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
441 : errmsg("manifest requires WAL from final timeline %u ending at %X/%X, but this backup starts at %X/%X",
442 : range->tli,
443 : LSN_FORMAT_ARGS(range->end_lsn),
444 : LSN_FORMAT_ARGS(backup_state->startpoint))));
445 : }
446 : else
447 : {
448 0 : if (range->end_lsn != tlep[i]->end)
449 0 : ereport(ERROR,
450 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
451 : errmsg("manifest requires WAL from non-final timeline %u ending at %X/%X, but this server switched timelines at %X/%X",
452 : range->tli,
453 : LSN_FORMAT_ARGS(range->end_lsn),
454 : LSN_FORMAT_ARGS(tlep[i]->end))));
455 : }
456 :
457 : }
458 :
459 : /*
460 : * Wait for WAL summarization to catch up to the backup start LSN (but
461 : * time out if it doesn't do so quickly enough).
462 : */
463 14 : initial_time = current_time = GetCurrentTimestamp();
464 : while (1)
465 0 : {
466 14 : long timeout_in_ms = 10000;
467 : long elapsed_seconds;
468 :
469 : /*
470 : * Align the wait time to prevent drift. This doesn't really matter,
471 : * but we'd like the warnings about how long we've been waiting to say
472 : * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
473 : * drifting to something that is not a multiple of ten.
474 : */
475 14 : timeout_in_ms -=
476 14 : TimestampDifferenceMilliseconds(initial_time, current_time) %
477 : timeout_in_ms;
478 :
479 : /* Wait for up to 10 seconds. */
480 14 : summarized_lsn = WaitForWalSummarization(backup_state->startpoint,
481 : timeout_in_ms, &pending_lsn);
482 :
483 : /* If WAL summarization has progressed sufficiently, stop waiting. */
484 14 : if (summarized_lsn >= backup_state->startpoint)
485 14 : break;
486 :
487 : /*
488 : * Keep track of the number of cycles during which there has been no
489 : * progression of pending_lsn. If pending_lsn is not advancing, that
490 : * means that not only are no new files appearing on disk, but we're
491 : * not even incorporating new records into the in-memory state.
492 : */
493 0 : if (pending_lsn > prior_pending_lsn)
494 : {
495 0 : prior_pending_lsn = pending_lsn;
496 0 : deadcycles = 0;
497 : }
498 : else
499 0 : ++deadcycles;
500 :
501 : /*
502 : * If we've managed to wait for an entire minute without the WAL
503 : * summarizer absorbing a single WAL record, error out; probably
504 : * something is wrong.
505 : *
506 : * We could consider also erroring out if the summarizer is taking too
507 : * long to catch up, but it's not clear what rate of progress would be
508 : * acceptable and what would be too slow. So instead, we just try to
509 : * error out in the case where there's no progress at all. That seems
510 : * likely to catch a reasonable number of the things that can go wrong
511 : * in practice (e.g. the summarizer process is completely hung, say
512 : * because somebody hooked up a debugger to it or something) without
513 : * giving up too quickly when the system is just slow.
514 : */
515 0 : if (deadcycles >= 6)
516 0 : ereport(ERROR,
517 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
518 : errmsg("WAL summarization is not progressing"),
519 : errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
520 : LSN_FORMAT_ARGS(backup_state->startpoint),
521 : LSN_FORMAT_ARGS(summarized_lsn),
522 : LSN_FORMAT_ARGS(pending_lsn))));
523 :
524 : /*
525 : * Otherwise, just let the user know what's happening.
526 : */
527 0 : current_time = GetCurrentTimestamp();
528 0 : elapsed_seconds =
529 0 : TimestampDifferenceMilliseconds(initial_time, current_time) / 1000;
530 0 : ereport(WARNING,
531 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
532 : errmsg("still waiting for WAL summarization through %X/%X after %ld seconds",
533 : LSN_FORMAT_ARGS(backup_state->startpoint),
534 : elapsed_seconds),
535 : errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
536 : LSN_FORMAT_ARGS(summarized_lsn),
537 : LSN_FORMAT_ARGS(pending_lsn))));
538 : }
539 :
540 : /*
541 : * Retrieve a list of all WAL summaries on any timeline that overlap with
542 : * the LSN range of interest. We could instead call GetWalSummaries() once
543 : * per timeline in the loop that follows, but that would involve reading
544 : * the directory multiple times. It should be mildly faster - and perhaps
545 : * a bit safer - to do it just once.
546 : */
547 14 : all_wslist = GetWalSummaries(0, earliest_wal_range_start_lsn,
548 : backup_state->startpoint);
549 :
550 : /*
551 : * We need WAL summaries for everything that happened during the prior
552 : * backup and everything that happened afterward up until the point where
553 : * the current backup started.
554 : */
555 14 : foreach(lc, expectedTLEs)
556 : {
557 14 : TimeLineHistoryEntry *tle = lfirst(lc);
558 14 : XLogRecPtr tli_start_lsn = tle->begin;
559 14 : XLogRecPtr tli_end_lsn = tle->end;
560 14 : XLogRecPtr tli_missing_lsn = InvalidXLogRecPtr;
561 : List *tli_wslist;
562 :
563 : /*
564 : * Working through the history of this server from the current
565 : * timeline backwards, we skip everything until we find the timeline
566 : * where this backup started. Most of the time, this means we won't
567 : * skip anything at all, as it's unlikely that the timeline has
568 : * changed since the beginning of the backup moments ago.
569 : */
570 14 : if (tle->tli == backup_state->starttli)
571 : {
572 14 : found_backup_start_tli = true;
573 14 : tli_end_lsn = backup_state->startpoint;
574 : }
575 0 : else if (!found_backup_start_tli)
576 0 : continue;
577 :
578 : /*
579 : * Find the summaries that overlap the LSN range of interest for this
580 : * timeline. If this is the earliest timeline involved, the range of
581 : * interest begins with the start LSN of the prior backup; otherwise,
582 : * it begins at the LSN at which this timeline came into existence. If
583 : * this is the latest TLI involved, the range of interest ends at the
584 : * start LSN of the current backup; otherwise, it ends at the point
585 : * where we switched from this timeline to the next one.
586 : */
587 14 : if (tle->tli == earliest_wal_range_tli)
588 14 : tli_start_lsn = earliest_wal_range_start_lsn;
589 14 : tli_wslist = FilterWalSummaries(all_wslist, tle->tli,
590 : tli_start_lsn, tli_end_lsn);
591 :
592 : /*
593 : * There is no guarantee that the WAL summaries we found cover the
594 : * entire range of LSNs for which summaries are required, or indeed
595 : * that we found any WAL summaries at all. Check whether we have a
596 : * problem of that sort.
597 : */
598 14 : if (!WalSummariesAreComplete(tli_wslist, tli_start_lsn, tli_end_lsn,
599 : &tli_missing_lsn))
600 : {
601 0 : if (XLogRecPtrIsInvalid(tli_missing_lsn))
602 0 : ereport(ERROR,
603 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
604 : errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but no summaries for that timeline and LSN range exist",
605 : tle->tli,
606 : LSN_FORMAT_ARGS(tli_start_lsn),
607 : LSN_FORMAT_ARGS(tli_end_lsn))));
608 : else
609 0 : ereport(ERROR,
610 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
611 : errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but the summaries for that timeline and LSN range are incomplete",
612 : tle->tli,
613 : LSN_FORMAT_ARGS(tli_start_lsn),
614 : LSN_FORMAT_ARGS(tli_end_lsn)),
615 : errdetail("The first unsummarized LSN in this range is %X/%X.",
616 : LSN_FORMAT_ARGS(tli_missing_lsn))));
617 : }
618 :
619 : /*
620 : * Remember that we need to read these summaries.
621 : *
622 : * Technically, it's possible that this could read more files than
623 : * required, since tli_wslist in theory could contain redundant
624 : * summaries. For instance, if we have a summary from 0/10000000 to
625 : * 0/20000000 and also one from 0/00000000 to 0/30000000, then the
626 : * latter subsumes the former and the former could be ignored.
627 : *
628 : * We ignore this possibility because the WAL summarizer only tries to
629 : * generate summaries that do not overlap. If somehow they exist,
630 : * we'll do a bit of extra work but the results should still be
631 : * correct.
632 : */
633 14 : required_wslist = list_concat(required_wslist, tli_wslist);
634 :
635 : /*
636 : * Timelines earlier than the one in which the prior backup began are
637 : * not relevant.
638 : */
639 14 : if (tle->tli == earliest_wal_range_tli)
640 14 : break;
641 : }
642 :
643 : /*
644 : * Read all of the required block reference table files and merge all of
645 : * the data into a single in-memory block reference table.
646 : *
647 : * See the comments for struct IncrementalBackupInfo for some thoughts on
648 : * memory usage.
649 : */
650 14 : ib->brtab = CreateEmptyBlockRefTable();
651 38 : foreach(lc, required_wslist)
652 : {
653 24 : WalSummaryFile *ws = lfirst(lc);
654 : WalSummaryIO wsio;
655 : BlockRefTableReader *reader;
656 : RelFileLocator rlocator;
657 : ForkNumber forknum;
658 : BlockNumber limit_block;
659 : BlockNumber blocks[BLOCKS_PER_READ];
660 :
661 24 : wsio.file = OpenWalSummaryFile(ws, false);
662 24 : wsio.filepos = 0;
663 24 : ereport(DEBUG1,
664 : (errmsg_internal("reading WAL summary file \"%s\"",
665 : FilePathName(wsio.file))));
666 24 : reader = CreateBlockRefTableReader(ReadWalSummary, &wsio,
667 : FilePathName(wsio.file),
668 : ReportWalSummaryError, NULL);
669 714 : while (BlockRefTableReaderNextRelation(reader, &rlocator, &forknum,
670 : &limit_block))
671 : {
672 690 : BlockRefTableSetLimitBlock(ib->brtab, &rlocator,
673 : forknum, limit_block);
674 :
675 : while (1)
676 498 : {
677 : unsigned nblocks;
678 : unsigned i;
679 :
680 1188 : nblocks = BlockRefTableReaderGetBlocks(reader, blocks,
681 : BLOCKS_PER_READ);
682 1188 : if (nblocks == 0)
683 690 : break;
684 :
685 2280 : for (i = 0; i < nblocks; ++i)
686 1782 : BlockRefTableMarkBlockModified(ib->brtab, &rlocator,
687 : forknum, blocks[i]);
688 : }
689 : }
690 24 : DestroyBlockRefTableReader(reader);
691 24 : FileClose(wsio.file);
692 : }
693 :
694 : /* Switch back to previous memory context. */
695 14 : MemoryContextSwitchTo(oldcontext);
696 14 : }
697 :
698 : /*
699 : * Get the pathname that should be used when a file is sent incrementally.
700 : *
701 : * The result is a palloc'd string.
702 : */
703 : char *
704 3202 : GetIncrementalFilePath(Oid dboid, Oid spcoid, RelFileNumber relfilenumber,
705 : ForkNumber forknum, unsigned segno)
706 : {
707 : char *path;
708 : char *lastslash;
709 : char *ipath;
710 :
711 3202 : path = GetRelationPath(dboid, spcoid, relfilenumber, INVALID_PROC_NUMBER,
712 : forknum);
713 :
714 3202 : lastslash = strrchr(path, '/');
715 : Assert(lastslash != NULL);
716 3202 : *lastslash = '\0';
717 :
718 3202 : if (segno > 0)
719 0 : ipath = psprintf("%s/INCREMENTAL.%s.%u", path, lastslash + 1, segno);
720 : else
721 3202 : ipath = psprintf("%s/INCREMENTAL.%s", path, lastslash + 1);
722 :
723 3202 : pfree(path);
724 :
725 3202 : return ipath;
726 : }
727 :
728 : /*
729 : * How should we back up a particular file as part of an incremental backup?
730 : *
731 : * If the return value is BACK_UP_FILE_FULLY, caller should back up the whole
732 : * file just as if this were not an incremental backup. The contents of the
733 : * relative_block_numbers array are unspecified in this case.
734 : *
735 : * If the return value is BACK_UP_FILE_INCREMENTALLY, caller should include
736 : * an incremental file in the backup instead of the entire file. On return,
737 : * *num_blocks_required will be set to the number of blocks that need to be
738 : * sent, and the actual block numbers will have been stored in
739 : * relative_block_numbers, which should be an array of at least RELSEG_SIZE.
740 : * In addition, *truncation_block_length will be set to the value that should
741 : * be included in the incremental file.
742 : */
743 : FileBackupMethod
744 14528 : GetFileBackupMethod(IncrementalBackupInfo *ib, const char *path,
745 : Oid dboid, Oid spcoid,
746 : RelFileNumber relfilenumber, ForkNumber forknum,
747 : unsigned segno, size_t size,
748 : unsigned *num_blocks_required,
749 : BlockNumber *relative_block_numbers,
750 : unsigned *truncation_block_length)
751 : {
752 : BlockNumber limit_block;
753 : BlockNumber start_blkno;
754 : BlockNumber stop_blkno;
755 : RelFileLocator rlocator;
756 : BlockRefTableEntry *brtentry;
757 : unsigned i;
758 : unsigned nblocks;
759 :
760 : /* Should only be called after PrepareForIncrementalBackup. */
761 : Assert(ib->buf.data == NULL);
762 :
763 : /*
764 : * dboid could be InvalidOid if shared rel, but spcoid and relfilenumber
765 : * should have legal values.
766 : */
767 : Assert(OidIsValid(spcoid));
768 : Assert(RelFileNumberIsValid(relfilenumber));
769 :
770 : /*
771 : * If the file size is too large or not a multiple of BLCKSZ, then
772 : * something weird is happening, so give up and send the whole file.
773 : */
774 14528 : if ((size % BLCKSZ) != 0 || size / BLCKSZ > RELSEG_SIZE)
775 0 : return BACK_UP_FILE_FULLY;
776 :
777 : /*
778 : * The free-space map fork is not properly WAL-logged, so we need to
779 : * backup the entire file every time.
780 : */
781 14528 : if (forknum == FSM_FORKNUM)
782 1776 : return BACK_UP_FILE_FULLY;
783 :
784 : /*
785 : * If this file was not part of the prior backup, back it up fully.
786 : *
787 : * If this file was created after the prior backup and before the start of
788 : * the current backup, then the WAL summary information will tell us to
789 : * back up the whole file. However, if this file was created after the
790 : * start of the current backup, then the WAL summary won't know anything
791 : * about it. Without this logic, we would erroneously conclude that it was
792 : * OK to send it incrementally.
793 : *
794 : * Note that the file could have existed at the time of the prior backup,
795 : * gotten deleted, and then a new file with the same name could have been
796 : * created. In that case, this logic won't prevent the file from being
797 : * backed up incrementally. But, if the deletion happened before the start
798 : * of the current backup, the limit block will be 0, inducing a full
799 : * backup. If the deletion happened after the start of the current backup,
800 : * reconstruction will erroneously combine blocks from the current
801 : * lifespan of the file with blocks from the previous lifespan -- but in
802 : * this type of case, WAL replay to reach backup consistency should remove
803 : * and recreate the file anyway, so the initial bogus contents should not
804 : * matter.
805 : */
806 12752 : if (backup_file_lookup(ib->manifest_files, path) == NULL)
807 : {
808 : char *ipath;
809 :
810 3202 : ipath = GetIncrementalFilePath(dboid, spcoid, relfilenumber,
811 : forknum, segno);
812 3202 : if (backup_file_lookup(ib->manifest_files, ipath) == NULL)
813 544 : return BACK_UP_FILE_FULLY;
814 : }
815 :
816 : /*
817 : * Look up the special block reference table entry for the database as a
818 : * whole.
819 : */
820 12208 : rlocator.spcOid = spcoid;
821 12208 : rlocator.dbOid = dboid;
822 12208 : rlocator.relNumber = 0;
823 12208 : if (BlockRefTableGetEntry(ib->brtab, &rlocator, MAIN_FORKNUM,
824 : &limit_block) != NULL)
825 : {
826 : /*
827 : * According to the WAL summary, this database OID/tablespace OID
828 : * pairing has been created since the previous backup. So, everything
829 : * in it must be backed up fully.
830 : */
831 518 : return BACK_UP_FILE_FULLY;
832 : }
833 :
834 : /* Look up the block reference table entry for this relfilenode. */
835 11690 : rlocator.relNumber = relfilenumber;
836 11690 : brtentry = BlockRefTableGetEntry(ib->brtab, &rlocator, forknum,
837 : &limit_block);
838 :
839 : /*
840 : * If there is no entry, then there have been no WAL-logged changes to the
841 : * relation since the predecessor backup was taken, so we can back it up
842 : * incrementally and need not include any modified blocks.
843 : *
844 : * However, if the file is zero-length, we should do a full backup,
845 : * because an incremental file is always more than zero length, and it's
846 : * silly to take an incremental backup when a full backup would be
847 : * smaller.
848 : */
849 11690 : if (brtentry == NULL)
850 : {
851 11628 : if (size == 0)
852 2366 : return BACK_UP_FILE_FULLY;
853 9262 : *num_blocks_required = 0;
854 9262 : *truncation_block_length = size / BLCKSZ;
855 9262 : return BACK_UP_FILE_INCREMENTALLY;
856 : }
857 :
858 : /*
859 : * If the limit_block is less than or equal to the point where this
860 : * segment starts, send the whole file.
861 : */
862 62 : if (limit_block <= segno * RELSEG_SIZE)
863 0 : return BACK_UP_FILE_FULLY;
864 :
865 : /*
866 : * Get relevant entries from the block reference table entry.
867 : *
868 : * We shouldn't overflow computing the start or stop block numbers, but if
869 : * it manages to happen somehow, detect it and throw an error.
870 : */
871 62 : start_blkno = segno * RELSEG_SIZE;
872 62 : stop_blkno = start_blkno + (size / BLCKSZ);
873 62 : if (start_blkno / RELSEG_SIZE != segno || stop_blkno < start_blkno)
874 0 : ereport(ERROR,
875 : errcode(ERRCODE_INTERNAL_ERROR),
876 : errmsg_internal("overflow computing block number bounds for segment %u with size %zu",
877 : segno, size));
878 :
879 : /*
880 : * This will write *absolute* block numbers into the output array, but
881 : * we'll transpose them below.
882 : */
883 62 : nblocks = BlockRefTableEntryGetBlocks(brtentry, start_blkno, stop_blkno,
884 : relative_block_numbers, RELSEG_SIZE);
885 : Assert(nblocks <= RELSEG_SIZE);
886 :
887 : /*
888 : * If we're going to have to send nearly all of the blocks, then just send
889 : * the whole file, because that won't require much extra storage or
890 : * transfer and will speed up and simplify backup restoration. It's not
891 : * clear what threshold is most appropriate here and perhaps it ought to
892 : * be configurable, but for now we're just going to say that if we'd need
893 : * to send 90% of the blocks anyway, give up and send the whole file.
894 : *
895 : * NB: If you change the threshold here, at least make sure to back up the
896 : * file fully when every single block must be sent, because there's
897 : * nothing good about sending an incremental file in that case.
898 : */
899 62 : if (nblocks * BLCKSZ > size * 0.9)
900 16 : return BACK_UP_FILE_FULLY;
901 :
902 : /*
903 : * Looks like we can send an incremental file, so sort the block numbers
904 : * and then transpose them from absolute block numbers to relative block
905 : * numbers if necessary.
906 : *
907 : * NB: If the block reference table was using the bitmap representation
908 : * for a given chunk, the block numbers in that chunk will already be
909 : * sorted, but when the array-of-offsets representation is used, we can
910 : * receive block numbers here out of order.
911 : */
912 46 : qsort(relative_block_numbers, nblocks, sizeof(BlockNumber),
913 : compare_block_numbers);
914 46 : if (start_blkno != 0)
915 : {
916 0 : for (i = 0; i < nblocks; ++i)
917 0 : relative_block_numbers[i] -= start_blkno;
918 : }
919 46 : *num_blocks_required = nblocks;
920 :
921 : /*
922 : * The truncation block length is the minimum length of the reconstructed
923 : * file. Any block numbers below this threshold that are not present in
924 : * the backup need to be fetched from the prior backup. At or above this
925 : * threshold, blocks should only be included in the result if they are
926 : * present in the backup. (This may require inserting zero blocks if the
927 : * blocks included in the backup are non-consecutive.)
928 : */
929 46 : *truncation_block_length = size / BLCKSZ;
930 46 : if (BlockNumberIsValid(limit_block))
931 : {
932 0 : unsigned relative_limit = limit_block - segno * RELSEG_SIZE;
933 :
934 0 : if (*truncation_block_length < relative_limit)
935 0 : *truncation_block_length = relative_limit;
936 : }
937 :
938 : /* Send it incrementally. */
939 46 : return BACK_UP_FILE_INCREMENTALLY;
940 : }
941 :
942 : /*
943 : * Compute the size for a header of an incremental file containing a given
944 : * number of blocks. The header is rounded to a multiple of BLCKSZ, but
945 : * only if the file will store some block data.
946 : */
947 : extern size_t
948 9308 : GetIncrementalHeaderSize(unsigned num_blocks_required)
949 : {
950 : size_t result;
951 :
952 : /* Make sure we're not going to overflow. */
953 : Assert(num_blocks_required <= RELSEG_SIZE);
954 :
955 : /*
956 : * Three four byte quantities (magic number, truncation block length,
957 : * block count) followed by block numbers.
958 : */
959 9308 : result = 3 * sizeof(uint32) + (sizeof(BlockNumber) * num_blocks_required);
960 :
961 : /*
962 : * Round the header size to a multiple of BLCKSZ - when not a multiple of
963 : * BLCKSZ, add the missing fraction of a block. But do this only if the
964 : * file will store data for some blocks, otherwise keep it small.
965 : */
966 9308 : if ((num_blocks_required > 0) && (result % BLCKSZ != 0))
967 46 : result += BLCKSZ - (result % BLCKSZ);
968 :
969 9308 : return result;
970 : }
971 :
972 : /*
973 : * Compute the size for an incremental file containing a given number of blocks.
974 : */
975 : extern size_t
976 9308 : GetIncrementalFileSize(unsigned num_blocks_required)
977 : {
978 : size_t result;
979 :
980 : /* Make sure we're not going to overflow. */
981 : Assert(num_blocks_required <= RELSEG_SIZE);
982 :
983 : /*
984 : * Header with three four byte quantities (magic number, truncation block
985 : * length, block count) followed by block numbers, rounded to a multiple
986 : * of BLCKSZ (for files with block data), followed by block contents.
987 : */
988 9308 : result = GetIncrementalHeaderSize(num_blocks_required);
989 9308 : result += BLCKSZ * num_blocks_required;
990 :
991 9308 : return result;
992 : }
993 :
994 : /*
995 : * Helper function for filemap hash table.
996 : */
997 : static uint32
998 31236 : hash_string_pointer(const char *s)
999 : {
1000 31236 : unsigned char *ss = (unsigned char *) s;
1001 :
1002 31236 : return hash_bytes(ss, strlen(s));
1003 : }
1004 :
1005 : /*
1006 : * This callback to validate the manifest version for incremental backup.
1007 : */
1008 : static void
1009 16 : manifest_process_version(JsonManifestParseContext *context,
1010 : int manifest_version)
1011 : {
1012 : /* Incremental backups don't work with manifest version 1 */
1013 16 : if (manifest_version == 1)
1014 0 : context->error_cb(context,
1015 : "backup manifest version 1 does not support incremental backup");
1016 16 : }
1017 :
1018 : /*
1019 : * This callback to validate the manifest system identifier against the current
1020 : * database server.
1021 : */
1022 : static void
1023 16 : manifest_process_system_identifier(JsonManifestParseContext *context,
1024 : uint64 manifest_system_identifier)
1025 : {
1026 : uint64 system_identifier;
1027 :
1028 : /* Get system identifier of current system */
1029 16 : system_identifier = GetSystemIdentifier();
1030 :
1031 16 : if (manifest_system_identifier != system_identifier)
1032 2 : context->error_cb(context,
1033 : "manifest system identifier is %llu, but database system identifier is %llu",
1034 : (unsigned long long) manifest_system_identifier,
1035 : (unsigned long long) system_identifier);
1036 14 : }
1037 :
1038 : /*
1039 : * This callback is invoked for each file mentioned in the backup manifest.
1040 : *
1041 : * We store the path to each file and the size of each file for sanity-checking
1042 : * purposes. For further details, see comments for IncrementalBackupInfo.
1043 : */
1044 : static void
1045 14788 : manifest_process_file(JsonManifestParseContext *context,
1046 : char *pathname, size_t size,
1047 : pg_checksum_type checksum_type,
1048 : int checksum_length,
1049 : uint8 *checksum_payload)
1050 : {
1051 14788 : IncrementalBackupInfo *ib = context->private_data;
1052 : backup_file_entry *entry;
1053 : bool found;
1054 :
1055 14788 : entry = backup_file_insert(ib->manifest_files, pathname, &found);
1056 14788 : if (!found)
1057 : {
1058 14788 : entry->path = MemoryContextStrdup(ib->manifest_files->ctx,
1059 : pathname);
1060 14788 : entry->size = size;
1061 : }
1062 14788 : }
1063 :
1064 : /*
1065 : * This callback is invoked for each WAL range mentioned in the backup
1066 : * manifest.
1067 : *
1068 : * We're just interested in learning the oldest LSN and the corresponding TLI
1069 : * that appear in any WAL range.
1070 : */
1071 : static void
1072 14 : manifest_process_wal_range(JsonManifestParseContext *context,
1073 : TimeLineID tli, XLogRecPtr start_lsn,
1074 : XLogRecPtr end_lsn)
1075 : {
1076 14 : IncrementalBackupInfo *ib = context->private_data;
1077 14 : backup_wal_range *range = palloc(sizeof(backup_wal_range));
1078 :
1079 14 : range->tli = tli;
1080 14 : range->start_lsn = start_lsn;
1081 14 : range->end_lsn = end_lsn;
1082 14 : ib->manifest_wal_ranges = lappend(ib->manifest_wal_ranges, range);
1083 14 : }
1084 :
1085 : /*
1086 : * This callback is invoked if an error occurs while parsing the backup
1087 : * manifest.
1088 : */
1089 : static void
1090 2 : manifest_report_error(JsonManifestParseContext *context, const char *fmt,...)
1091 : {
1092 : StringInfoData errbuf;
1093 :
1094 2 : initStringInfo(&errbuf);
1095 :
1096 : for (;;)
1097 0 : {
1098 : va_list ap;
1099 : int needed;
1100 :
1101 2 : va_start(ap, fmt);
1102 2 : needed = appendStringInfoVA(&errbuf, fmt, ap);
1103 2 : va_end(ap);
1104 2 : if (needed == 0)
1105 2 : break;
1106 0 : enlargeStringInfo(&errbuf, needed);
1107 : }
1108 :
1109 2 : ereport(ERROR,
1110 : errmsg_internal("%s", errbuf.data));
1111 : }
1112 :
1113 : /*
1114 : * Quicksort comparator for block numbers.
1115 : */
1116 : static int
1117 44 : compare_block_numbers(const void *a, const void *b)
1118 : {
1119 44 : BlockNumber aa = *(BlockNumber *) a;
1120 44 : BlockNumber bb = *(BlockNumber *) b;
1121 :
1122 44 : return pg_cmp_u32(aa, bb);
1123 : }
|