Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_rewind.c
4 : * Synchronizes a PostgreSQL data directory to a new timeline
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : *
8 : *-------------------------------------------------------------------------
9 : */
10 : #include "postgres_fe.h"
11 :
12 : #include <sys/stat.h>
13 : #include <fcntl.h>
14 : #include <time.h>
15 : #include <unistd.h>
16 :
17 : #include "access/timeline.h"
18 : #include "access/xlog_internal.h"
19 : #include "catalog/catversion.h"
20 : #include "catalog/pg_control.h"
21 : #include "common/controldata_utils.h"
22 : #include "common/file_perm.h"
23 : #include "common/restricted_token.h"
24 : #include "common/string.h"
25 : #include "fe_utils/option_utils.h"
26 : #include "fe_utils/recovery_gen.h"
27 : #include "fe_utils/string_utils.h"
28 : #include "file_ops.h"
29 : #include "filemap.h"
30 : #include "getopt_long.h"
31 : #include "pg_rewind.h"
32 : #include "rewind_source.h"
33 : #include "storage/bufpage.h"
34 :
35 : static void usage(const char *progname);
36 :
37 : static void perform_rewind(filemap_t *filemap, rewind_source *source,
38 : XLogRecPtr chkptrec,
39 : TimeLineID chkpttli,
40 : XLogRecPtr chkptredo);
41 :
42 : static void createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli,
43 : XLogRecPtr checkpointloc);
44 :
45 : static void digestControlFile(ControlFileData *ControlFile,
46 : const char *content, size_t size);
47 : static void getRestoreCommand(const char *argv0);
48 : static void sanityChecks(void);
49 : static TimeLineHistoryEntry *getTimelineHistory(TimeLineID tli, bool is_source,
50 : int *nentries);
51 : static void findCommonAncestorTimeline(TimeLineHistoryEntry *a_history,
52 : int a_nentries,
53 : TimeLineHistoryEntry *b_history,
54 : int b_nentries,
55 : XLogRecPtr *recptr, int *tliIndex);
56 : static void ensureCleanShutdown(const char *argv0);
57 : static void disconnect_atexit(void);
58 :
/*
 * Parsed copies of global/pg_control, filled in by digestControlFile():
 * the target's, the source's as read before any file is copied, and the
 * source's as re-read at the end of perform_rewind().
 */
59 : static ControlFileData ControlFile_target;
60 : static ControlFileData ControlFile_source;
61 : static ControlFileData ControlFile_source_after;
62 :
/* Program name derived from argv[0] in main(); used in hint messages */
63 : static const char *progname;
/*
 * WAL segment size handed to XLByteToSeg()/XLogFileName().  Not assigned
 * in the code visible here -- presumably taken from the control file;
 * confirm against digestControlFile().
 */
64 : int WalSegSz;
65 :
66 : /* Configuration options */
/* Set from -D/--target-pgdata, --source-pgdata and --source-server */
67 : char *datadir_target = NULL;
68 : static char *datadir_source = NULL;
69 : static char *connstr_source = NULL;
/* Passed to the WAL-reading routines; filled in outside main() (see getRestoreCommand) */
70 : static char *restore_command = NULL;
/* Set from --config-file */
71 : static char *config_file = NULL;
72 :
/* --debug, -P/--progress, -n/--dry-run, cleared by -N/--no-sync, -c/--restore-target-wal */
73 : static bool debug = false;
74 : bool showprogress = false;
75 : bool dry_run = false;
76 : bool do_sync = true;
77 : static bool restore_wal = false;
/* Overridden by --sync-method via parse_sync_method() */
78 : DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
79 :
80 : /* Target history */
/* Filled in by main() through getTimelineHistory() when the timelines differ */
81 : TimeLineHistoryEntry *targetHistory;
82 : int targetNentries;
83 :
84 : /* Progress counters */
/* Byte counts read by progress_report(); fetch_size is initialized in main() when -P is given */
85 : uint64 fetch_size;
86 : uint64 fetch_done;
87 :
/* libpq connection to the source server; only opened when --source-server is used */
88 : static PGconn *conn;
/* Abstraction for fetching from the source: libpq-based or local-directory-based */
89 : static rewind_source *source;
90 :
/*
 * Print the command-line help text to stdout.
 *
 * progname is substituted into the first line and the "Usage:" line.  Each
 * option is printed with its own printf() call so that every message is a
 * separate translatable string.
 */
91 : static void
92 2 : usage(const char *progname)
93 : {
94 2 : printf(_("%s resynchronizes a PostgreSQL cluster with another copy of the cluster.\n\n"), progname);
95 2 : printf(_("Usage:\n %s [OPTION]...\n\n"), progname);
96 2 : printf(_("Options:\n"));
97 2 : printf(_(" -c, --restore-target-wal use \"restore_command\" in target configuration to\n"
98 : " retrieve WAL files from archives\n"));
99 2 : printf(_(" -D, --target-pgdata=DIRECTORY existing data directory to modify\n"));
100 2 : printf(_(" --source-pgdata=DIRECTORY source data directory to synchronize with\n"));
101 2 : printf(_(" --source-server=CONNSTR source server to synchronize with\n"));
102 2 : printf(_(" -n, --dry-run stop before modifying anything\n"));
103 2 : printf(_(" -N, --no-sync do not wait for changes to be written\n"
104 : " safely to disk\n"));
105 2 : printf(_(" -P, --progress write progress messages\n"));
106 2 : printf(_(" -R, --write-recovery-conf write configuration for replication\n"
107 : " (requires --source-server)\n"));
108 2 : printf(_(" --config-file=FILENAME use specified main server configuration\n"
109 : " file when running target cluster\n"));
110 2 : printf(_(" --debug write a lot of debug messages\n"));
111 2 : printf(_(" --no-ensure-shutdown do not automatically fix unclean shutdown\n"));
112 2 : printf(_(" --sync-method=METHOD set method for syncing files to disk\n"));
113 2 : printf(_(" -V, --version output version information, then exit\n"));
114 2 : printf(_(" -?, --help show this help, then exit\n"));
/* Trailer: bug-report address and project home page from build-time macros */
115 2 : printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
116 2 : printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL);
117 2 : }
118 :
119 :
/*
 * Entry point of pg_rewind.
 *
 * Parses and validates the command line, refuses to run as root (on
 * non-Windows builds), opens the source (a libpq connection or a local data
 * directory), reads the control files of both clusters, and determines
 * where the source and target timelines diverged.  If the target has WAL
 * beyond the point of divergence, the file map is built and
 * perform_rewind() is called to modify the target.
 *
 * Returns 0 after a successful rewind.  --help, --version and the "no
 * rewind required" case call exit(0); usage errors call exit(1); other
 * failures go through pg_fatal().
 */
120 : int
121 50 : main(int argc, char **argv)
122 : {
123 : static struct option long_options[] = {
124 : {"help", no_argument, NULL, '?'},
125 : {"target-pgdata", required_argument, NULL, 'D'},
126 : {"write-recovery-conf", no_argument, NULL, 'R'},
127 : {"source-pgdata", required_argument, NULL, 1},
128 : {"source-server", required_argument, NULL, 2},
129 : {"no-ensure-shutdown", no_argument, NULL, 4},
130 : {"config-file", required_argument, NULL, 5},
131 : {"version", no_argument, NULL, 'V'},
132 : {"restore-target-wal", no_argument, NULL, 'c'},
133 : {"dry-run", no_argument, NULL, 'n'},
134 : {"no-sync", no_argument, NULL, 'N'},
135 : {"progress", no_argument, NULL, 'P'},
136 : {"debug", no_argument, NULL, 3},
137 : {"sync-method", required_argument, NULL, 6},
138 : {NULL, 0, NULL, 0}
139 : };
140 : int option_index;
141 : int c;
142 : XLogRecPtr divergerec;
143 : int lastcommontliIndex;
144 : XLogRecPtr chkptrec;
145 : TimeLineID chkpttli;
146 : XLogRecPtr chkptredo;
147 : TimeLineID source_tli;
148 : TimeLineID target_tli;
149 : XLogRecPtr target_wal_endrec;
150 : size_t size;
151 : char *buffer;
152 50 : bool no_ensure_shutdown = false;
153 : bool rewind_needed;
154 50 : bool writerecoveryconf = false;
155 : filemap_t *filemap;
156 :
157 50 : pg_logging_init(argv[0]);
158 50 : set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind"));
159 50 : progname = get_progname(argv[0]);
160 :
161 : /* Process command-line arguments */
/*
 * --help and --version are recognized only as the first argument here,
 * before getopt_long() runs, and exit immediately with status 0.
 */
162 50 : if (argc > 1)
163 : {
164 50 : if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
165 : {
166 2 : usage(progname);
167 2 : exit(0);
168 : }
169 48 : if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
170 : {
171 2 : puts("pg_rewind (PostgreSQL) " PG_VERSION);
172 2 : exit(0);
173 : }
174 : }
175 :
176 252 : while ((c = getopt_long(argc, argv, "cD:nNPR", long_options, &option_index)) != -1)
177 : {
178 208 : switch (c)
179 : {
180 2 : case 'c':
181 2 : restore_wal = true;
182 2 : break;
183 :
184 0 : case 'P':
185 0 : showprogress = true;
186 0 : break;
187 :
188 2 : case 'n':
189 2 : dry_run = true;
190 2 : break;
191 :
192 34 : case 'N':
193 34 : do_sync = false;
194 34 : break;
195 :
196 12 : case 'R':
197 12 : writerecoveryconf = true;
198 12 : break;
199 :
200 42 : case 3: /* --debug */
201 42 : debug = true;
202 42 : pg_logging_increase_verbosity();
203 42 : break;
204 :
205 44 : case 'D': /* -D or --target-pgdata */
206 44 : datadir_target = pg_strdup(optarg);
207 44 : break;
208 :
209 30 : case 1: /* --source-pgdata */
210 30 : datadir_source = pg_strdup(optarg);
211 30 : break;
212 :
213 14 : case 2: /* --source-server */
214 14 : connstr_source = pg_strdup(optarg);
215 14 : break;
216 :
217 6 : case 4: /* --no-ensure-shutdown */
218 6 : no_ensure_shutdown = true;
219 6 : break;
220 :
221 20 : case 5: /* --config-file */
222 20 : config_file = pg_strdup(optarg);
223 20 : break;
224 :
225 0 : case 6: /* --sync-method */
226 0 : if (!parse_sync_method(optarg, &sync_method))
227 0 : exit(1);
228 0 : break;
229 :
230 2 : default:
231 : /* getopt_long already emitted a complaint */
232 2 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
233 2 : exit(1);
234 : }
235 : }
236 :
/* Exactly one of --source-pgdata / --source-server must be given */
237 44 : if (datadir_source == NULL && connstr_source == NULL)
238 : {
239 2 : pg_log_error("no source specified (--source-pgdata or --source-server)");
240 2 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
241 2 : exit(1);
242 : }
243 :
244 42 : if (datadir_source != NULL && connstr_source != NULL)
245 : {
246 2 : pg_log_error("only one of --source-pgdata or --source-server can be specified");
247 2 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
248 2 : exit(1);
249 : }
250 :
251 40 : if (datadir_target == NULL)
252 : {
253 0 : pg_log_error("no target data directory specified (--target-pgdata)");
254 0 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
255 0 : exit(1);
256 : }
257 :
/* -R needs the connection string of a source server */
258 40 : if (writerecoveryconf && connstr_source == NULL)
259 : {
260 2 : pg_log_error("no source server information (--source-server) specified for --write-recovery-conf");
261 2 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
262 2 : exit(1);
263 : }
264 :
/* No positional arguments are accepted */
265 38 : if (optind < argc)
266 : {
267 2 : pg_log_error("too many command-line arguments (first is \"%s\")",
268 : argv[optind]);
269 2 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
270 2 : exit(1);
271 : }
272 :
273 : /*
274 : * Don't allow pg_rewind to be run as root, to avoid overwriting the
275 : * ownership of files in the data directory. We need only check for root
276 : * -- any other user won't have sufficient permissions to modify files in
277 : * the data directory.
278 : */
279 : #ifndef WIN32
280 36 : if (geteuid() == 0)
281 : {
282 0 : pg_log_error("cannot be executed by \"root\"");
283 0 : pg_log_error_hint("You must run %s as the PostgreSQL superuser.",
284 : progname);
285 0 : exit(1);
286 : }
287 : #endif
288 :
289 36 : get_restricted_token();
290 :
291 : /* Set mask based on PGDATA permissions */
292 36 : if (!GetDataDirectoryCreatePerm(datadir_target))
293 0 : pg_fatal("could not read permissions of directory \"%s\": %m",
294 : datadir_target);
295 :
296 36 : umask(pg_mode_mask);
297 :
298 36 : getRestoreCommand(argv[0]);
299 :
/* NOTE(review): presumably closes the source connection on early exits -- body not visible here */
300 36 : atexit(disconnect_atexit);
301 :
302 : /*
303 : * Ok, we have all the options and we're ready to start. First, connect to
304 : * remote server.
305 : */
306 36 : if (connstr_source)
307 : {
308 12 : conn = PQconnectdb(connstr_source);
309 :
310 12 : if (PQstatus(conn) == CONNECTION_BAD)
311 0 : pg_fatal("%s", PQerrorMessage(conn));
312 :
313 12 : if (showprogress)
314 0 : pg_log_info("connected to server");
315 :
316 12 : source = init_libpq_source(conn);
317 : }
318 : else
319 24 : source = init_local_source(datadir_source);
320 :
321 : /*
322 : * Check the status of the target instance.
323 : *
324 : * If the target instance was not cleanly shut down, start and stop the
325 : * target cluster once in single-user mode to enforce recovery to finish,
326 : * ensuring that the cluster can be used by pg_rewind. Note that if
327 : * no_ensure_shutdown is specified, pg_rewind ignores this step, and users
328 : * need to make sure by themselves that the target cluster is in a clean
329 : * state.
330 : */
331 36 : buffer = slurpFile(datadir_target, "global/pg_control", &size);
332 36 : digestControlFile(&ControlFile_target, buffer, size);
333 36 : pg_free(buffer);
334 :
335 36 : if (!no_ensure_shutdown &&
336 30 : ControlFile_target.state != DB_SHUTDOWNED &&
337 22 : ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
338 : {
339 20 : ensureCleanShutdown(argv[0]);
340 :
/* Re-read the target control file after ensureCleanShutdown() has run */
341 18 : buffer = slurpFile(datadir_target, "global/pg_control", &size);
342 18 : digestControlFile(&ControlFile_target, buffer, size);
343 18 : pg_free(buffer);
344 : }
345 :
346 34 : buffer = source->fetch_file(source, "global/pg_control", &size);
347 34 : digestControlFile(&ControlFile_source, buffer, size);
348 34 : pg_free(buffer);
349 :
350 34 : sanityChecks();
351 :
352 : /*
353 : * Usually, the TLI can be found in the latest checkpoint record. But if
354 : * the source server is just being promoted (or it's a standby that's
355 : * following a primary that's just being promoted), and the checkpoint
356 : * requested by the promotion hasn't completed yet, the latest timeline is
357 : * in minRecoveryPoint. So we check which is later, the TLI of the
358 : * minRecoveryPoint or the latest checkpoint.
359 : */
360 30 : source_tli = Max(ControlFile_source.minRecoveryPointTLI,
361 : ControlFile_source.checkPointCopy.ThisTimeLineID);
362 :
363 : /* Similarly for the target. */
364 30 : target_tli = Max(ControlFile_target.minRecoveryPointTLI,
365 : ControlFile_target.checkPointCopy.ThisTimeLineID);
366 :
367 : /*
368 : * Find the common ancestor timeline between the clusters.
369 : *
370 : * If both clusters are already on the same timeline, there's nothing to
371 : * do.
372 : */
373 30 : if (target_tli == source_tli)
374 : {
375 2 : pg_log_info("source and target cluster are on the same timeline");
376 2 : rewind_needed = false;
/* Never read on this path: we exit below because no rewind is needed */
377 2 : target_wal_endrec = 0;
378 : }
379 : else
380 : {
381 : XLogRecPtr chkptendrec;
382 : TimeLineHistoryEntry *sourceHistory;
383 : int sourceNentries;
384 :
385 : /*
386 : * Retrieve timelines for both source and target, and find the point
387 : * where they diverged.
388 : */
389 28 : sourceHistory = getTimelineHistory(source_tli, true, &sourceNentries);
390 28 : targetHistory = getTimelineHistory(target_tli, false, &targetNentries);
391 :
392 28 : findCommonAncestorTimeline(sourceHistory, sourceNentries,
393 : targetHistory, targetNentries,
394 : &divergerec, &lastcommontliIndex);
395 :
396 28 : pg_log_info("servers diverged at WAL location %X/%X on timeline %u",
397 : LSN_FORMAT_ARGS(divergerec),
398 : targetHistory[lastcommontliIndex].tli);
399 :
400 : /*
401 : * Don't need the source history anymore. The target history is still
402 : * needed by the routines in parsexlog.c, when we read the target WAL.
403 : */
404 28 : pfree(sourceHistory);
405 :
406 :
407 : /*
408 : * Determine the end-of-WAL on the target.
409 : *
410 : * The WAL ends at the last shutdown checkpoint, or at
411 : * minRecoveryPoint if it was a standby. (If we supported rewinding a
412 : * server that was not shut down cleanly, we would need to replay
413 : * until we reach the first invalid record, like crash recovery does.)
414 : */
415 :
416 : /* read the checkpoint record on the target to see where it ends. */
417 28 : chkptendrec = readOneRecord(datadir_target,
418 : ControlFile_target.checkPoint,
419 : targetNentries - 1,
420 : restore_command);
421 :
422 28 : if (ControlFile_target.minRecoveryPoint > chkptendrec)
423 : {
424 2 : target_wal_endrec = ControlFile_target.minRecoveryPoint;
425 : }
426 : else
427 : {
428 26 : target_wal_endrec = chkptendrec;
429 : }
430 :
431 : /*
432 : * Check for the possibility that the target is in fact a direct
433 : * ancestor of the source. In that case, there is no divergent history
434 : * in the target that needs rewinding.
435 : */
436 28 : if (target_wal_endrec > divergerec)
437 : {
438 28 : rewind_needed = true;
439 : }
440 : else
441 : {
442 : /* the last common checkpoint record must be part of target WAL */
443 : Assert(target_wal_endrec == divergerec);
444 :
445 0 : rewind_needed = false;
446 : }
447 : }
448 :
449 30 : if (!rewind_needed)
450 : {
451 2 : pg_log_info("no rewind required");
/* Still honor -R, unless this is a dry run */
452 2 : if (writerecoveryconf && !dry_run)
453 0 : WriteRecoveryConfig(conn, datadir_target,
454 : GenerateRecoveryConfig(conn, NULL,
455 : GetDbnameFromConnectionOptions(connstr_source)));
456 2 : exit(0);
457 : }
458 :
459 : /* Initialize hashtable that tracks WAL files protected from removal */
460 28 : keepwal_init();
461 :
462 28 : findLastCheckpoint(datadir_target, divergerec, lastcommontliIndex,
463 : &chkptrec, &chkpttli, &chkptredo, restore_command);
464 28 : pg_log_info("rewinding from last common checkpoint at %X/%X on timeline %u",
465 : LSN_FORMAT_ARGS(chkptrec), chkpttli);
466 :
467 : /* Initialize the hash table to track the status of each file */
468 28 : filehash_init();
469 :
470 : /*
471 : * Collect information about all files in both data directories.
472 : */
473 28 : if (showprogress)
474 0 : pg_log_info("reading source file list");
475 28 : source->traverse_files(source, &process_source_file);
476 :
477 28 : if (showprogress)
478 0 : pg_log_info("reading target file list");
479 28 : traverse_datadir(datadir_target, &process_target_file);
480 :
481 : /*
482 : * Read the target WAL from last checkpoint before the point of fork, to
483 : * extract all the pages that were modified on the target cluster after
484 : * the fork.
485 : */
486 28 : if (showprogress)
487 0 : pg_log_info("reading WAL in target");
488 28 : extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
489 : target_wal_endrec, restore_command);
490 :
491 : /*
492 : * We have collected all information we need from both systems. Decide
493 : * what to do with each file.
494 : */
495 28 : filemap = decide_file_actions();
496 28 : if (showprogress)
497 0 : calculate_totals(filemap);
498 :
499 : /* this is too verbose even for verbose mode */
500 28 : if (debug)
501 28 : print_filemap(filemap);
502 :
503 : /*
504 : * Ok, we're ready to start copying things over.
505 : */
506 28 : if (showprogress)
507 : {
508 0 : pg_log_info("need to copy %lu MB (total source directory size is %lu MB)",
509 : (unsigned long) (filemap->fetch_size / (1024 * 1024)),
510 : (unsigned long) (filemap->total_size / (1024 * 1024)));
511 :
/* Seed the counters that progress_report() reads */
512 0 : fetch_size = filemap->fetch_size;
513 0 : fetch_done = 0;
514 : }
515 :
516 : /*
517 : * We have now collected all the information we need from both systems,
518 : * and we are ready to start modifying the target directory.
519 : *
520 : * This is the point of no return. Once we start copying things, there is
521 : * no turning back!
522 : */
523 28 : perform_rewind(filemap, source, chkptrec, chkpttli, chkptredo);
524 :
525 26 : if (showprogress)
526 0 : pg_log_info("syncing target data directory");
527 26 : sync_target_dir();
528 :
529 : /* Also update the standby configuration, if requested. */
530 26 : if (writerecoveryconf && !dry_run)
531 10 : WriteRecoveryConfig(conn, datadir_target,
532 : GenerateRecoveryConfig(conn, NULL,
533 : GetDbnameFromConnectionOptions(connstr_source)));
534 :
535 : /* don't need the source connection anymore */
536 26 : source->destroy(source);
537 26 : if (conn)
538 : {
539 12 : PQfinish(conn);
540 12 : conn = NULL;
541 : }
542 :
543 26 : pg_log_info("Done!");
544 :
545 26 : return 0;
546 : }
547 :
548 : /*
549 : * Perform the rewind.
550 : *
551 : * We have already collected all the information we need from the
552 : * target and the source.
553 : */
554 : static void
555 28 : perform_rewind(filemap_t *filemap, rewind_source *source,
556 : XLogRecPtr chkptrec,
557 : TimeLineID chkpttli,
558 : XLogRecPtr chkptredo)
559 : {
560 : XLogRecPtr endrec;
561 : TimeLineID endtli;
562 : ControlFileData ControlFile_new;
563 : size_t size;
564 : char *buffer;
565 :
566 : /*
567 : * Execute the actions in the file map, fetching data from the source
568 : * system as needed.
569 : */
570 31908 : for (int i = 0; i < filemap->nentries; i++)
571 : {
572 31882 : file_entry_t *entry = filemap->entries[i];
573 :
574 : /*
575 : * If this is a relation file, copy the modified blocks.
576 : *
577 : * This is in addition to any other changes.
578 : */
579 31882 : if (entry->target_pages_to_overwrite.bitmapsize > 0)
580 : {
581 : datapagemap_iterator_t *iter;
582 : BlockNumber blkno;
583 : off_t offset;
584 :
585 820 : iter = datapagemap_iterate(&entry->target_pages_to_overwrite);
586 4142 : while (datapagemap_next(iter, &blkno))
587 : {
588 3322 : offset = blkno * BLCKSZ;
589 3322 : source->queue_fetch_range(source, entry->path, offset, BLCKSZ);
590 : }
591 820 : pg_free(iter);
592 : }
593 :
594 31882 : switch (entry->action)
595 : {
596 21546 : case FILE_ACTION_NONE:
597 : /* nothing else to do */
598 21546 : break;
599 :
600 8874 : case FILE_ACTION_COPY:
601 8874 : source->queue_fetch_file(source, entry->path, entry->source_size);
602 8872 : break;
603 :
604 8 : case FILE_ACTION_TRUNCATE:
605 8 : truncate_target_file(entry->path, entry->source_size);
606 8 : break;
607 :
608 10 : case FILE_ACTION_COPY_TAIL:
609 10 : source->queue_fetch_range(source, entry->path,
610 10 : entry->target_size,
611 10 : entry->source_size - entry->target_size);
612 10 : break;
613 :
614 1426 : case FILE_ACTION_REMOVE:
615 1426 : remove_target(entry);
616 1426 : break;
617 :
618 18 : case FILE_ACTION_CREATE:
619 18 : create_target(entry);
620 18 : break;
621 :
622 0 : case FILE_ACTION_UNDECIDED:
623 0 : pg_fatal("no action decided for file \"%s\"", entry->path);
624 : break;
625 : }
626 31880 : }
627 :
628 : /* Complete any remaining range-fetches that we queued up above. */
629 26 : source->finish_fetch(source);
630 :
631 26 : close_target_file();
632 :
633 26 : progress_report(true);
634 :
635 : /*
636 : * Fetch the control file from the source last. This ensures that the
637 : * minRecoveryPoint is up-to-date.
638 : */
639 26 : buffer = source->fetch_file(source, "global/pg_control", &size);
640 26 : digestControlFile(&ControlFile_source_after, buffer, size);
641 26 : pg_free(buffer);
642 :
643 : /*
644 : * Sanity check: If the source is a local system, the control file should
645 : * not have changed since we started.
646 : *
647 : * XXX: We assume it hasn't been modified, but actually, what could go
648 : * wrong? The logic handles a libpq source that's modified concurrently,
649 : * why not a local datadir?
650 : */
651 26 : if (datadir_source &&
652 14 : memcmp(&ControlFile_source, &ControlFile_source_after,
653 : sizeof(ControlFileData)) != 0)
654 : {
655 0 : pg_fatal("source system was modified while pg_rewind was running");
656 : }
657 :
658 26 : if (showprogress)
659 0 : pg_log_info("creating backup label and updating control file");
660 :
661 : /*
662 : * Create a backup label file, to tell the target where to begin the WAL
663 : * replay. Normally, from the last common checkpoint between the source
664 : * and the target. But if the source is a standby server, it's possible
665 : * that the last common checkpoint is *after* the standby's restartpoint.
666 : * That implies that the source server has applied the checkpoint record,
667 : * but hasn't performed a corresponding restartpoint yet. Make sure we
668 : * start at the restartpoint's redo point in that case.
669 : *
670 : * Use the old version of the source's control file for this. The server
671 : * might have finished the restartpoint after we started copying files,
672 : * but we must begin from the redo point at the time that started copying.
673 : */
674 26 : if (ControlFile_source.checkPointCopy.redo < chkptredo)
675 : {
676 4 : chkptredo = ControlFile_source.checkPointCopy.redo;
677 4 : chkpttli = ControlFile_source.checkPointCopy.ThisTimeLineID;
678 4 : chkptrec = ControlFile_source.checkPoint;
679 : }
680 26 : createBackupLabel(chkptredo, chkpttli, chkptrec);
681 :
682 : /*
683 : * Update control file of target, to tell the target how far it must
684 : * replay the WAL (minRecoveryPoint).
685 : */
686 26 : if (connstr_source)
687 : {
688 : /*
689 : * The source is a live server. Like in an online backup, it's
690 : * important that we recover all the WAL that was generated while we
691 : * were copying files.
692 : */
693 12 : if (ControlFile_source_after.state == DB_IN_ARCHIVE_RECOVERY)
694 : {
695 : /*
696 : * Source is a standby server. We must replay to its
697 : * minRecoveryPoint.
698 : */
699 2 : endrec = ControlFile_source_after.minRecoveryPoint;
700 2 : endtli = ControlFile_source_after.minRecoveryPointTLI;
701 : }
702 : else
703 : {
704 : /*
705 : * Source is a production, non-standby, server. We must replay to
706 : * the last WAL insert location.
707 : */
708 10 : if (ControlFile_source_after.state != DB_IN_PRODUCTION)
709 0 : pg_fatal("source system was in unexpected state at end of rewind");
710 :
711 10 : endrec = source->get_current_wal_insert_lsn(source);
712 10 : endtli = Max(ControlFile_source_after.checkPointCopy.ThisTimeLineID,
713 : ControlFile_source_after.minRecoveryPointTLI);
714 : }
715 : }
716 : else
717 : {
718 : /*
719 : * Source is a local data directory. It should've shut down cleanly,
720 : * and we must replay to the latest shutdown checkpoint.
721 : */
722 14 : endrec = ControlFile_source_after.checkPoint;
723 14 : endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
724 : }
725 :
726 26 : memcpy(&ControlFile_new, &ControlFile_source_after, sizeof(ControlFileData));
727 26 : ControlFile_new.minRecoveryPoint = endrec;
728 26 : ControlFile_new.minRecoveryPointTLI = endtli;
729 26 : ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY;
730 26 : if (!dry_run)
731 24 : update_controlfile(datadir_target, &ControlFile_new, do_sync);
732 26 : }
733 :
734 : static void
735 34 : sanityChecks(void)
736 : {
737 : /* TODO Check that there's no backup_label in either cluster */
738 :
739 : /* Check system_identifier match */
740 34 : if (ControlFile_target.system_identifier != ControlFile_source.system_identifier)
741 0 : pg_fatal("source and target clusters are from different systems");
742 :
743 : /* check version */
744 34 : if (ControlFile_target.pg_control_version != PG_CONTROL_VERSION ||
745 34 : ControlFile_source.pg_control_version != PG_CONTROL_VERSION ||
746 34 : ControlFile_target.catalog_version_no != CATALOG_VERSION_NO ||
747 34 : ControlFile_source.catalog_version_no != CATALOG_VERSION_NO)
748 : {
749 0 : pg_fatal("clusters are not compatible with this version of pg_rewind");
750 : }
751 :
752 : /*
753 : * Target cluster need to use checksums or hint bit wal-logging, this to
754 : * prevent from data corruption that could occur because of hint bits.
755 : */
756 34 : if (ControlFile_target.data_checksum_version != PG_DATA_CHECKSUM_VERSION &&
757 0 : !ControlFile_target.wal_log_hints)
758 : {
759 0 : pg_fatal("target server needs to use either data checksums or \"wal_log_hints = on\"");
760 : }
761 :
762 : /*
763 : * Target cluster better not be running. This doesn't guard against
764 : * someone starting the cluster concurrently. Also, this is probably more
765 : * strict than necessary; it's OK if the target node was not shut down
766 : * cleanly, as long as it isn't running at the moment.
767 : */
768 34 : if (ControlFile_target.state != DB_SHUTDOWNED &&
769 4 : ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
770 2 : pg_fatal("target server must be shut down cleanly");
771 :
772 : /*
773 : * When the source is a data directory, also require that the source
774 : * server is shut down. There isn't any very strong reason for this
775 : * limitation, but better safe than sorry.
776 : */
777 32 : if (datadir_source &&
778 20 : ControlFile_source.state != DB_SHUTDOWNED &&
779 4 : ControlFile_source.state != DB_SHUTDOWNED_IN_RECOVERY)
780 2 : pg_fatal("source data directory must be shut down cleanly");
781 30 : }
782 :
783 : /*
784 : * Print a progress report based on the fetch_size and fetch_done variables.
785 : *
786 : * Progress report is written at maximum once per second, except that the
787 : * last progress report is always printed.
788 : *
789 : * If finished is set to true, this is the last progress report. The cursor
790 : * is moved to the next line.
791 : */
792 : void
793 119852 : progress_report(bool finished)
794 : {
795 : static pg_time_t last_progress_report = 0;
796 : int percent;
797 : char fetch_done_str[32];
798 : char fetch_size_str[32];
799 : pg_time_t now;
800 :
801 119852 : if (!showprogress)
802 119852 : return;
803 :
804 0 : now = time(NULL);
805 0 : if (now == last_progress_report && !finished)
806 0 : return; /* Max once per second */
807 :
808 0 : last_progress_report = now;
809 0 : percent = fetch_size ? (int) ((fetch_done) * 100 / fetch_size) : 0;
810 :
811 : /*
812 : * Avoid overflowing past 100% or the full size. This may make the total
813 : * size number change as we approach the end of the backup (the estimate
814 : * will always be wrong if WAL is included), but that's better than having
815 : * the done column be bigger than the total.
816 : */
817 0 : if (percent > 100)
818 0 : percent = 100;
819 0 : if (fetch_done > fetch_size)
820 0 : fetch_size = fetch_done;
821 :
822 0 : snprintf(fetch_done_str, sizeof(fetch_done_str), UINT64_FORMAT,
823 : fetch_done / 1024);
824 0 : snprintf(fetch_size_str, sizeof(fetch_size_str), UINT64_FORMAT,
825 : fetch_size / 1024);
826 :
827 0 : fprintf(stderr, _("%*s/%s kB (%d%%) copied"),
828 0 : (int) strlen(fetch_size_str), fetch_done_str, fetch_size_str,
829 : percent);
830 :
831 : /*
832 : * Stay on the same line if reporting to a terminal and we're not done
833 : * yet.
834 : */
835 0 : fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr);
836 : }
837 :
838 : /*
839 : * Find minimum from two WAL locations assuming InvalidXLogRecPtr means
840 : * infinity as src/include/access/timeline.h states. This routine should
841 : * be used only when comparing WAL locations related to history files.
842 : */
843 : static XLogRecPtr
844 28 : MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
845 : {
846 28 : if (XLogRecPtrIsInvalid(a))
847 2 : return b;
848 26 : else if (XLogRecPtrIsInvalid(b))
849 26 : return a;
850 : else
851 0 : return Min(a, b);
852 : }
853 :
854 : /*
855 : * Retrieve timeline history for the source or target system.
856 : */
857 : static TimeLineHistoryEntry *
858 56 : getTimelineHistory(TimeLineID tli, bool is_source, int *nentries)
859 : {
860 : TimeLineHistoryEntry *history;
861 :
862 : /*
863 : * Timeline 1 does not have a history file, so there is no need to check
864 : * and fake an entry with infinite start and end positions.
865 : */
866 56 : if (tli == 1)
867 : {
868 26 : history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
869 26 : history->tli = tli;
870 26 : history->begin = history->end = InvalidXLogRecPtr;
871 26 : *nentries = 1;
872 : }
873 : else
874 : {
875 : char path[MAXPGPATH];
876 : char *histfile;
877 :
878 30 : TLHistoryFilePath(path, tli);
879 :
880 : /* Get history file from appropriate source */
881 30 : if (is_source)
882 26 : histfile = source->fetch_file(source, path, NULL);
883 : else
884 4 : histfile = slurpFile(datadir_target, path, NULL);
885 :
886 30 : history = rewind_parseTimeLineHistory(histfile, tli, nentries);
887 30 : pg_free(histfile);
888 : }
889 :
890 : /* In debugging mode, print what we read */
891 56 : if (debug)
892 : {
893 : int i;
894 :
895 56 : if (is_source)
896 28 : pg_log_debug("Source timeline history:");
897 : else
898 28 : pg_log_debug("Target timeline history:");
899 :
900 144 : for (i = 0; i < *nentries; i++)
901 : {
902 : TimeLineHistoryEntry *entry;
903 :
904 88 : entry = &history[i];
905 88 : pg_log_debug("%u: %X/%X - %X/%X", entry->tli,
906 : LSN_FORMAT_ARGS(entry->begin),
907 : LSN_FORMAT_ARGS(entry->end));
908 : }
909 : }
910 :
911 56 : return history;
912 : }
913 :
914 : /*
915 : * Determine the TLI of the last common timeline in the timeline history of
916 : * two clusters. *tliIndex is set to the index of last common timeline in
917 : * the arrays, and *recptr is set to the position where the timeline history
918 : * diverged (ie. the first WAL record that's not the same in both clusters).
919 : */
920 : static void
921 28 : findCommonAncestorTimeline(TimeLineHistoryEntry *a_history, int a_nentries,
922 : TimeLineHistoryEntry *b_history, int b_nentries,
923 : XLogRecPtr *recptr, int *tliIndex)
924 : {
925 : int i,
926 : n;
927 :
928 : /*
929 : * Trace the history forward, until we hit the timeline diverge. It may
930 : * still be possible that the source and target nodes used the same
931 : * timeline number in their history but with different start position
932 : * depending on the history files that each node has fetched in previous
933 : * recovery processes. Hence check the start position of the new timeline
934 : * as well and move down by one extra timeline entry if they do not match.
935 : */
936 28 : n = Min(a_nentries, b_nentries);
937 58 : for (i = 0; i < n; i++)
938 : {
939 30 : if (a_history[i].tli != b_history[i].tli ||
940 30 : a_history[i].begin != b_history[i].begin)
941 : break;
942 : }
943 :
944 28 : if (i > 0)
945 : {
946 28 : i--;
947 28 : *recptr = MinXLogRecPtr(a_history[i].end, b_history[i].end);
948 28 : *tliIndex = i;
949 28 : return;
950 : }
951 : else
952 : {
953 0 : pg_fatal("could not find common ancestor of the source and target cluster's timelines");
954 : }
955 : }
956 :
957 :
958 : /*
959 : * Create a backup_label file that forces recovery to begin at the last common
960 : * checkpoint.
961 : */
962 : static void
963 26 : createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli, XLogRecPtr checkpointloc)
964 : {
965 : XLogSegNo startsegno;
966 : time_t stamp_time;
967 : char strfbuf[128];
968 : char xlogfilename[MAXFNAMELEN];
969 : struct tm *tmp;
970 : char buf[1000];
971 : int len;
972 :
973 26 : XLByteToSeg(startpoint, startsegno, WalSegSz);
974 26 : XLogFileName(xlogfilename, starttli, startsegno, WalSegSz);
975 :
976 : /*
977 : * Construct backup label file
978 : */
979 26 : stamp_time = time(NULL);
980 26 : tmp = localtime(&stamp_time);
981 26 : strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", tmp);
982 :
983 26 : len = snprintf(buf, sizeof(buf),
984 : "START WAL LOCATION: %X/%X (file %s)\n"
985 : "CHECKPOINT LOCATION: %X/%X\n"
986 : "BACKUP METHOD: pg_rewind\n"
987 : "BACKUP FROM: standby\n"
988 : "START TIME: %s\n",
989 : /* omit LABEL: line */
990 26 : LSN_FORMAT_ARGS(startpoint), xlogfilename,
991 26 : LSN_FORMAT_ARGS(checkpointloc),
992 : strfbuf);
993 26 : if (len >= sizeof(buf))
994 0 : pg_fatal("backup label buffer too small"); /* shouldn't happen */
995 :
996 : /* TODO: move old file out of the way, if any. */
997 26 : open_target_file("backup_label", true); /* BACKUP_LABEL_FILE */
998 26 : write_target_range(buf, 0, len);
999 26 : close_target_file();
1000 26 : }
1001 :
1002 : /*
1003 : * Check CRC of control file
1004 : */
1005 : static void
1006 114 : checkControlFile(ControlFileData *ControlFile)
1007 : {
1008 : pg_crc32c crc;
1009 :
1010 : /* Calculate CRC */
1011 114 : INIT_CRC32C(crc);
1012 114 : COMP_CRC32C(crc, ControlFile, offsetof(ControlFileData, crc));
1013 114 : FIN_CRC32C(crc);
1014 :
1015 : /* And simply compare it */
1016 114 : if (!EQ_CRC32C(crc, ControlFile->crc))
1017 0 : pg_fatal("unexpected control file CRC");
1018 114 : }
1019 :
1020 : /*
1021 : * Verify control file contents in the buffer 'content', and copy it to
1022 : * *ControlFile.
1023 : */
1024 : static void
1025 114 : digestControlFile(ControlFileData *ControlFile, const char *content,
1026 : size_t size)
1027 : {
1028 114 : if (size != PG_CONTROL_FILE_SIZE)
1029 0 : pg_fatal("unexpected control file size %d, expected %d",
1030 : (int) size, PG_CONTROL_FILE_SIZE);
1031 :
1032 114 : memcpy(ControlFile, content, sizeof(ControlFileData));
1033 :
1034 : /* set and validate WalSegSz */
1035 114 : WalSegSz = ControlFile->xlog_seg_size;
1036 :
1037 114 : if (!IsValidWalSegSize(WalSegSz))
1038 : {
1039 0 : pg_log_error(ngettext("invalid WAL segment size in control file (%d byte)",
1040 : "invalid WAL segment size in control file (%d bytes)",
1041 : WalSegSz),
1042 : WalSegSz);
1043 0 : pg_log_error_detail("The WAL segment size must be a power of two between 1 MB and 1 GB.");
1044 0 : exit(1);
1045 : }
1046 :
1047 : /* Additional checks on control file */
1048 114 : checkControlFile(ControlFile);
1049 114 : }
1050 :
1051 : /*
1052 : * Get value of GUC parameter restore_command from the target cluster.
1053 : *
1054 : * This uses a logic based on "postgres -C" to get the value from the
1055 : * cluster.
1056 : */
1057 : static void
1058 36 : getRestoreCommand(const char *argv0)
1059 : {
1060 : int rc;
1061 : char postgres_exec_path[MAXPGPATH];
1062 : PQExpBuffer postgres_cmd;
1063 :
1064 36 : if (!restore_wal)
1065 34 : return;
1066 :
1067 : /* find postgres executable */
1068 2 : rc = find_other_exec(argv0, "postgres",
1069 : PG_BACKEND_VERSIONSTR,
1070 : postgres_exec_path);
1071 :
1072 2 : if (rc < 0)
1073 : {
1074 : char full_path[MAXPGPATH];
1075 :
1076 0 : if (find_my_exec(argv0, full_path) < 0)
1077 0 : strlcpy(full_path, progname, sizeof(full_path));
1078 :
1079 0 : if (rc == -1)
1080 0 : pg_fatal("program \"%s\" is needed by %s but was not found in the same directory as \"%s\"",
1081 : "postgres", progname, full_path);
1082 : else
1083 0 : pg_fatal("program \"%s\" was found by \"%s\" but was not the same version as %s",
1084 : "postgres", full_path, progname);
1085 : }
1086 :
1087 : /*
1088 : * Build a command able to retrieve the value of GUC parameter
1089 : * restore_command, if set.
1090 : */
1091 2 : postgres_cmd = createPQExpBuffer();
1092 :
1093 : /* path to postgres, properly quoted */
1094 2 : appendShellString(postgres_cmd, postgres_exec_path);
1095 :
1096 : /* add -D switch, with properly quoted data directory */
1097 2 : appendPQExpBufferStr(postgres_cmd, " -D ");
1098 2 : appendShellString(postgres_cmd, datadir_target);
1099 :
1100 : /* add custom configuration file only if requested */
1101 2 : if (config_file != NULL)
1102 : {
1103 2 : appendPQExpBufferStr(postgres_cmd, " -c config_file=");
1104 2 : appendShellString(postgres_cmd, config_file);
1105 : }
1106 :
1107 : /* add -C switch, for restore_command */
1108 2 : appendPQExpBufferStr(postgres_cmd, " -C restore_command");
1109 :
1110 2 : restore_command = pipe_read_line(postgres_cmd->data);
1111 2 : if (restore_command == NULL)
1112 0 : pg_fatal("could not read \"restore_command\" from target cluster");
1113 :
1114 2 : (void) pg_strip_crlf(restore_command);
1115 :
1116 2 : if (strcmp(restore_command, "") == 0)
1117 0 : pg_fatal("\"restore_command\" is not set in the target cluster");
1118 :
1119 2 : pg_log_debug("using for rewind \"restore_command = \'%s\'\"",
1120 : restore_command);
1121 :
1122 2 : destroyPQExpBuffer(postgres_cmd);
1123 : }
1124 :
1125 :
1126 : /*
1127 : * Ensure clean shutdown of target instance by launching single-user mode
1128 : * postgres to do crash recovery.
1129 : */
1130 : static void
1131 20 : ensureCleanShutdown(const char *argv0)
1132 : {
1133 : int ret;
1134 : char exec_path[MAXPGPATH];
1135 : PQExpBuffer postgres_cmd;
1136 :
1137 : /* locate postgres binary */
1138 20 : if ((ret = find_other_exec(argv0, "postgres",
1139 : PG_BACKEND_VERSIONSTR,
1140 : exec_path)) < 0)
1141 : {
1142 : char full_path[MAXPGPATH];
1143 :
1144 0 : if (find_my_exec(argv0, full_path) < 0)
1145 0 : strlcpy(full_path, progname, sizeof(full_path));
1146 :
1147 0 : if (ret == -1)
1148 0 : pg_fatal("program \"%s\" is needed by %s but was not found in the same directory as \"%s\"",
1149 : "postgres", progname, full_path);
1150 : else
1151 0 : pg_fatal("program \"%s\" was found by \"%s\" but was not the same version as %s",
1152 : "postgres", full_path, progname);
1153 : }
1154 :
1155 20 : pg_log_info("executing \"%s\" for target server to complete crash recovery",
1156 : exec_path);
1157 :
1158 : /*
1159 : * Skip processing if requested, but only after ensuring presence of
1160 : * postgres.
1161 : */
1162 20 : if (dry_run)
1163 0 : return;
1164 :
1165 : /*
1166 : * Finally run postgres in single-user mode. There is no need to use
1167 : * fsync here. This makes the recovery faster, and the target data folder
1168 : * is synced at the end anyway.
1169 : */
1170 20 : postgres_cmd = createPQExpBuffer();
1171 :
1172 : /* path to postgres, properly quoted */
1173 20 : appendShellString(postgres_cmd, exec_path);
1174 :
1175 : /* add set of options with properly quoted data directory */
1176 20 : appendPQExpBufferStr(postgres_cmd, " --single -F -D ");
1177 20 : appendShellString(postgres_cmd, datadir_target);
1178 :
1179 : /* add custom configuration file only if requested */
1180 20 : if (config_file != NULL)
1181 : {
1182 18 : appendPQExpBufferStr(postgres_cmd, " -c config_file=");
1183 18 : appendShellString(postgres_cmd, config_file);
1184 : }
1185 :
1186 : /* finish with the database name, and a properly quoted redirection */
1187 20 : appendPQExpBufferStr(postgres_cmd, " template1 < ");
1188 20 : appendShellString(postgres_cmd, DEVNULL);
1189 :
1190 20 : fflush(NULL);
1191 20 : if (system(postgres_cmd->data) != 0)
1192 : {
1193 2 : pg_log_error("postgres single-user mode in target cluster failed");
1194 2 : pg_log_error_detail("Command was: %s", postgres_cmd->data);
1195 2 : exit(1);
1196 : }
1197 :
1198 18 : destroyPQExpBuffer(postgres_cmd);
1199 : }
1200 :
1201 : static void
1202 36 : disconnect_atexit(void)
1203 : {
1204 36 : if (conn != NULL)
1205 0 : PQfinish(conn);
1206 36 : }
|