Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_rewind.c
4 : * Synchronizes a PostgreSQL data directory to a new timeline
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : *
8 : *-------------------------------------------------------------------------
9 : */
10 : #include "postgres_fe.h"
11 :
12 : #include <sys/stat.h>
13 : #include <fcntl.h>
14 : #include <time.h>
15 : #include <unistd.h>
16 :
17 : #include "access/timeline.h"
18 : #include "access/xlog_internal.h"
19 : #include "catalog/catversion.h"
20 : #include "catalog/pg_control.h"
21 : #include "common/controldata_utils.h"
22 : #include "common/file_perm.h"
23 : #include "common/restricted_token.h"
24 : #include "common/string.h"
25 : #include "fe_utils/option_utils.h"
26 : #include "fe_utils/recovery_gen.h"
27 : #include "fe_utils/string_utils.h"
28 : #include "file_ops.h"
29 : #include "filemap.h"
30 : #include "getopt_long.h"
31 : #include "pg_rewind.h"
32 : #include "rewind_source.h"
33 : #include "storage/bufpage.h"
34 :
/* Forward declarations of the file-local routines defined later in this file. */
35 : static void usage(const char *progname);
36 :
37 : static void perform_rewind(filemap_t *filemap, rewind_source *source,
38 : XLogRecPtr chkptrec,
39 : TimeLineID chkpttli,
40 : XLogRecPtr chkptredo);
41 :
42 : static void createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli,
43 : XLogRecPtr checkpointloc);
44 :
45 : static void digestControlFile(ControlFileData *ControlFile,
46 : const char *content, size_t size);
47 : static void getRestoreCommand(const char *argv0);
48 : static void sanityChecks(void);
49 : static TimeLineHistoryEntry *getTimelineHistory(TimeLineID tli, bool is_source,
50 : int *nentries);
51 : static void findCommonAncestorTimeline(TimeLineHistoryEntry *a_history,
52 : int a_nentries,
53 : TimeLineHistoryEntry *b_history,
54 : int b_nentries,
55 : XLogRecPtr *recptr, int *tliIndex);
56 : static void ensureCleanShutdown(const char *argv0);
57 : static void disconnect_atexit(void);
58 :
/*
 * Decoded control files.  ControlFile_target and ControlFile_source are
 * filled in by main(); ControlFile_source_after is filled in by
 * perform_rewind() from a second read of the source's control file once the
 * queued fetches have been completed.
 */
59 : static ControlFileData ControlFile_target;
60 : static ControlFileData ControlFile_source;
61 : static ControlFileData ControlFile_source_after;
62 :
63 : static const char *progname;
/*
 * WAL segment size handed to XLByteToSeg()/XLogFileName() when building the
 * backup label.  NOTE(review): it is assigned outside the code visible here
 * -- confirm where.
 */
64 : int WalSegSz;
65 :
66 : /* Configuration options */
67 : char *datadir_target = NULL; /* -D, --target-pgdata */
68 : static char *datadir_source = NULL; /* --source-pgdata */
69 : static char *connstr_source = NULL; /* --source-server */
/* NOTE(review): not assigned in main(); presumably set by getRestoreCommand() -- confirm */
70 : static char *restore_command = NULL;
71 : static char *config_file = NULL; /* --config-file */
72 :
73 : static bool debug = false; /* --debug */
74 : bool showprogress = false; /* -P, --progress */
75 : bool dry_run = false; /* -n, --dry-run */
76 : bool do_sync = true; /* cleared by -N, --no-sync */
77 : static bool restore_wal = false; /* -c, --restore-target-wal */
78 : DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC; /* --sync-method */
79 :
80 : /* Target history, loaded by main() through getTimelineHistory() */
81 : TimeLineHistoryEntry *targetHistory;
82 : int targetNentries;
83 :
84 : /*
 * Progress counters, read by progress_report().  main() initializes them
 * (fetch_size from the file map, fetch_done to 0) only when progress
 * reporting is enabled.
 */
85 : uint64 fetch_size;
86 : uint64 fetch_done;
87 :
/*
 * conn is the libpq connection, opened only when --source-server is given;
 * source is the handle through which main() and perform_rewind() read from
 * the source (libpq-backed or local-directory-backed).
 */
88 : static PGconn *conn;
89 : static rewind_source *source;
90 :
/*
 * Print the command-line help text to stdout.
 *
 * progname is interpolated into the banner and the "Usage:" line.  Every
 * message is wrapped in _() and printed with its own printf() call; the
 * closing lines add the bug-report address and the project home page taken
 * from PACKAGE_BUGREPORT, PACKAGE_NAME and PACKAGE_URL.
 */
91 : static void
92 2 : usage(const char *progname)
93 : {
94 2 : printf(_("%s resynchronizes a PostgreSQL cluster with another copy of the cluster.\n\n"), progname);
95 2 : printf(_("Usage:\n %s [OPTION]...\n\n"), progname);
96 2 : printf(_("Options:\n"));
97 2 : printf(_(" -c, --restore-target-wal use \"restore_command\" in target configuration to\n"
98 : " retrieve WAL files from archives\n"));
99 2 : printf(_(" -D, --target-pgdata=DIRECTORY existing data directory to modify\n"));
100 2 : printf(_(" --source-pgdata=DIRECTORY source data directory to synchronize with\n"));
101 2 : printf(_(" --source-server=CONNSTR source server to synchronize with\n"));
102 2 : printf(_(" -n, --dry-run stop before modifying anything\n"));
103 2 : printf(_(" -N, --no-sync do not wait for changes to be written\n"
104 : " safely to disk\n"));
105 2 : printf(_(" -P, --progress write progress messages\n"));
106 2 : printf(_(" -R, --write-recovery-conf write configuration for replication\n"
107 : " (requires --source-server)\n"));
108 2 : printf(_(" --config-file=FILENAME use specified main server configuration\n"
109 : " file when running target cluster\n"));
110 2 : printf(_(" --debug write a lot of debug messages\n"));
111 2 : printf(_(" --no-ensure-shutdown do not automatically fix unclean shutdown\n"));
112 2 : printf(_(" --sync-method=METHOD set method for syncing files to disk\n"));
113 2 : printf(_(" -V, --version output version information, then exit\n"));
114 2 : printf(_(" -?, --help show this help, then exit\n"));
115 2 : printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
116 2 : printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL);
117 2 : }
118 :
119 :
/*
 * pg_rewind entry point.
 *
 * Steps, in order:
 *   1. parse the command line and reject invalid option combinations;
 *   2. refuse to run as root (non-Windows builds);
 *   3. open the source, either a libpq connection (--source-server) or a
 *      local data directory (--source-pgdata);
 *   4. read the target's control file, running ensureCleanShutdown() first
 *      if the target was not shut down cleanly (unless --no-ensure-shutdown),
 *      then read the source's control file and run sanityChecks();
 *   5. compare timelines: if both sides are on the same timeline, or the
 *      target has no WAL beyond the divergence point, report "no rewind
 *      required" and exit(0);
 *   6. otherwise build the file map, call perform_rewind(), sync the target
 *      directory and optionally write the recovery configuration (-R).
 *
 * Returns 0 on success.  Error paths leave through exit(1) or pg_fatal().
 */
120 : int
121 52 : main(int argc, char **argv)
122 : {
/*
 * Long options.  The numeric values 1..6 identify options that have no
 * single-letter form; they are dispatched in the switch below.
 */
123 : static struct option long_options[] = {
124 : {"help", no_argument, NULL, '?'},
125 : {"target-pgdata", required_argument, NULL, 'D'},
126 : {"write-recovery-conf", no_argument, NULL, 'R'},
127 : {"source-pgdata", required_argument, NULL, 1},
128 : {"source-server", required_argument, NULL, 2},
129 : {"no-ensure-shutdown", no_argument, NULL, 4},
130 : {"config-file", required_argument, NULL, 5},
131 : {"version", no_argument, NULL, 'V'},
132 : {"restore-target-wal", no_argument, NULL, 'c'},
133 : {"dry-run", no_argument, NULL, 'n'},
134 : {"no-sync", no_argument, NULL, 'N'},
135 : {"progress", no_argument, NULL, 'P'},
136 : {"debug", no_argument, NULL, 3},
137 : {"sync-method", required_argument, NULL, 6},
138 : {NULL, 0, NULL, 0}
139 : };
140 : int option_index;
141 : int c;
142 : XLogRecPtr divergerec;
143 : int lastcommontliIndex;
144 : XLogRecPtr chkptrec;
145 : TimeLineID chkpttli;
146 : XLogRecPtr chkptredo;
147 : TimeLineID source_tli;
148 : TimeLineID target_tli;
149 : XLogRecPtr target_wal_endrec;
150 : XLogSegNo last_common_segno;
151 : size_t size;
152 : char *buffer;
153 52 : bool no_ensure_shutdown = false;
154 : bool rewind_needed;
155 52 : bool writerecoveryconf = false;
156 : filemap_t *filemap;
157 :
158 52 : pg_logging_init(argv[0]);
159 52 : set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind"));
160 52 : progname = get_progname(argv[0]);
161 :
162 : /* Process command-line arguments */
/* --help and --version are recognized here only as the first argument. */
163 52 : if (argc > 1)
164 : {
165 52 : if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
166 : {
167 2 : usage(progname);
168 2 : exit(0);
169 : }
170 50 : if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
171 : {
172 2 : puts("pg_rewind (PostgreSQL) " PG_VERSION);
173 2 : exit(0);
174 : }
175 : }
176 :
177 262 : while ((c = getopt_long(argc, argv, "cD:nNPR", long_options, &option_index)) != -1)
178 : {
179 216 : switch (c)
180 : {
181 2 : case 'c':
182 2 : restore_wal = true;
183 2 : break;
184 :
185 0 : case 'P':
186 0 : showprogress = true;
187 0 : break;
188 :
189 2 : case 'n':
190 2 : dry_run = true;
191 2 : break;
192 :
193 36 : case 'N':
194 36 : do_sync = false;
195 36 : break;
196 :
197 12 : case 'R':
198 12 : writerecoveryconf = true;
199 12 : break;
200 :
201 44 : case 3: /* --debug */
202 44 : debug = true;
203 44 : pg_logging_increase_verbosity();
204 44 : break;
205 :
206 46 : case 'D': /* -D or --target-pgdata */
207 46 : datadir_target = pg_strdup(optarg);
208 46 : break;
209 :
210 32 : case 1: /* --source-pgdata */
211 32 : datadir_source = pg_strdup(optarg);
212 32 : break;
213 :
214 14 : case 2: /* --source-server */
215 14 : connstr_source = pg_strdup(optarg);
216 14 : break;
217 :
218 6 : case 4: /* --no-ensure-shutdown */
219 6 : no_ensure_shutdown = true;
220 6 : break;
221 :
222 20 : case 5: /* --config-file */
223 20 : config_file = pg_strdup(optarg);
224 20 : break;
225 :
226 0 : case 6: /* --sync-method */
227 0 : if (!parse_sync_method(optarg, &sync_method))
228 0 : exit(1);
229 0 : break;
230 :
231 2 : default:
232 : /* getopt_long already emitted a complaint */
233 2 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
234 2 : exit(1);
235 : }
236 : }
237 :
/* Exactly one kind of source must be given. */
238 46 : if (datadir_source == NULL && connstr_source == NULL)
239 : {
240 2 : pg_log_error("no source specified (--source-pgdata or --source-server)");
241 2 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
242 2 : exit(1);
243 : }
244 :
245 44 : if (datadir_source != NULL && connstr_source != NULL)
246 : {
247 2 : pg_log_error("only one of --source-pgdata or --source-server can be specified");
248 2 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
249 2 : exit(1);
250 : }
251 :
252 42 : if (datadir_target == NULL)
253 : {
254 0 : pg_log_error("no target data directory specified (--target-pgdata)");
255 0 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
256 0 : exit(1);
257 : }
258 :
/* -R passes the connection to GenerateRecoveryConfig(), so it needs --source-server. */
259 42 : if (writerecoveryconf && connstr_source == NULL)
260 : {
261 2 : pg_log_error("no source server information (--source-server) specified for --write-recovery-conf");
262 2 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
263 2 : exit(1);
264 : }
265 :
/* Any leftover non-option argument is an error. */
266 40 : if (optind < argc)
267 : {
268 2 : pg_log_error("too many command-line arguments (first is \"%s\")",
269 : argv[optind]);
270 2 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
271 2 : exit(1);
272 : }
273 :
274 : /*
275 : * Don't allow pg_rewind to be run as root, to avoid overwriting the
276 : * ownership of files in the data directory. We need only check for root
277 : * -- any other user won't have sufficient permissions to modify files in
278 : * the data directory.
279 : */
280 : #ifndef WIN32
281 38 : if (geteuid() == 0)
282 : {
283 0 : pg_log_error("cannot be executed by \"root\"");
284 0 : pg_log_error_hint("You must run %s as the PostgreSQL superuser.",
285 : progname);
286 0 : exit(1);
287 : }
288 : #endif
289 :
290 38 : get_restricted_token();
291 :
292 : /* Set mask based on PGDATA permissions */
293 38 : if (!GetDataDirectoryCreatePerm(datadir_target))
294 0 : pg_fatal("could not read permissions of directory \"%s\": %m",
295 : datadir_target);
296 :
297 38 : umask(pg_mode_mask);
298 :
299 38 : getRestoreCommand(argv[0]);
300 :
/*
 * Registered before the source connection is opened below.
 * NOTE(review): disconnect_atexit() is defined outside this view;
 * presumably it closes conn when we exit early -- confirm.
 */
301 38 : atexit(disconnect_atexit);
302 :
303 : /* Ok, we have all the options and we're ready to start. */
304 38 : if (dry_run)
305 2 : pg_log_info("Executing in dry-run mode.\n"
306 : "The target directory will not be modified.");
307 :
308 : /* First, connect to remote server. */
309 38 : if (connstr_source)
310 : {
311 12 : conn = PQconnectdb(connstr_source);
312 :
313 12 : if (PQstatus(conn) == CONNECTION_BAD)
314 0 : pg_fatal("%s", PQerrorMessage(conn));
315 :
316 12 : if (showprogress)
317 0 : pg_log_info("connected to server");
318 :
319 12 : source = init_libpq_source(conn);
320 : }
321 : else
322 26 : source = init_local_source(datadir_source);
323 :
324 : /*
325 : * Check the status of the target instance.
326 : *
327 : * If the target instance was not cleanly shut down, start and stop the
328 : * target cluster once in single-user mode to enforce recovery to finish,
329 : * ensuring that the cluster can be used by pg_rewind. Note that if
330 : * no_ensure_shutdown is specified, pg_rewind ignores this step, and users
331 : * need to make sure by themselves that the target cluster is in a clean
332 : * state.
333 : */
334 38 : buffer = slurpFile(datadir_target, XLOG_CONTROL_FILE, &size);
335 38 : digestControlFile(&ControlFile_target, buffer, size);
336 38 : pg_free(buffer);
337 :
338 38 : if (!no_ensure_shutdown &&
339 32 : ControlFile_target.state != DB_SHUTDOWNED &&
340 22 : ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
341 : {
342 20 : ensureCleanShutdown(argv[0]);
343 :
/* Re-read the control file after the cleanup run. */
344 18 : buffer = slurpFile(datadir_target, XLOG_CONTROL_FILE, &size);
345 18 : digestControlFile(&ControlFile_target, buffer, size);
346 18 : pg_free(buffer);
347 : }
348 :
349 36 : buffer = source->fetch_file(source, XLOG_CONTROL_FILE, &size);
350 36 : digestControlFile(&ControlFile_source, buffer, size);
351 36 : pg_free(buffer);
352 :
353 36 : sanityChecks();
354 :
355 : /*
356 : * Usually, the TLI can be found in the latest checkpoint record. But if
357 : * the source server is just being promoted (or it's a standby that's
358 : * following a primary that's just being promoted), and the checkpoint
359 : * requested by the promotion hasn't completed yet, the latest timeline is
360 : * in minRecoveryPoint. So we check which is later, the TLI of the
361 : * minRecoveryPoint or the latest checkpoint.
362 : */
363 32 : source_tli = Max(ControlFile_source.minRecoveryPointTLI,
364 : ControlFile_source.checkPointCopy.ThisTimeLineID);
365 :
366 : /* Similarly for the target. */
367 32 : target_tli = Max(ControlFile_target.minRecoveryPointTLI,
368 : ControlFile_target.checkPointCopy.ThisTimeLineID);
369 :
370 : /*
371 : * Find the common ancestor timeline between the clusters.
372 : *
373 : * If both clusters are already on the same timeline, there's nothing to
374 : * do.
375 : */
376 32 : if (target_tli == source_tli)
377 : {
378 2 : pg_log_info("source and target cluster are on the same timeline");
379 2 : rewind_needed = false;
380 2 : target_wal_endrec = 0; /* not used: rewind_needed is false, so we exit below */
381 : }
382 : else
383 : {
384 : XLogRecPtr chkptendrec;
385 : TimeLineHistoryEntry *sourceHistory;
386 : int sourceNentries;
387 :
388 : /*
389 : * Retrieve timelines for both source and target, and find the point
390 : * where they diverged.
391 : */
392 30 : sourceHistory = getTimelineHistory(source_tli, true, &sourceNentries);
393 30 : targetHistory = getTimelineHistory(target_tli, false, &targetNentries);
394 :
395 30 : findCommonAncestorTimeline(sourceHistory, sourceNentries,
396 : targetHistory, targetNentries,
397 : &divergerec, &lastcommontliIndex);
398 :
399 30 : pg_log_info("servers diverged at WAL location %X/%08X on timeline %u",
400 : LSN_FORMAT_ARGS(divergerec),
401 : targetHistory[lastcommontliIndex].tli);
402 :
403 : /*
404 : * Convert the divergence LSN to a segment number, that will be used
405 : * to decide how WAL segments should be processed.
406 : */
407 30 : XLByteToSeg(divergerec, last_common_segno, ControlFile_target.xlog_seg_size);
408 :
409 : /*
410 : * Don't need the source history anymore. The target history is still
411 : * needed by the routines in parsexlog.c, when we read the target WAL.
412 : */
413 30 : pfree(sourceHistory);
414 :
415 :
416 : /*
417 : * Determine the end-of-WAL on the target.
418 : *
419 : * The WAL ends at the last shutdown checkpoint, or at
420 : * minRecoveryPoint if it was a standby. (If we supported rewinding a
421 : * server that was not shut down cleanly, we would need to replay
422 : * until we reach the first invalid record, like crash recovery does.)
423 : */
424 :
425 : /* read the checkpoint record on the target to see where it ends. */
426 30 : chkptendrec = readOneRecord(datadir_target,
427 : ControlFile_target.checkPoint,
428 : targetNentries - 1,
429 : restore_command);
430 :
/* The end of the target's WAL is the later of the two candidates. */
431 30 : if (ControlFile_target.minRecoveryPoint > chkptendrec)
432 : {
433 2 : target_wal_endrec = ControlFile_target.minRecoveryPoint;
434 : }
435 : else
436 : {
437 28 : target_wal_endrec = chkptendrec;
438 : }
439 :
440 : /*
441 : * Check for the possibility that the target is in fact a direct
442 : * ancestor of the source. In that case, there is no divergent history
443 : * in the target that needs rewinding.
444 : */
445 30 : if (target_wal_endrec > divergerec)
446 : {
447 30 : rewind_needed = true;
448 : }
449 : else
450 : {
451 : /* the last common checkpoint record must be part of target WAL */
452 : Assert(target_wal_endrec == divergerec);
453 :
454 0 : rewind_needed = false;
455 : }
456 : }
457 :
458 32 : if (!rewind_needed)
459 : {
460 2 : pg_log_info("no rewind required");
/* Even without a rewind, honor -R unless this is a dry run. */
461 2 : if (writerecoveryconf && !dry_run)
462 0 : WriteRecoveryConfig(conn, datadir_target,
463 : GenerateRecoveryConfig(conn, NULL,
464 : GetDbnameFromConnectionOptions(connstr_source)));
465 2 : exit(0);
466 : }
467 :
468 : /* Initialize hashtable that tracks WAL files protected from removal */
469 30 : keepwal_init();
470 :
471 30 : findLastCheckpoint(datadir_target, divergerec, lastcommontliIndex,
472 : &chkptrec, &chkpttli, &chkptredo, restore_command);
473 30 : pg_log_info("rewinding from last common checkpoint at %X/%08X on timeline %u",
474 : LSN_FORMAT_ARGS(chkptrec), chkpttli);
475 :
476 : /* Initialize the hash table to track the status of each file */
477 30 : filehash_init();
478 :
479 : /*
480 : * Collect information about all files in both data directories.
481 : */
482 30 : if (showprogress)
483 0 : pg_log_info("reading source file list");
484 30 : source->traverse_files(source, &process_source_file);
485 :
486 30 : if (showprogress)
487 0 : pg_log_info("reading target file list");
488 30 : traverse_datadir(datadir_target, &process_target_file);
489 :
490 : /*
491 : * Read the target WAL from last checkpoint before the point of fork, to
492 : * extract all the pages that were modified on the target cluster after
493 : * the fork.
494 : */
495 30 : if (showprogress)
496 0 : pg_log_info("reading WAL in target");
497 30 : extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
498 : target_wal_endrec, restore_command);
499 :
500 : /*
501 : * We have collected all information we need from both systems. Decide
502 : * what to do with each file.
503 : */
504 30 : filemap = decide_file_actions(last_common_segno);
505 30 : if (showprogress)
506 0 : calculate_totals(filemap);
507 :
508 : /* this is too verbose even for verbose mode */
509 30 : if (debug)
510 30 : print_filemap(filemap);
511 :
512 : /*
513 : * Ok, we're ready to start copying things over.
514 : */
515 30 : if (showprogress)
516 : {
517 0 : pg_log_info("need to copy %lu MB (total source directory size is %lu MB)",
518 : (unsigned long) (filemap->fetch_size / (1024 * 1024)),
519 : (unsigned long) (filemap->total_size / (1024 * 1024)));
520 :
/* Seed the global counters that progress_report() reads. */
521 0 : fetch_size = filemap->fetch_size;
522 0 : fetch_done = 0;
523 : }
524 :
525 : /*
526 : * We have now collected all the information we need from both systems,
527 : * and we are ready to start modifying the target directory.
528 : *
529 : * This is the point of no return. Once we start copying things, there is
530 : * no turning back!
531 : */
532 30 : perform_rewind(filemap, source, chkptrec, chkpttli, chkptredo);
533 :
534 28 : if (showprogress)
535 0 : pg_log_info("syncing target data directory");
536 28 : sync_target_dir();
537 :
538 : /* Also update the standby configuration, if requested. */
539 28 : if (writerecoveryconf && !dry_run)
540 10 : WriteRecoveryConfig(conn, datadir_target,
541 : GenerateRecoveryConfig(conn, NULL,
542 : GetDbnameFromConnectionOptions(connstr_source)));
543 :
544 : /* don't need the source connection anymore */
545 28 : source->destroy(source);
546 28 : if (conn)
547 : {
548 12 : PQfinish(conn);
/* cleared so that no later code reuses the finished connection */
549 12 : conn = NULL;
550 : }
551 :
552 28 : pg_log_info("Done!");
553 :
554 28 : return 0;
555 : }
556 :
557 : /*
558 : * Perform the rewind.
559 : *
560 : * We have already collected all the information we need from the
561 : * target and the source.
562 : */
563 : static void
564 30 : perform_rewind(filemap_t *filemap, rewind_source *source,
565 : XLogRecPtr chkptrec,
566 : TimeLineID chkpttli,
567 : XLogRecPtr chkptredo)
568 : {
569 : XLogRecPtr endrec;
570 : TimeLineID endtli;
571 : ControlFileData ControlFile_new;
572 : size_t size;
573 : char *buffer;
574 :
575 : /*
576 : * Execute the actions in the file map, fetching data from the source
577 : * system as needed.
578 : */
579 33882 : for (int i = 0; i < filemap->nentries; i++)
580 : {
581 33854 : file_entry_t *entry = filemap->entries[i];
582 :
583 : /*
584 : * If this is a relation file, copy the modified blocks.
585 : *
586 : * This is in addition to any other changes.
587 : */
588 33854 : if (entry->target_pages_to_overwrite.bitmapsize > 0)
589 : {
590 : datapagemap_iterator_t *iter;
591 : BlockNumber blkno;
592 : off_t offset;
593 :
594 842 : iter = datapagemap_iterate(&entry->target_pages_to_overwrite);
595 4210 : while (datapagemap_next(iter, &blkno))
596 : {
597 3368 : offset = blkno * BLCKSZ;
598 3368 : source->queue_fetch_range(source, entry->path, offset, BLCKSZ);
599 : }
600 842 : pg_free(iter);
601 : }
602 :
603 33854 : switch (entry->action)
604 : {
605 23024 : case FILE_ACTION_NONE:
606 : /* nothing else to do */
607 23024 : break;
608 :
609 9352 : case FILE_ACTION_COPY:
610 9352 : source->queue_fetch_file(source, entry->path, entry->source_size);
611 9350 : break;
612 :
613 8 : case FILE_ACTION_TRUNCATE:
614 8 : truncate_target_file(entry->path, entry->source_size);
615 8 : break;
616 :
617 10 : case FILE_ACTION_COPY_TAIL:
618 10 : source->queue_fetch_range(source, entry->path,
619 10 : entry->target_size,
620 10 : entry->source_size - entry->target_size);
621 10 : break;
622 :
623 1442 : case FILE_ACTION_REMOVE:
624 1442 : remove_target(entry);
625 1442 : break;
626 :
627 18 : case FILE_ACTION_CREATE:
628 18 : create_target(entry);
629 18 : break;
630 :
631 0 : case FILE_ACTION_UNDECIDED:
632 0 : pg_fatal("no action decided for file \"%s\"", entry->path);
633 : break;
634 : }
635 : }
636 :
637 : /* Complete any remaining range-fetches that we queued up above. */
638 28 : source->finish_fetch(source);
639 :
640 28 : close_target_file();
641 :
642 28 : progress_report(true);
643 :
644 : /*
645 : * Fetch the control file from the source last. This ensures that the
646 : * minRecoveryPoint is up-to-date.
647 : */
648 28 : buffer = source->fetch_file(source, XLOG_CONTROL_FILE, &size);
649 28 : digestControlFile(&ControlFile_source_after, buffer, size);
650 28 : pg_free(buffer);
651 :
652 : /*
653 : * Sanity check: If the source is a local system, the control file should
654 : * not have changed since we started.
655 : *
656 : * XXX: We assume it hasn't been modified, but actually, what could go
657 : * wrong? The logic handles a libpq source that's modified concurrently,
658 : * why not a local datadir?
659 : */
660 28 : if (datadir_source &&
661 16 : memcmp(&ControlFile_source, &ControlFile_source_after,
662 : sizeof(ControlFileData)) != 0)
663 : {
664 0 : pg_fatal("source system was modified while pg_rewind was running");
665 : }
666 :
667 28 : if (showprogress)
668 0 : pg_log_info("creating backup label and updating control file");
669 :
670 : /*
671 : * Create a backup label file, to tell the target where to begin the WAL
672 : * replay. Normally, from the last common checkpoint between the source
673 : * and the target. But if the source is a standby server, it's possible
674 : * that the last common checkpoint is *after* the standby's restartpoint.
675 : * That implies that the source server has applied the checkpoint record,
676 : * but hasn't performed a corresponding restartpoint yet. Make sure we
677 : * start at the restartpoint's redo point in that case.
678 : *
679 : * Use the old version of the source's control file for this. The server
680 : * might have finished the restartpoint after we started copying files,
681 : * but we must begin from the redo point at the time that started copying.
682 : */
683 28 : if (ControlFile_source.checkPointCopy.redo < chkptredo)
684 : {
685 4 : chkptredo = ControlFile_source.checkPointCopy.redo;
686 4 : chkpttli = ControlFile_source.checkPointCopy.ThisTimeLineID;
687 4 : chkptrec = ControlFile_source.checkPoint;
688 : }
689 28 : createBackupLabel(chkptredo, chkpttli, chkptrec);
690 :
691 : /*
692 : * Update control file of target, to tell the target how far it must
693 : * replay the WAL (minRecoveryPoint).
694 : */
695 28 : if (connstr_source)
696 : {
697 : /*
698 : * The source is a live server. Like in an online backup, it's
699 : * important that we recover all the WAL that was generated while we
700 : * were copying files.
701 : */
702 12 : if (ControlFile_source_after.state == DB_IN_ARCHIVE_RECOVERY)
703 : {
704 : /*
705 : * Source is a standby server. We must replay to its
706 : * minRecoveryPoint.
707 : */
708 2 : endrec = ControlFile_source_after.minRecoveryPoint;
709 2 : endtli = ControlFile_source_after.minRecoveryPointTLI;
710 : }
711 : else
712 : {
713 : /*
714 : * Source is a production, non-standby, server. We must replay to
715 : * the last WAL insert location.
716 : */
717 10 : if (ControlFile_source_after.state != DB_IN_PRODUCTION)
718 0 : pg_fatal("source system was in unexpected state at end of rewind");
719 :
720 10 : endrec = source->get_current_wal_insert_lsn(source);
721 10 : endtli = Max(ControlFile_source_after.checkPointCopy.ThisTimeLineID,
722 : ControlFile_source_after.minRecoveryPointTLI);
723 : }
724 : }
725 : else
726 : {
727 : /*
728 : * Source is a local data directory. It should've shut down cleanly,
729 : * and we must replay to the latest shutdown checkpoint.
730 : */
731 16 : endrec = ControlFile_source_after.checkPoint;
732 16 : endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
733 : }
734 :
735 28 : memcpy(&ControlFile_new, &ControlFile_source_after, sizeof(ControlFileData));
736 28 : ControlFile_new.minRecoveryPoint = endrec;
737 28 : ControlFile_new.minRecoveryPointTLI = endtli;
738 28 : ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY;
739 28 : if (!dry_run)
740 26 : update_controlfile(datadir_target, &ControlFile_new, do_sync);
741 28 : }
742 :
743 : static void
744 36 : sanityChecks(void)
745 : {
746 : /* TODO Check that there's no backup_label in either cluster */
747 :
748 : /* Check system_identifier match */
749 36 : if (ControlFile_target.system_identifier != ControlFile_source.system_identifier)
750 0 : pg_fatal("source and target clusters are from different systems");
751 :
752 : /* check version */
753 36 : if (ControlFile_target.pg_control_version != PG_CONTROL_VERSION ||
754 36 : ControlFile_source.pg_control_version != PG_CONTROL_VERSION ||
755 36 : ControlFile_target.catalog_version_no != CATALOG_VERSION_NO ||
756 36 : ControlFile_source.catalog_version_no != CATALOG_VERSION_NO)
757 : {
758 0 : pg_fatal("clusters are not compatible with this version of pg_rewind");
759 : }
760 :
761 : /*
762 : * Target cluster need to use checksums or hint bit wal-logging, this to
763 : * prevent from data corruption that could occur because of hint bits.
764 : */
765 36 : if (ControlFile_target.data_checksum_version != PG_DATA_CHECKSUM_VERSION &&
766 0 : !ControlFile_target.wal_log_hints)
767 : {
768 0 : pg_fatal("target server needs to use either data checksums or \"wal_log_hints = on\"");
769 : }
770 :
771 : /*
772 : * Target cluster better not be running. This doesn't guard against
773 : * someone starting the cluster concurrently. Also, this is probably more
774 : * strict than necessary; it's OK if the target node was not shut down
775 : * cleanly, as long as it isn't running at the moment.
776 : */
777 36 : if (ControlFile_target.state != DB_SHUTDOWNED &&
778 4 : ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
779 2 : pg_fatal("target server must be shut down cleanly");
780 :
781 : /*
782 : * When the source is a data directory, also require that the source
783 : * server is shut down. There isn't any very strong reason for this
784 : * limitation, but better safe than sorry.
785 : */
786 34 : if (datadir_source &&
787 22 : ControlFile_source.state != DB_SHUTDOWNED &&
788 4 : ControlFile_source.state != DB_SHUTDOWNED_IN_RECOVERY)
789 2 : pg_fatal("source data directory must be shut down cleanly");
790 32 : }
791 :
792 : /*
793 : * Print a progress report based on the fetch_size and fetch_done variables.
794 : *
795 : * Progress report is written at maximum once per second, except that the
796 : * last progress report is always printed.
797 : *
798 : * If finished is set to true, this is the last progress report. The cursor
799 : * is moved to the next line.
800 : */
801 : void
802 104328 : progress_report(bool finished)
803 : {
804 : static pg_time_t last_progress_report = 0;
805 : int percent;
806 : char fetch_done_str[32];
807 : char fetch_size_str[32];
808 : pg_time_t now;
809 :
810 104328 : if (!showprogress)
811 104328 : return;
812 :
813 0 : now = time(NULL);
814 0 : if (now == last_progress_report && !finished)
815 0 : return; /* Max once per second */
816 :
817 0 : last_progress_report = now;
818 0 : percent = fetch_size ? (int) ((fetch_done) * 100 / fetch_size) : 0;
819 :
820 : /*
821 : * Avoid overflowing past 100% or the full size. This may make the total
822 : * size number change as we approach the end of the backup (the estimate
823 : * will always be wrong if WAL is included), but that's better than having
824 : * the done column be bigger than the total.
825 : */
826 0 : if (percent > 100)
827 0 : percent = 100;
828 0 : if (fetch_done > fetch_size)
829 0 : fetch_size = fetch_done;
830 :
831 0 : snprintf(fetch_done_str, sizeof(fetch_done_str), UINT64_FORMAT,
832 : fetch_done / 1024);
833 0 : snprintf(fetch_size_str, sizeof(fetch_size_str), UINT64_FORMAT,
834 : fetch_size / 1024);
835 :
836 0 : fprintf(stderr, _("%*s/%s kB (%d%%) copied"),
837 0 : (int) strlen(fetch_size_str), fetch_done_str, fetch_size_str,
838 : percent);
839 :
840 : /*
841 : * Stay on the same line if reporting to a terminal and we're not done
842 : * yet.
843 : */
844 0 : fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr);
845 : }
846 :
847 : /*
848 : * Find minimum from two WAL locations assuming InvalidXLogRecPtr means
849 : * infinity as src/include/access/timeline.h states. This routine should
850 : * be used only when comparing WAL locations related to history files.
851 : */
852 : static XLogRecPtr
853 30 : MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
854 : {
855 30 : if (!XLogRecPtrIsValid(a))
856 2 : return b;
857 28 : else if (!XLogRecPtrIsValid(b))
858 28 : return a;
859 : else
860 0 : return Min(a, b);
861 : }
862 :
863 : /*
864 : * Retrieve timeline history for the source or target system.
865 : */
866 : static TimeLineHistoryEntry *
867 60 : getTimelineHistory(TimeLineID tli, bool is_source, int *nentries)
868 : {
869 : TimeLineHistoryEntry *history;
870 :
871 : /*
872 : * Timeline 1 does not have a history file, so there is no need to check
873 : * and fake an entry with infinite start and end positions.
874 : */
875 60 : if (tli == 1)
876 : {
877 28 : history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
878 28 : history->tli = tli;
879 28 : history->begin = history->end = InvalidXLogRecPtr;
880 28 : *nentries = 1;
881 : }
882 : else
883 : {
884 : char path[MAXPGPATH];
885 : char *histfile;
886 :
887 32 : TLHistoryFilePath(path, tli);
888 :
889 : /* Get history file from appropriate source */
890 32 : if (is_source)
891 28 : histfile = source->fetch_file(source, path, NULL);
892 : else
893 4 : histfile = slurpFile(datadir_target, path, NULL);
894 :
895 32 : history = rewind_parseTimeLineHistory(histfile, tli, nentries);
896 32 : pg_free(histfile);
897 : }
898 :
899 : /* In debugging mode, print what we read */
900 60 : if (debug)
901 : {
902 : int i;
903 :
904 60 : if (is_source)
905 30 : pg_log_debug("Source timeline history:");
906 : else
907 30 : pg_log_debug("Target timeline history:");
908 :
909 154 : for (i = 0; i < *nentries; i++)
910 : {
911 : TimeLineHistoryEntry *entry;
912 :
913 94 : entry = &history[i];
914 94 : pg_log_debug("%u: %X/%08X - %X/%08X", entry->tli,
915 : LSN_FORMAT_ARGS(entry->begin),
916 : LSN_FORMAT_ARGS(entry->end));
917 : }
918 : }
919 :
920 60 : return history;
921 : }
922 :
923 : /*
924 : * Determine the TLI of the last common timeline in the timeline history of
925 : * two clusters. *tliIndex is set to the index of last common timeline in
926 : * the arrays, and *recptr is set to the position where the timeline history
927 : * diverged (ie. the first WAL record that's not the same in both clusters).
928 : */
929 : static void
930 30 : findCommonAncestorTimeline(TimeLineHistoryEntry *a_history, int a_nentries,
931 : TimeLineHistoryEntry *b_history, int b_nentries,
932 : XLogRecPtr *recptr, int *tliIndex)
933 : {
934 : int i,
935 : n;
936 :
937 : /*
938 : * Trace the history forward, until we hit the timeline diverge. It may
939 : * still be possible that the source and target nodes used the same
940 : * timeline number in their history but with different start position
941 : * depending on the history files that each node has fetched in previous
942 : * recovery processes. Hence check the start position of the new timeline
943 : * as well and move down by one extra timeline entry if they do not match.
944 : */
945 30 : n = Min(a_nentries, b_nentries);
946 62 : for (i = 0; i < n; i++)
947 : {
948 32 : if (a_history[i].tli != b_history[i].tli ||
949 32 : a_history[i].begin != b_history[i].begin)
950 : break;
951 : }
952 :
953 30 : if (i > 0)
954 : {
955 30 : i--;
956 30 : *recptr = MinXLogRecPtr(a_history[i].end, b_history[i].end);
957 30 : *tliIndex = i;
958 30 : return;
959 : }
960 : else
961 : {
962 0 : pg_fatal("could not find common ancestor of the source and target cluster's timelines");
963 : }
964 : }
965 :
966 :
967 : /*
968 : * Create a backup_label file that forces recovery to begin at the last common
969 : * checkpoint.
970 : */
971 : static void
972 28 : createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli, XLogRecPtr checkpointloc)
973 : {
974 : XLogSegNo startsegno;
975 : time_t stamp_time;
976 : char strfbuf[128];
977 : char xlogfilename[MAXFNAMELEN];
978 : struct tm *tmp;
979 : char buf[1000];
980 : int len;
981 :
982 28 : XLByteToSeg(startpoint, startsegno, WalSegSz);
983 28 : XLogFileName(xlogfilename, starttli, startsegno, WalSegSz);
984 :
985 : /*
986 : * Construct backup label file
987 : */
988 28 : stamp_time = time(NULL);
989 28 : tmp = localtime(&stamp_time);
990 28 : strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", tmp);
991 :
992 28 : len = snprintf(buf, sizeof(buf),
993 : "START WAL LOCATION: %X/%08X (file %s)\n"
994 : "CHECKPOINT LOCATION: %X/%08X\n"
995 : "BACKUP METHOD: pg_rewind\n"
996 : "BACKUP FROM: standby\n"
997 : "START TIME: %s\n",
998 : /* omit LABEL: line */
999 28 : LSN_FORMAT_ARGS(startpoint), xlogfilename,
1000 28 : LSN_FORMAT_ARGS(checkpointloc),
1001 : strfbuf);
1002 28 : if (len >= sizeof(buf))
1003 0 : pg_fatal("backup label buffer too small"); /* shouldn't happen */
1004 :
1005 : /* TODO: move old file out of the way, if any. */
1006 28 : open_target_file("backup_label", true); /* BACKUP_LABEL_FILE */
1007 28 : write_target_range(buf, 0, len);
1008 28 : close_target_file();
1009 28 : }
1010 :
1011 : /*
1012 : * Check CRC of control file
1013 : */
1014 : static void
1015 120 : checkControlFile(ControlFileData *ControlFile)
1016 : {
1017 : pg_crc32c crc;
1018 :
1019 : /* Calculate CRC */
1020 120 : INIT_CRC32C(crc);
1021 120 : COMP_CRC32C(crc, ControlFile, offsetof(ControlFileData, crc));
1022 120 : FIN_CRC32C(crc);
1023 :
1024 : /* And simply compare it */
1025 120 : if (!EQ_CRC32C(crc, ControlFile->crc))
1026 0 : pg_fatal("unexpected control file CRC");
1027 120 : }
1028 :
1029 : /*
1030 : * Verify control file contents in the buffer 'content', and copy it to
1031 : * *ControlFile.
1032 : */
1033 : static void
1034 120 : digestControlFile(ControlFileData *ControlFile, const char *content,
1035 : size_t size)
1036 : {
1037 120 : if (size != PG_CONTROL_FILE_SIZE)
1038 0 : pg_fatal("unexpected control file size %d, expected %d",
1039 : (int) size, PG_CONTROL_FILE_SIZE);
1040 :
1041 120 : memcpy(ControlFile, content, sizeof(ControlFileData));
1042 :
1043 : /* set and validate WalSegSz */
1044 120 : WalSegSz = ControlFile->xlog_seg_size;
1045 :
1046 120 : if (!IsValidWalSegSize(WalSegSz))
1047 : {
1048 0 : pg_log_error(ngettext("invalid WAL segment size in control file (%d byte)",
1049 : "invalid WAL segment size in control file (%d bytes)",
1050 : WalSegSz),
1051 : WalSegSz);
1052 0 : pg_log_error_detail("The WAL segment size must be a power of two between 1 MB and 1 GB.");
1053 0 : exit(1);
1054 : }
1055 :
1056 : /* Additional checks on control file */
1057 120 : checkControlFile(ControlFile);
1058 120 : }
1059 :
1060 : /*
1061 : * Get value of GUC parameter restore_command from the target cluster.
1062 : *
1063 : * This uses a logic based on "postgres -C" to get the value from the
1064 : * cluster.
1065 : */
1066 : static void
1067 38 : getRestoreCommand(const char *argv0)
1068 : {
1069 : int rc;
1070 : char postgres_exec_path[MAXPGPATH];
1071 : PQExpBuffer postgres_cmd;
1072 :
1073 38 : if (!restore_wal)
1074 36 : return;
1075 :
1076 : /* find postgres executable */
1077 2 : rc = find_other_exec(argv0, "postgres",
1078 : PG_BACKEND_VERSIONSTR,
1079 : postgres_exec_path);
1080 :
1081 2 : if (rc < 0)
1082 : {
1083 : char full_path[MAXPGPATH];
1084 :
1085 0 : if (find_my_exec(argv0, full_path) < 0)
1086 0 : strlcpy(full_path, progname, sizeof(full_path));
1087 :
1088 0 : if (rc == -1)
1089 0 : pg_fatal("program \"%s\" is needed by %s but was not found in the same directory as \"%s\"",
1090 : "postgres", progname, full_path);
1091 : else
1092 0 : pg_fatal("program \"%s\" was found by \"%s\" but was not the same version as %s",
1093 : "postgres", full_path, progname);
1094 : }
1095 :
1096 : /*
1097 : * Build a command able to retrieve the value of GUC parameter
1098 : * restore_command, if set.
1099 : */
1100 2 : postgres_cmd = createPQExpBuffer();
1101 :
1102 : /* path to postgres, properly quoted */
1103 2 : appendShellString(postgres_cmd, postgres_exec_path);
1104 :
1105 : /* add -D switch, with properly quoted data directory */
1106 2 : appendPQExpBufferStr(postgres_cmd, " -D ");
1107 2 : appendShellString(postgres_cmd, datadir_target);
1108 :
1109 : /* add custom configuration file only if requested */
1110 2 : if (config_file != NULL)
1111 : {
1112 2 : appendPQExpBufferStr(postgres_cmd, " -c config_file=");
1113 2 : appendShellString(postgres_cmd, config_file);
1114 : }
1115 :
1116 : /* add -C switch, for restore_command */
1117 2 : appendPQExpBufferStr(postgres_cmd, " -C restore_command");
1118 :
1119 2 : restore_command = pipe_read_line(postgres_cmd->data);
1120 2 : if (restore_command == NULL)
1121 0 : pg_fatal("could not read \"restore_command\" from target cluster");
1122 :
1123 2 : (void) pg_strip_crlf(restore_command);
1124 :
1125 2 : if (strcmp(restore_command, "") == 0)
1126 0 : pg_fatal("\"restore_command\" is not set in the target cluster");
1127 :
1128 2 : pg_log_debug("using for rewind \"restore_command = \'%s\'\"",
1129 : restore_command);
1130 :
1131 2 : destroyPQExpBuffer(postgres_cmd);
1132 : }
1133 :
1134 :
1135 : /*
1136 : * Ensure clean shutdown of target instance by launching single-user mode
1137 : * postgres to do crash recovery.
1138 : */
1139 : static void
1140 20 : ensureCleanShutdown(const char *argv0)
1141 : {
1142 : int ret;
1143 : char exec_path[MAXPGPATH];
1144 : PQExpBuffer postgres_cmd;
1145 :
1146 : /* locate postgres binary */
1147 20 : if ((ret = find_other_exec(argv0, "postgres",
1148 : PG_BACKEND_VERSIONSTR,
1149 : exec_path)) < 0)
1150 : {
1151 : char full_path[MAXPGPATH];
1152 :
1153 0 : if (find_my_exec(argv0, full_path) < 0)
1154 0 : strlcpy(full_path, progname, sizeof(full_path));
1155 :
1156 0 : if (ret == -1)
1157 0 : pg_fatal("program \"%s\" is needed by %s but was not found in the same directory as \"%s\"",
1158 : "postgres", progname, full_path);
1159 : else
1160 0 : pg_fatal("program \"%s\" was found by \"%s\" but was not the same version as %s",
1161 : "postgres", full_path, progname);
1162 : }
1163 :
1164 20 : pg_log_info("executing \"%s\" for target server to complete crash recovery",
1165 : exec_path);
1166 :
1167 : /*
1168 : * Skip processing if requested, but only after ensuring presence of
1169 : * postgres.
1170 : */
1171 20 : if (dry_run)
1172 0 : return;
1173 :
1174 : /*
1175 : * Finally run postgres in single-user mode. There is no need to use
1176 : * fsync here. This makes the recovery faster, and the target data folder
1177 : * is synced at the end anyway.
1178 : */
1179 20 : postgres_cmd = createPQExpBuffer();
1180 :
1181 : /* path to postgres, properly quoted */
1182 20 : appendShellString(postgres_cmd, exec_path);
1183 :
1184 : /* add set of options with properly quoted data directory */
1185 20 : appendPQExpBufferStr(postgres_cmd, " --single -F -D ");
1186 20 : appendShellString(postgres_cmd, datadir_target);
1187 :
1188 : /* add custom configuration file only if requested */
1189 20 : if (config_file != NULL)
1190 : {
1191 18 : appendPQExpBufferStr(postgres_cmd, " -c config_file=");
1192 18 : appendShellString(postgres_cmd, config_file);
1193 : }
1194 :
1195 : /* finish with the database name, and a properly quoted redirection */
1196 20 : appendPQExpBufferStr(postgres_cmd, " template1 < ");
1197 20 : appendShellString(postgres_cmd, DEVNULL);
1198 :
1199 20 : fflush(NULL);
1200 20 : if (system(postgres_cmd->data) != 0)
1201 : {
1202 2 : pg_log_error("postgres single-user mode in target cluster failed");
1203 2 : pg_log_error_detail("Command was: %s", postgres_cmd->data);
1204 2 : exit(1);
1205 : }
1206 :
1207 18 : destroyPQExpBuffer(postgres_cmd);
1208 : }
1209 :
1210 : static void
1211 38 : disconnect_atexit(void)
1212 : {
1213 38 : if (conn != NULL)
1214 0 : PQfinish(conn);
1215 38 : }
|