Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_checksums.c
4 : * Checks, enables or disables page level checksums for an offline
5 : * cluster
6 : *
7 : * Copyright (c) 2010-2025, PostgreSQL Global Development Group
8 : *
9 : * IDENTIFICATION
10 : * src/bin/pg_checksums/pg_checksums.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres_fe.h"
16 :
17 : #include <dirent.h>
18 : #include <limits.h>
19 : #include <sys/stat.h>
20 : #include <time.h>
21 : #include <unistd.h>
22 :
23 : #include "common/controldata_utils.h"
24 : #include "common/file_utils.h"
25 : #include "common/logging.h"
26 : #include "common/relpath.h"
27 : #include "fe_utils/option_utils.h"
28 : #include "getopt_long.h"
29 : #include "pg_getopt.h"
30 : #include "storage/bufpage.h"
31 : #include "storage/checksum.h"
32 : #include "storage/checksum_impl.h"
33 :
34 :
35 : static int64 files_scanned = 0;
36 : static int64 files_written = 0;
37 : static int64 blocks_scanned = 0;
38 : static int64 blocks_written = 0;
39 : static int64 badblocks = 0;
40 : static ControlFileData *ControlFile;
41 :
42 : static char *only_filenode = NULL;
43 : static bool do_sync = true;
44 : static bool verbose = false;
45 : static bool showprogress = false;
46 : static DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
47 :
48 : typedef enum
49 : {
50 : PG_MODE_CHECK,
51 : PG_MODE_DISABLE,
52 : PG_MODE_ENABLE,
53 : } PgChecksumMode;
54 :
55 : static PgChecksumMode mode = PG_MODE_CHECK;
56 :
57 : static const char *progname;
58 :
59 : /*
60 : * Progress status information.
61 : */
62 : static int64 total_size = 0;
63 : static int64 current_size = 0;
64 : static pg_time_t last_progress_report = 0;
65 :
66 : static void
67 2 : usage(void)
68 : {
69 2 : printf(_("%s enables, disables, or verifies data checksums in a PostgreSQL database cluster.\n\n"), progname);
70 2 : printf(_("Usage:\n"));
71 2 : printf(_(" %s [OPTION]... [DATADIR]\n"), progname);
72 2 : printf(_("\nOptions:\n"));
73 2 : printf(_(" [-D, --pgdata=]DATADIR data directory\n"));
74 2 : printf(_(" -c, --check check data checksums (default)\n"));
75 2 : printf(_(" -d, --disable disable data checksums\n"));
76 2 : printf(_(" -e, --enable enable data checksums\n"));
77 2 : printf(_(" -f, --filenode=FILENODE check only relation with specified filenode\n"));
78 2 : printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n"));
79 2 : printf(_(" -P, --progress show progress information\n"));
80 2 : printf(_(" --sync-method=METHOD set method for syncing files to disk\n"));
81 2 : printf(_(" -v, --verbose output verbose messages\n"));
82 2 : printf(_(" -V, --version output version information, then exit\n"));
83 2 : printf(_(" -?, --help show this help, then exit\n"));
84 2 : printf(_("\nIf no data directory (DATADIR) is specified, "
85 : "the environment variable PGDATA\nis used.\n\n"));
86 2 : printf(_("Report bugs to <%s>.\n"), PACKAGE_BUGREPORT);
87 2 : printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL);
88 2 : }
89 :
/*
 * One entry of the exclusion list consulted by skipfile().
 *
 * "name" is the file name to compare against.  When "match_prefix" is
 * true, every file whose name starts with "name" is excluded; otherwise
 * only a file with exactly that name is.
 */
struct exclude_list_item
{
	const char *name;
	bool		match_prefix;
};

/*
 * Files that are never checksummed.  The array is terminated by an entry
 * whose name is NULL.
 *
 * Note: this list should be kept in sync with what basebackup.c includes.
 */
static const struct exclude_list_item skip[] = {
	{.name = "pg_control", .match_prefix = false},
	{.name = "pg_filenode.map", .match_prefix = false},
	{.name = "pg_internal.init", .match_prefix = true},
	{.name = "PG_VERSION", .match_prefix = false},
#ifdef EXEC_BACKEND
	{.name = "config_exec_params", .match_prefix = true},
#endif
	{.name = NULL, .match_prefix = false}
};
117 :
118 : /*
119 : * Report current progress status. Parts borrowed from
120 : * src/bin/pg_basebackup/pg_basebackup.c.
121 : */
122 : static void
123 0 : progress_report(bool finished)
124 : {
125 : int percent;
126 : pg_time_t now;
127 :
128 : Assert(showprogress);
129 :
130 0 : now = time(NULL);
131 0 : if (now == last_progress_report && !finished)
132 0 : return; /* Max once per second */
133 :
134 : /* Save current time */
135 0 : last_progress_report = now;
136 :
137 : /* Adjust total size if current_size is larger */
138 0 : if (current_size > total_size)
139 0 : total_size = current_size;
140 :
141 : /* Calculate current percentage of size done */
142 0 : percent = total_size ? (int) ((current_size) * 100 / total_size) : 0;
143 :
144 0 : fprintf(stderr, _("%lld/%lld MB (%d%%) computed"),
145 0 : (long long) (current_size / (1024 * 1024)),
146 0 : (long long) (total_size / (1024 * 1024)),
147 : percent);
148 :
149 : /*
150 : * Stay on the same line if reporting to a terminal and we're not done
151 : * yet.
152 : */
153 0 : fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr);
154 : }
155 :
156 : static bool
157 25766 : skipfile(const char *fn)
158 : {
159 : int excludeIdx;
160 :
161 128152 : for (excludeIdx = 0; skip[excludeIdx].name != NULL; excludeIdx++)
162 : {
163 102678 : int cmplen = strlen(skip[excludeIdx].name);
164 :
165 102678 : if (!skip[excludeIdx].match_prefix)
166 77052 : cmplen++;
167 102678 : if (strncmp(skip[excludeIdx].name, fn, cmplen) == 0)
168 292 : return true;
169 : }
170 :
171 25474 : return false;
172 : }
173 :
174 : static void
175 17794 : scan_file(const char *fn, int segmentno)
176 : {
177 : PGIOAlignedBlock buf;
178 17794 : PageHeader header = (PageHeader) buf.data;
179 : int f;
180 : BlockNumber blockno;
181 : int flags;
182 17794 : int64 blocks_written_in_file = 0;
183 :
184 : Assert(mode == PG_MODE_ENABLE ||
185 : mode == PG_MODE_CHECK);
186 :
187 17794 : flags = (mode == PG_MODE_ENABLE) ? O_RDWR : O_RDONLY;
188 17794 : f = open(fn, PG_BINARY | flags, 0);
189 :
190 17794 : if (f < 0)
191 0 : pg_fatal("could not open file \"%s\": %m", fn);
192 :
193 17794 : files_scanned++;
194 :
195 17794 : for (blockno = 0;; blockno++)
196 52690 : {
197 : uint16 csum;
198 70484 : int r = read(f, buf.data, BLCKSZ);
199 :
200 70484 : if (r == 0)
201 17778 : break;
202 52706 : if (r != BLCKSZ)
203 : {
204 16 : if (r < 0)
205 0 : pg_fatal("could not read block %u in file \"%s\": %m",
206 : blockno, fn);
207 : else
208 16 : pg_fatal("could not read block %u in file \"%s\": read %d of %d",
209 : blockno, fn, r, BLCKSZ);
210 : }
211 52690 : blocks_scanned++;
212 :
213 : /*
214 : * Since the file size is counted as total_size for progress status
215 : * information, the sizes of all pages including new ones in the file
216 : * should be counted as current_size. Otherwise the progress reporting
217 : * calculated using those counters may not reach 100%.
218 : */
219 52690 : current_size += r;
220 :
221 : /* New pages have no checksum yet */
222 52690 : if (PageIsNew(buf.data))
223 228 : continue;
224 :
225 52462 : csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE);
226 52462 : if (mode == PG_MODE_CHECK)
227 : {
228 41030 : if (csum != header->pd_checksum)
229 : {
230 8 : if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION)
231 8 : pg_log_error("checksum verification failed in file \"%s\", block %u: calculated checksum %X but block contains %X",
232 : fn, blockno, csum, header->pd_checksum);
233 8 : badblocks++;
234 : }
235 : }
236 11432 : else if (mode == PG_MODE_ENABLE)
237 : {
238 : int w;
239 :
240 : /*
241 : * Do not rewrite if the checksum is already set to the expected
242 : * value.
243 : */
244 11432 : if (header->pd_checksum == csum)
245 5716 : continue;
246 :
247 5716 : blocks_written_in_file++;
248 :
249 : /* Set checksum in page header */
250 5716 : header->pd_checksum = csum;
251 :
252 : /* Seek back to beginning of block */
253 5716 : if (lseek(f, -BLCKSZ, SEEK_CUR) < 0)
254 0 : pg_fatal("seek failed for block %u in file \"%s\": %m", blockno, fn);
255 :
256 : /* Write block with checksum */
257 5716 : w = write(f, buf.data, BLCKSZ);
258 5716 : if (w != BLCKSZ)
259 : {
260 0 : if (w < 0)
261 0 : pg_fatal("could not write block %u in file \"%s\": %m",
262 : blockno, fn);
263 : else
264 0 : pg_fatal("could not write block %u in file \"%s\": wrote %d of %d",
265 : blockno, fn, w, BLCKSZ);
266 : }
267 : }
268 :
269 46746 : if (showprogress)
270 0 : progress_report(false);
271 : }
272 :
273 17778 : if (verbose)
274 : {
275 0 : if (mode == PG_MODE_CHECK)
276 0 : pg_log_info("checksums verified in file \"%s\"", fn);
277 0 : if (mode == PG_MODE_ENABLE)
278 0 : pg_log_info("checksums enabled in file \"%s\"", fn);
279 : }
280 :
281 : /* Update write counters if any write activity has happened */
282 17778 : if (blocks_written_in_file > 0)
283 : {
284 1564 : files_written++;
285 1564 : blocks_written += blocks_written_in_file;
286 : }
287 :
288 17778 : close(f);
289 17778 : }
290 :
291 : /*
292 : * Scan the given directory for items which can be checksummed and
293 : * operate on each one of them. If "sizeonly" is true, the size of
294 : * all the items which have checksums is computed and returned back
295 : * to the caller without operating on the files. This is used to compile
296 : * the total size of the data directory for progress reports.
297 : */
298 : static int64
299 192 : scan_directory(const char *basedir, const char *subdir, bool sizeonly)
300 : {
301 192 : int64 dirsize = 0;
302 : char path[MAXPGPATH];
303 : DIR *dir;
304 : struct dirent *de;
305 :
306 192 : snprintf(path, sizeof(path), "%s/%s", basedir, subdir);
307 192 : dir = opendir(path);
308 192 : if (!dir)
309 0 : pg_fatal("could not open directory \"%s\": %m", path);
310 26504 : while ((de = readdir(dir)) != NULL)
311 : {
312 : char fn[MAXPGPATH];
313 : struct stat st;
314 :
315 26328 : if (strcmp(de->d_name, ".") == 0 ||
316 26146 : strcmp(de->d_name, "..") == 0)
317 8436 : continue;
318 :
319 : /* Skip temporary files */
320 25964 : if (strncmp(de->d_name,
321 : PG_TEMP_FILE_PREFIX,
322 : strlen(PG_TEMP_FILE_PREFIX)) == 0)
323 62 : continue;
324 :
325 : /* Skip temporary folders */
326 25902 : if (strncmp(de->d_name,
327 : PG_TEMP_FILES_DIR,
328 : strlen(PG_TEMP_FILES_DIR)) == 0)
329 0 : continue;
330 :
331 : /* Skip macOS system files */
332 25902 : if (strcmp(de->d_name, ".DS_Store") == 0)
333 38 : continue;
334 :
335 25864 : snprintf(fn, sizeof(fn), "%s/%s", path, de->d_name);
336 25864 : if (lstat(fn, &st) < 0)
337 0 : pg_fatal("could not stat file \"%s\": %m", fn);
338 25864 : if (S_ISREG(st.st_mode))
339 : {
340 : char fnonly[MAXPGPATH];
341 : char *forkpath,
342 : *segmentpath;
343 25766 : int segmentno = 0;
344 :
345 25766 : if (skipfile(de->d_name))
346 7972 : continue;
347 :
348 : /*
349 : * Cut off at the segment boundary (".") to get the segment number
350 : * in order to mix it into the checksum. Then also cut off at the
351 : * fork boundary, to get the filenode the file belongs to for
352 : * filtering.
353 : */
354 25474 : strlcpy(fnonly, de->d_name, sizeof(fnonly));
355 25474 : segmentpath = strchr(fnonly, '.');
356 25474 : if (segmentpath != NULL)
357 : {
358 142 : *segmentpath++ = '\0';
359 142 : segmentno = atoi(segmentpath);
360 142 : if (segmentno == 0)
361 0 : pg_fatal("invalid segment number %d in file name \"%s\"",
362 : segmentno, fn);
363 : }
364 :
365 25474 : forkpath = strchr(fnonly, '_');
366 25474 : if (forkpath != NULL)
367 6302 : *forkpath++ = '\0';
368 :
369 25474 : if (only_filenode && strcmp(only_filenode, fnonly) != 0)
370 : /* filenode not to be included */
371 7680 : continue;
372 :
373 17794 : dirsize += st.st_size;
374 :
375 : /*
376 : * No need to work on the file when calculating only the size of
377 : * the items in the data folder.
378 : */
379 17794 : if (!sizeonly)
380 17794 : scan_file(fn, segmentno);
381 : }
382 98 : else if (S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode))
383 : {
384 : /*
385 : * If going through the entries of pg_tblspc, we assume to operate
386 : * on tablespace locations where only TABLESPACE_VERSION_DIRECTORY
387 : * is valid, resolving the linked locations and dive into them
388 : * directly.
389 : */
390 98 : if (strncmp(PG_TBLSPC_DIR, subdir, strlen(PG_TBLSPC_DIR)) == 0)
391 : {
392 : char tblspc_path[MAXPGPATH];
393 : struct stat tblspc_st;
394 :
395 : /*
396 : * Resolve tablespace location path and check whether
397 : * TABLESPACE_VERSION_DIRECTORY exists. Not finding a valid
398 : * location is unexpected, since there should be no orphaned
399 : * links and no links pointing to something else than a
400 : * directory.
401 : */
402 10 : snprintf(tblspc_path, sizeof(tblspc_path), "%s/%s/%s",
403 10 : path, de->d_name, TABLESPACE_VERSION_DIRECTORY);
404 :
405 10 : if (lstat(tblspc_path, &tblspc_st) < 0)
406 0 : pg_fatal("could not stat file \"%s\": %m",
407 : tblspc_path);
408 :
409 : /*
410 : * Move backwards once as the scan needs to happen for the
411 : * contents of TABLESPACE_VERSION_DIRECTORY.
412 : */
413 10 : snprintf(tblspc_path, sizeof(tblspc_path), "%s/%s",
414 10 : path, de->d_name);
415 :
416 : /* Looks like a valid tablespace location */
417 10 : dirsize += scan_directory(tblspc_path,
418 : TABLESPACE_VERSION_DIRECTORY,
419 : sizeonly);
420 : }
421 : else
422 : {
423 88 : dirsize += scan_directory(path, de->d_name, sizeonly);
424 : }
425 : }
426 : }
427 176 : closedir(dir);
428 176 : return dirsize;
429 : }
430 :
431 : int
432 62 : main(int argc, char *argv[])
433 : {
434 : static struct option long_options[] = {
435 : {"check", no_argument, NULL, 'c'},
436 : {"pgdata", required_argument, NULL, 'D'},
437 : {"disable", no_argument, NULL, 'd'},
438 : {"enable", no_argument, NULL, 'e'},
439 : {"filenode", required_argument, NULL, 'f'},
440 : {"no-sync", no_argument, NULL, 'N'},
441 : {"progress", no_argument, NULL, 'P'},
442 : {"verbose", no_argument, NULL, 'v'},
443 : {"sync-method", required_argument, NULL, 1},
444 : {NULL, 0, NULL, 0}
445 : };
446 :
447 62 : char *DataDir = NULL;
448 : int c;
449 : int option_index;
450 : bool crc_ok;
451 :
452 62 : pg_logging_init(argv[0]);
453 62 : set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_checksums"));
454 62 : progname = get_progname(argv[0]);
455 :
456 62 : if (argc > 1)
457 : {
458 62 : if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
459 : {
460 2 : usage();
461 2 : exit(0);
462 : }
463 60 : if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
464 : {
465 2 : puts("pg_checksums (PostgreSQL) " PG_VERSION);
466 2 : exit(0);
467 : }
468 : }
469 :
470 186 : while ((c = getopt_long(argc, argv, "cdD:ef:NPv", long_options, &option_index)) != -1)
471 : {
472 130 : switch (c)
473 : {
474 38 : case 'c':
475 38 : mode = PG_MODE_CHECK;
476 38 : break;
477 6 : case 'd':
478 6 : mode = PG_MODE_DISABLE;
479 6 : break;
480 56 : case 'D':
481 56 : DataDir = optarg;
482 56 : break;
483 8 : case 'e':
484 8 : mode = PG_MODE_ENABLE;
485 8 : break;
486 12 : case 'f':
487 12 : if (!option_parse_int(optarg, "-f/--filenode", 0,
488 : INT_MAX,
489 : NULL))
490 0 : exit(1);
491 12 : only_filenode = pstrdup(optarg);
492 12 : break;
493 8 : case 'N':
494 8 : do_sync = false;
495 8 : break;
496 0 : case 'P':
497 0 : showprogress = true;
498 0 : break;
499 0 : case 'v':
500 0 : verbose = true;
501 0 : break;
502 0 : case 1:
503 0 : if (!parse_sync_method(optarg, &sync_method))
504 0 : exit(1);
505 0 : break;
506 2 : default:
507 : /* getopt_long already emitted a complaint */
508 2 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
509 2 : exit(1);
510 : }
511 : }
512 :
513 56 : if (DataDir == NULL)
514 : {
515 0 : if (optind < argc)
516 0 : DataDir = argv[optind++];
517 : else
518 0 : DataDir = getenv("PGDATA");
519 :
520 : /* If no DataDir was specified, and none could be found, error out */
521 0 : if (DataDir == NULL)
522 : {
523 0 : pg_log_error("no data directory specified");
524 0 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
525 0 : exit(1);
526 : }
527 : }
528 :
529 : /* Complain if any arguments remain */
530 56 : if (optind < argc)
531 : {
532 0 : pg_log_error("too many command-line arguments (first is \"%s\")",
533 : argv[optind]);
534 0 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
535 0 : exit(1);
536 : }
537 :
538 : /* filenode checking only works in --check mode */
539 56 : if (mode != PG_MODE_CHECK && only_filenode)
540 : {
541 4 : pg_log_error("option -f/--filenode can only be used with --check");
542 4 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
543 4 : exit(1);
544 : }
545 :
546 : /* Read the control file and check compatibility */
547 52 : ControlFile = get_controlfile(DataDir, &crc_ok);
548 52 : if (!crc_ok)
549 0 : pg_fatal("pg_control CRC value is incorrect");
550 :
551 52 : if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
552 0 : pg_fatal("cluster is not compatible with this version of pg_checksums");
553 :
554 52 : if (ControlFile->blcksz != BLCKSZ)
555 : {
556 0 : pg_log_error("database cluster is not compatible");
557 0 : pg_log_error_detail("The database cluster was initialized with block size %u, but pg_checksums was compiled with block size %u.",
558 : ControlFile->blcksz, BLCKSZ);
559 0 : exit(1);
560 : }
561 :
562 : /*
563 : * Check if cluster is running. A clean shutdown is required to avoid
564 : * random checksum failures caused by torn pages. Note that this doesn't
565 : * guard against someone starting the cluster concurrently.
566 : */
567 52 : if (ControlFile->state != DB_SHUTDOWNED &&
568 2 : ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
569 2 : pg_fatal("cluster must be shut down");
570 :
571 50 : if (ControlFile->data_checksum_version == 0 &&
572 8 : mode == PG_MODE_CHECK)
573 2 : pg_fatal("data checksums are not enabled in cluster");
574 :
575 48 : if (ControlFile->data_checksum_version == 0 &&
576 6 : mode == PG_MODE_DISABLE)
577 2 : pg_fatal("data checksums are already disabled in cluster");
578 :
579 46 : if (ControlFile->data_checksum_version > 0 &&
580 42 : mode == PG_MODE_ENABLE)
581 2 : pg_fatal("data checksums are already enabled in cluster");
582 :
583 : /* Operate on all files if checking or enabling checksums */
584 44 : if (mode == PG_MODE_CHECK || mode == PG_MODE_ENABLE)
585 : {
586 : /*
587 : * If progress status information is requested, we need to scan the
588 : * directory tree twice: once to know how much total data needs to be
589 : * processed and once to do the real work.
590 : */
591 42 : if (showprogress)
592 : {
593 0 : total_size = scan_directory(DataDir, "global", true);
594 0 : total_size += scan_directory(DataDir, "base", true);
595 0 : total_size += scan_directory(DataDir, PG_TBLSPC_DIR, true);
596 : }
597 :
598 42 : (void) scan_directory(DataDir, "global", false);
599 26 : (void) scan_directory(DataDir, "base", false);
600 26 : (void) scan_directory(DataDir, PG_TBLSPC_DIR, false);
601 :
602 26 : if (showprogress)
603 0 : progress_report(true);
604 :
605 26 : printf(_("Checksum operation completed\n"));
606 26 : printf(_("Files scanned: %lld\n"), (long long) files_scanned);
607 26 : printf(_("Blocks scanned: %lld\n"), (long long) blocks_scanned);
608 26 : if (mode == PG_MODE_CHECK)
609 : {
610 22 : printf(_("Bad checksums: %lld\n"), (long long) badblocks);
611 22 : printf(_("Data checksum version: %u\n"), ControlFile->data_checksum_version);
612 :
613 22 : if (badblocks > 0)
614 8 : exit(1);
615 : }
616 4 : else if (mode == PG_MODE_ENABLE)
617 : {
618 4 : printf(_("Files written: %lld\n"), (long long) files_written);
619 4 : printf(_("Blocks written: %lld\n"), (long long) blocks_written);
620 : }
621 : }
622 :
623 : /*
624 : * Finally make the data durable on disk if enabling or disabling
625 : * checksums. Flush first the data directory for safety, and then update
626 : * the control file to keep the switch consistent.
627 : */
628 20 : if (mode == PG_MODE_ENABLE || mode == PG_MODE_DISABLE)
629 : {
630 6 : ControlFile->data_checksum_version =
631 6 : (mode == PG_MODE_ENABLE) ? PG_DATA_CHECKSUM_VERSION : 0;
632 :
633 6 : if (do_sync)
634 : {
635 2 : pg_log_info("syncing data directory");
636 2 : sync_pgdata(DataDir, PG_VERSION_NUM, sync_method);
637 : }
638 :
639 6 : pg_log_info("updating control file");
640 6 : update_controlfile(DataDir, ControlFile, do_sync);
641 :
642 6 : if (verbose)
643 0 : printf(_("Data checksum version: %u\n"), ControlFile->data_checksum_version);
644 6 : if (mode == PG_MODE_ENABLE)
645 4 : printf(_("Checksums enabled in cluster\n"));
646 : else
647 2 : printf(_("Checksums disabled in cluster\n"));
648 : }
649 :
650 20 : return 0;
651 : }
|