Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_test_fsync --- tests all supported fsync() methods
4 : *
5 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
6 : *
7 : * src/bin/pg_test_fsync/pg_test_fsync.c
8 : *
9 : *-------------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres_fe.h"
13 :
14 : #include <limits.h>
15 : #include <sys/stat.h>
16 : #include <sys/time.h>
17 : #include <fcntl.h>
18 : #include <time.h>
19 : #include <unistd.h>
20 : #include <signal.h>
21 :
22 : #include "common/logging.h"
23 : #include "common/pg_prng.h"
24 : #include "getopt_long.h"
25 :
26 : /*
27 : * put the temp files in the local directory
28 : * unless the user specifies otherwise
29 : */
30 : #define FSYNC_FILENAME "./pg_test_fsync.out"
31 :
32 : #define XLOG_BLCKSZ_K (XLOG_BLCKSZ / 1024)
33 :
34 : #define LABEL_FORMAT " %-30s"
35 : #define NA_FORMAT "%21s\n"
36 : /* translator: maintain alignment with NA_FORMAT */
37 : #define OPS_FORMAT gettext_noop("%13.3f ops/sec %6.0f usecs/op\n")
38 : #define USECS_SEC 1000000
39 :
/*
 * Timing macros.  These are macros rather than functions to avoid timing
 * the function call overhead.
 *
 * START_TIMER clears alarm_triggered, arranges for it to be set again after
 * secs_per_test seconds, and stores the start time in start_t.
 *
 * STOP_TIMER stores the stop time in stop_t and prints the result via
 * print_elapse().  It expects a local variable named "ops" to be in scope,
 * holding the number of operations completed.
 */
#ifndef WIN32
#define START_TIMER \
do { \
	alarm_triggered = false; \
	alarm(secs_per_test); \
	gettimeofday(&start_t, NULL); \
} while (0)
#else
/*
 * WIN32 doesn't support alarm, so we create a thread and sleep there.
 *
 * CreateThread() reports failure by returning NULL, not
 * INVALID_HANDLE_VALUE.  The thread handle is never used afterwards, so it
 * is closed immediately; closing the handle does not terminate the thread.
 */
#define START_TIMER \
do { \
	HANDLE		alarm_thread; \
	alarm_triggered = false; \
	alarm_thread = CreateThread(NULL, 0, process_alarm, NULL, 0, NULL); \
	if (alarm_thread == NULL) \
		pg_fatal("could not create thread for alarm"); \
	CloseHandle(alarm_thread); \
	gettimeofday(&start_t, NULL); \
} while (0)
#endif

#define STOP_TIMER \
do { \
	gettimeofday(&stop_t, NULL); \
	print_elapse(start_t, stop_t, ops); \
} while (0)
65 :
66 :
67 : static const char *progname;
68 :
69 : static unsigned int secs_per_test = 5;
70 : static int needs_unlink = 0;
71 : alignas(PGAlignedXLogBlock) static char buf[DEFAULT_XLOG_SEG_SIZE];
72 : static char *filename = FSYNC_FILENAME;
73 : static struct timeval start_t,
74 : stop_t;
75 : static sig_atomic_t alarm_triggered = false;
76 :
77 :
78 : static void handle_args(int argc, char *argv[]);
79 : static void prepare_buf(void);
80 : static void test_open(void);
81 : static void test_non_sync(void);
82 : static void test_sync(int writes_per_op);
83 : static void test_open_syncs(void);
84 : static void test_open_sync(const char *msg, int writes_size);
85 : static void test_file_descriptor_sync(void);
86 :
87 : #ifndef WIN32
88 : static void process_alarm(SIGNAL_ARGS);
89 : #else
90 : static DWORD WINAPI process_alarm(LPVOID param);
91 : #endif
92 : static void signal_cleanup(SIGNAL_ARGS);
93 :
94 : #ifdef HAVE_FSYNC_WRITETHROUGH
95 : static int pg_fsync_writethrough(int fd);
96 : #endif
97 : static void print_elapse(struct timeval start_t, struct timeval stop_t, int ops);
98 :
99 : #define die(msg) pg_fatal("%s: %m", _(msg))
100 :
101 :
102 : int
103 10 : main(int argc, char *argv[])
104 : {
105 10 : pg_logging_init(argv[0]);
106 10 : set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_test_fsync"));
107 10 : progname = get_progname(argv[0]);
108 :
109 10 : handle_args(argc, argv);
110 :
111 : /* Prevent leaving behind the test file */
112 0 : pqsignal(SIGINT, signal_cleanup);
113 0 : pqsignal(SIGTERM, signal_cleanup);
114 :
115 : /* the following are not valid on Windows */
116 : #ifndef WIN32
117 0 : pqsignal(SIGALRM, process_alarm);
118 0 : pqsignal(SIGHUP, signal_cleanup);
119 : #endif
120 :
121 0 : pg_prng_seed(&pg_global_prng_state, (uint64) time(NULL));
122 :
123 0 : prepare_buf();
124 :
125 0 : test_open();
126 :
127 : /* Test using 1 XLOG_BLCKSZ write */
128 0 : test_sync(1);
129 :
130 : /* Test using 2 XLOG_BLCKSZ writes */
131 0 : test_sync(2);
132 :
133 0 : test_open_syncs();
134 :
135 0 : test_file_descriptor_sync();
136 :
137 0 : test_non_sync();
138 :
139 0 : unlink(filename);
140 :
141 0 : return 0;
142 : }
143 :
144 : static void
145 10 : handle_args(int argc, char *argv[])
146 : {
147 : static struct option long_options[] = {
148 : {"filename", required_argument, NULL, 'f'},
149 : {"secs-per-test", required_argument, NULL, 's'},
150 : {NULL, 0, NULL, 0}
151 : };
152 :
153 : int option; /* Command line option */
154 10 : int optindex = 0; /* used by getopt_long */
155 : unsigned long optval; /* used for option parsing */
156 : char *endptr;
157 :
158 10 : if (argc > 1)
159 : {
160 10 : if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
161 : {
162 2 : printf(_("Usage: %s [-f FILENAME] [-s SECS-PER-TEST]\n"), progname);
163 2 : exit(0);
164 : }
165 8 : if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
166 : {
167 2 : puts("pg_test_fsync (PostgreSQL) " PG_VERSION);
168 2 : exit(0);
169 : }
170 : }
171 :
172 6 : while ((option = getopt_long(argc, argv, "f:s:",
173 6 : long_options, &optindex)) != -1)
174 : {
175 6 : switch (option)
176 : {
177 0 : case 'f':
178 0 : filename = pg_strdup(optarg);
179 0 : break;
180 :
181 4 : case 's':
182 4 : errno = 0;
183 4 : optval = strtoul(optarg, &endptr, 10);
184 :
185 4 : if (endptr == optarg || *endptr != '\0' ||
186 2 : errno != 0 || optval != (unsigned int) optval)
187 : {
188 2 : pg_log_error("invalid argument for option %s", "--secs-per-test");
189 2 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
190 2 : exit(1);
191 : }
192 :
193 2 : secs_per_test = (unsigned int) optval;
194 2 : if (secs_per_test == 0)
195 2 : pg_fatal("%s must be in range %u..%u",
196 : "--secs-per-test", 1, UINT_MAX);
197 0 : break;
198 :
199 2 : default:
200 : /* getopt_long already emitted a complaint */
201 2 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
202 2 : exit(1);
203 : }
204 : }
205 :
206 0 : if (argc > optind)
207 : {
208 0 : pg_log_error("too many command-line arguments (first is \"%s\")",
209 : argv[optind]);
210 0 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
211 0 : exit(1);
212 : }
213 :
214 0 : printf(ngettext("%u second per test\n",
215 : "%u seconds per test\n",
216 : secs_per_test),
217 : secs_per_test);
218 : #if defined(O_DIRECT)
219 0 : printf(_("O_DIRECT supported on this platform for open_datasync and open_sync.\n"));
220 : #elif defined(F_NOCACHE)
221 : printf(_("F_NOCACHE supported on this platform for open_datasync and open_sync.\n"));
222 : #else
223 : printf(_("Direct I/O is not supported on this platform.\n"));
224 : #endif
225 0 : }
226 :
227 : static void
228 0 : prepare_buf(void)
229 : {
230 : int ops;
231 :
232 : /* write random data into buffer */
233 0 : for (ops = 0; ops < DEFAULT_XLOG_SEG_SIZE; ops++)
234 0 : buf[ops] = (char) pg_prng_int32(&pg_global_prng_state);
235 0 : }
236 :
237 : static void
238 0 : test_open(void)
239 : {
240 : int tmpfile;
241 :
242 : /*
243 : * test if we can open the target file
244 : */
245 0 : if ((tmpfile = open(filename, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
246 0 : die("could not open output file");
247 0 : needs_unlink = 1;
248 0 : if (write(tmpfile, buf, DEFAULT_XLOG_SEG_SIZE) !=
249 : DEFAULT_XLOG_SEG_SIZE)
250 0 : die("write failed");
251 :
252 : /* fsync now so that dirty buffers don't skew later tests */
253 0 : if (fsync(tmpfile) != 0)
254 0 : die("fsync failed");
255 :
256 0 : close(tmpfile);
257 0 : }
258 :
/*
 * Open "path" like open(2), additionally requesting direct I/O where the
 * platform offers a way to do so: O_DIRECT is added to the open flags if it
 * is defined; otherwise, if F_NOCACHE is defined, that fcntl is applied to
 * the freshly opened descriptor.
 *
 * Returns the file descriptor, or -1 with errno set on failure.
 */
static int
open_direct(const char *path, int flags, mode_t mode)
{
	int			result;

#ifdef O_DIRECT
	result = open(path, flags | O_DIRECT, mode);
#else
	result = open(path, flags, mode);
#endif

#if !defined(O_DIRECT) && defined(F_NOCACHE)
	if (result < 0)
		return result;

	if (fcntl(result, F_NOCACHE, 1) < 0)
	{
		/* keep fcntl's errno across the close() */
		int			fcntl_errno = errno;

		close(result);
		errno = fcntl_errno;
		result = -1;
	}
#endif

	return result;
}
283 :
284 : static void
285 0 : test_sync(int writes_per_op)
286 : {
287 : int tmpfile,
288 : ops,
289 : writes;
290 0 : bool fs_warning = false;
291 :
292 0 : if (writes_per_op == 1)
293 0 : printf(_("\nCompare file sync methods using one %dkB write:\n"), XLOG_BLCKSZ_K);
294 : else
295 0 : printf(_("\nCompare file sync methods using two %dkB writes:\n"), XLOG_BLCKSZ_K);
296 0 : printf(_("(in \"wal_sync_method\" preference order, except fdatasync is Linux's default)\n"));
297 :
298 : /*
299 : * Test open_datasync if available
300 : */
301 0 : printf(LABEL_FORMAT, "open_datasync");
302 0 : fflush(stdout);
303 :
304 : #ifdef O_DSYNC
305 0 : if ((tmpfile = open_direct(filename, O_RDWR | O_DSYNC | PG_BINARY, 0)) == -1)
306 : {
307 0 : printf(NA_FORMAT, _("n/a*"));
308 0 : fs_warning = true;
309 : }
310 : else
311 : {
312 0 : START_TIMER;
313 0 : for (ops = 0; alarm_triggered == false; ops++)
314 : {
315 0 : for (writes = 0; writes < writes_per_op; writes++)
316 0 : if (pg_pwrite(tmpfile,
317 : buf,
318 : XLOG_BLCKSZ,
319 0 : writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
320 0 : die("write failed");
321 : }
322 0 : STOP_TIMER;
323 0 : close(tmpfile);
324 : }
325 : #else
326 : printf(NA_FORMAT, _("n/a"));
327 : #endif
328 :
329 : /*
330 : * Test fdatasync if available
331 : */
332 0 : printf(LABEL_FORMAT, "fdatasync");
333 0 : fflush(stdout);
334 :
335 0 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
336 0 : die("could not open output file");
337 0 : START_TIMER;
338 0 : for (ops = 0; alarm_triggered == false; ops++)
339 : {
340 0 : for (writes = 0; writes < writes_per_op; writes++)
341 0 : if (pg_pwrite(tmpfile,
342 : buf,
343 : XLOG_BLCKSZ,
344 0 : writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
345 0 : die("write failed");
346 0 : fdatasync(tmpfile);
347 : }
348 0 : STOP_TIMER;
349 0 : close(tmpfile);
350 :
351 : /*
352 : * Test fsync
353 : */
354 0 : printf(LABEL_FORMAT, "fsync");
355 0 : fflush(stdout);
356 :
357 0 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
358 0 : die("could not open output file");
359 0 : START_TIMER;
360 0 : for (ops = 0; alarm_triggered == false; ops++)
361 : {
362 0 : for (writes = 0; writes < writes_per_op; writes++)
363 0 : if (pg_pwrite(tmpfile,
364 : buf,
365 : XLOG_BLCKSZ,
366 0 : writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
367 0 : die("write failed");
368 0 : if (fsync(tmpfile) != 0)
369 0 : die("fsync failed");
370 : }
371 0 : STOP_TIMER;
372 0 : close(tmpfile);
373 :
374 : /*
375 : * If fsync_writethrough is available, test as well
376 : */
377 0 : printf(LABEL_FORMAT, "fsync_writethrough");
378 0 : fflush(stdout);
379 :
380 : #ifdef HAVE_FSYNC_WRITETHROUGH
381 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
382 : die("could not open output file");
383 : START_TIMER;
384 : for (ops = 0; alarm_triggered == false; ops++)
385 : {
386 : for (writes = 0; writes < writes_per_op; writes++)
387 : if (pg_pwrite(tmpfile,
388 : buf,
389 : XLOG_BLCKSZ,
390 : writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
391 : die("write failed");
392 : if (pg_fsync_writethrough(tmpfile) != 0)
393 : die("fsync failed");
394 : }
395 : STOP_TIMER;
396 : close(tmpfile);
397 : #else
398 0 : printf(NA_FORMAT, _("n/a"));
399 : #endif
400 :
401 : /*
402 : * Test open_sync if available
403 : */
404 0 : printf(LABEL_FORMAT, "open_sync");
405 0 : fflush(stdout);
406 :
407 : #ifdef O_SYNC
408 0 : if ((tmpfile = open_direct(filename, O_RDWR | O_SYNC | PG_BINARY, 0)) == -1)
409 : {
410 0 : printf(NA_FORMAT, _("n/a*"));
411 0 : fs_warning = true;
412 : }
413 : else
414 : {
415 0 : START_TIMER;
416 0 : for (ops = 0; alarm_triggered == false; ops++)
417 : {
418 0 : for (writes = 0; writes < writes_per_op; writes++)
419 0 : if (pg_pwrite(tmpfile,
420 : buf,
421 : XLOG_BLCKSZ,
422 0 : writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
423 :
424 : /*
425 : * This can generate write failures if the filesystem has
426 : * a large block size, e.g. 4k, and there is no support
427 : * for O_DIRECT writes smaller than the file system block
428 : * size, e.g. XFS.
429 : */
430 0 : die("write failed");
431 : }
432 0 : STOP_TIMER;
433 0 : close(tmpfile);
434 : }
435 : #else
436 : printf(NA_FORMAT, _("n/a"));
437 : #endif
438 :
439 0 : if (fs_warning)
440 : {
441 0 : printf(_("* This file system and its mount options do not support direct\n"
442 : " I/O, e.g. ext4 in journaled mode.\n"));
443 : }
444 0 : }
445 :
446 : static void
447 0 : test_open_syncs(void)
448 : {
449 0 : printf(_("\nCompare open_sync with different write sizes:\n"));
450 0 : printf(_("(This is designed to compare the cost of writing 16kB in different write\n"
451 : "open_sync sizes.)\n"));
452 :
453 0 : test_open_sync(_(" 1 * 16kB open_sync write"), 16);
454 0 : test_open_sync(_(" 2 * 8kB open_sync writes"), 8);
455 0 : test_open_sync(_(" 4 * 4kB open_sync writes"), 4);
456 0 : test_open_sync(_(" 8 * 2kB open_sync writes"), 2);
457 0 : test_open_sync(_("16 * 1kB open_sync writes"), 1);
458 0 : }
459 :
460 : /*
461 : * Test open_sync with different size files
462 : */
463 : static void
464 0 : test_open_sync(const char *msg, int writes_size)
465 : {
466 : #ifdef O_SYNC
467 : int tmpfile,
468 : ops,
469 : writes;
470 : #endif
471 :
472 0 : printf(LABEL_FORMAT, msg);
473 0 : fflush(stdout);
474 :
475 : #ifdef O_SYNC
476 0 : if ((tmpfile = open_direct(filename, O_RDWR | O_SYNC | PG_BINARY, 0)) == -1)
477 0 : printf(NA_FORMAT, _("n/a*"));
478 : else
479 : {
480 0 : START_TIMER;
481 0 : for (ops = 0; alarm_triggered == false; ops++)
482 : {
483 0 : for (writes = 0; writes < 16 / writes_size; writes++)
484 0 : if (pg_pwrite(tmpfile,
485 : buf,
486 0 : writes_size * 1024,
487 0 : writes * writes_size * 1024) !=
488 0 : writes_size * 1024)
489 0 : die("write failed");
490 : }
491 0 : STOP_TIMER;
492 0 : close(tmpfile);
493 : }
494 : #else
495 : printf(NA_FORMAT, _("n/a"));
496 : #endif
497 0 : }
498 :
499 : static void
500 0 : test_file_descriptor_sync(void)
501 : {
502 : int tmpfile,
503 : ops;
504 :
505 : /*
506 : * Test whether fsync can sync data written on a different descriptor for
507 : * the same file. This checks the efficiency of multi-process fsyncs
508 : * against the same file. Possibly this should be done with writethrough
509 : * on platforms which support it.
510 : */
511 0 : printf(_("\nTest if fsync on non-write file descriptor is honored:\n"));
512 0 : printf(_("(If the times are similar, fsync() can sync data written on a different\n"
513 : "descriptor.)\n"));
514 :
515 : /*
516 : * first write, fsync and close, which is the normal behavior without
517 : * multiple descriptors
518 : */
519 0 : printf(LABEL_FORMAT, "write, fsync, close");
520 0 : fflush(stdout);
521 :
522 0 : START_TIMER;
523 0 : for (ops = 0; alarm_triggered == false; ops++)
524 : {
525 0 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
526 0 : die("could not open output file");
527 0 : if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
528 0 : die("write failed");
529 0 : if (fsync(tmpfile) != 0)
530 0 : die("fsync failed");
531 0 : close(tmpfile);
532 :
533 : /*
534 : * open and close the file again to be consistent with the following
535 : * test
536 : */
537 0 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
538 0 : die("could not open output file");
539 0 : close(tmpfile);
540 : }
541 0 : STOP_TIMER;
542 :
543 : /*
544 : * Now open, write, close, open again and fsync This simulates processes
545 : * fsyncing each other's writes.
546 : */
547 0 : printf(LABEL_FORMAT, "write, close, fsync");
548 0 : fflush(stdout);
549 :
550 0 : START_TIMER;
551 0 : for (ops = 0; alarm_triggered == false; ops++)
552 : {
553 0 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
554 0 : die("could not open output file");
555 0 : if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
556 0 : die("write failed");
557 0 : close(tmpfile);
558 : /* reopen file */
559 0 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
560 0 : die("could not open output file");
561 0 : if (fsync(tmpfile) != 0)
562 0 : die("fsync failed");
563 0 : close(tmpfile);
564 : }
565 0 : STOP_TIMER;
566 0 : }
567 :
568 : static void
569 0 : test_non_sync(void)
570 : {
571 : int tmpfile,
572 : ops;
573 :
574 : /*
575 : * Test a simple write without fsync
576 : */
577 0 : printf(_("\nNon-sync'ed %dkB writes:\n"), XLOG_BLCKSZ_K);
578 0 : printf(LABEL_FORMAT, "write");
579 0 : fflush(stdout);
580 :
581 0 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
582 0 : die("could not open output file");
583 0 : START_TIMER;
584 0 : for (ops = 0; alarm_triggered == false; ops++)
585 : {
586 0 : if (pg_pwrite(tmpfile, buf, XLOG_BLCKSZ, 0) != XLOG_BLCKSZ)
587 0 : die("write failed");
588 : }
589 0 : STOP_TIMER;
590 0 : close(tmpfile);
591 0 : }
592 :
593 : static void
594 0 : signal_cleanup(SIGNAL_ARGS)
595 : {
596 : int rc;
597 :
598 : /* Delete the file if it exists. Ignore errors */
599 0 : if (needs_unlink)
600 0 : unlink(filename);
601 : /* Finish incomplete line on stdout */
602 0 : rc = write(STDOUT_FILENO, "\n", 1);
603 : (void) rc; /* silence compiler warnings */
604 0 : _exit(1);
605 : }
606 :
607 : #ifdef HAVE_FSYNC_WRITETHROUGH
608 :
/*
 * Flush "fd" using the F_FULLFSYNC fcntl where that is defined.
 *
 * Returns 0 on success and -1 on failure.  On platforms without
 * F_FULLFSYNC this always fails with errno set to ENOSYS.
 */
static int
pg_fsync_writethrough(int fd)
{
#if defined(F_FULLFSYNC)
	if (fcntl(fd, F_FULLFSYNC, 0) == -1)
		return -1;
	return 0;
#else
	errno = ENOSYS;
	return -1;
#endif
}
619 : #endif
620 :
621 : /*
622 : * print out the writes per second for tests
623 : */
624 : static void
625 0 : print_elapse(struct timeval start_t, struct timeval stop_t, int ops)
626 : {
627 0 : double total_time = (stop_t.tv_sec - start_t.tv_sec) +
628 0 : (stop_t.tv_usec - start_t.tv_usec) * 0.000001;
629 0 : double per_second = ops / total_time;
630 0 : double avg_op_time_us = (total_time / ops) * USECS_SEC;
631 :
632 0 : printf(_(OPS_FORMAT), per_second, avg_op_time_us);
633 0 : }
634 :
635 : #ifndef WIN32
636 : static void
637 0 : process_alarm(SIGNAL_ARGS)
638 : {
639 0 : alarm_triggered = true;
640 0 : }
641 : #else
642 : static DWORD WINAPI
643 : process_alarm(LPVOID param)
644 : {
645 : /* WIN32 doesn't support alarm, so we create a thread and sleep here */
646 : Sleep(secs_per_test * 1000);
647 : alarm_triggered = true;
648 : ExitThread(0);
649 : }
650 : #endif
|