Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_test_fsync --- tests all supported fsync() methods
4 : *
5 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
6 : *
7 : * src/bin/pg_test_fsync/pg_test_fsync.c
8 : *
9 : *-------------------------------------------------------------------------
10 : */
11 :
12 : #include "postgres_fe.h"
13 :
14 : #include <limits.h>
15 : #include <sys/stat.h>
16 : #include <sys/time.h>
17 : #include <fcntl.h>
18 : #include <time.h>
19 : #include <unistd.h>
20 : #include <signal.h>
21 :
22 : #include "common/logging.h"
23 : #include "common/pg_prng.h"
24 : #include "getopt_long.h"
25 :
26 : /*
27 : * put the temp files in the local directory
28 : * unless the user specifies otherwise
29 : */
30 : #define FSYNC_FILENAME "./pg_test_fsync.out"
31 :
32 : #define XLOG_BLCKSZ_K (XLOG_BLCKSZ / 1024)
33 :
34 : #define LABEL_FORMAT " %-30s"
35 : #define NA_FORMAT "%21s\n"
36 : /* translator: maintain alignment with NA_FORMAT */
37 : #define OPS_FORMAT gettext_noop("%13.3f ops/sec %6.0f usecs/op\n")
38 : #define USECS_SEC 1000000
39 :
/*
 * Timing macros.  These are macros rather than functions to avoid timing
 * the function call overhead.  They rely on the file-level variables
 * start_t, stop_t, alarm_triggered and secs_per_test, and STOP_TIMER
 * additionally expects an int variable named "ops" in the expanding scope.
 */
#ifndef WIN32
/* Clear the stop flag, arm alarm() for secs_per_test, record start time. */
#define START_TIMER \
do { \
	alarm_triggered = false; \
	alarm(secs_per_test); \
	gettimeofday(&start_t, NULL); \
} while (0)
#else
/*
 * WIN32 doesn't support alarm, so we create a thread and sleep there.
 *
 * CreateThread() signals failure by returning NULL (not
 * INVALID_HANDLE_VALUE).  Nothing ever waits on the thread, so its handle
 * is closed immediately; closing the handle does not terminate the thread.
 */
#define START_TIMER \
do { \
	HANDLE		alarm_thread_handle; \
	alarm_triggered = false; \
	alarm_thread_handle = CreateThread(NULL, 0, process_alarm, NULL, 0, NULL); \
	if (alarm_thread_handle == NULL) \
		pg_fatal("could not create thread for alarm"); \
	CloseHandle(alarm_thread_handle); \
	gettimeofday(&start_t, NULL); \
} while (0)
#endif

/* Record the stop time and print the result line for "ops" operations. */
#define STOP_TIMER	\
do { \
	gettimeofday(&stop_t, NULL); \
	print_elapse(start_t, stop_t, ops); \
} while (0)
65 :
66 :
67 : static const char *progname;
68 :
69 : static unsigned int secs_per_test = 5;
70 : static int needs_unlink = 0;
71 : static char full_buf[DEFAULT_XLOG_SEG_SIZE],
72 : *buf,
73 : *filename = FSYNC_FILENAME;
74 : static struct timeval start_t,
75 : stop_t;
76 : static sig_atomic_t alarm_triggered = false;
77 :
78 :
/* Start-up helpers */
static void handle_args(int argc, char *argv[]);
static void prepare_buf(void);

/* Individual test sections, each printing its own part of the report */
static void test_open(void);
static void test_sync(int writes_per_op);
static void test_open_syncs(void);
static void test_open_sync(const char *msg, int writes_size);
static void test_file_descriptor_sync(void);
static void test_non_sync(void);

/* Timer expiry: a signal handler on Unix, a thread entry point on Windows */
#ifndef WIN32
static void process_alarm(SIGNAL_ARGS);
#else
static DWORD WINAPI process_alarm(LPVOID param);
#endif

/* Handler that removes the test file and exits when we are interrupted */
static void signal_cleanup(SIGNAL_ARGS);

#ifdef HAVE_FSYNC_WRITETHROUGH
static int	pg_fsync_writethrough(int fd);
#endif

/* Result-line printer used by STOP_TIMER */
static void print_elapse(struct timeval start_t, struct timeval stop_t, int ops);

/* Report a translated message followed by the errno text, then exit */
#define die(msg) pg_fatal("%s: %m", _(msg))
101 :
102 :
103 : int
104 10 : main(int argc, char *argv[])
105 : {
106 10 : pg_logging_init(argv[0]);
107 10 : set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_test_fsync"));
108 10 : progname = get_progname(argv[0]);
109 :
110 10 : handle_args(argc, argv);
111 :
112 : /* Prevent leaving behind the test file */
113 0 : pqsignal(SIGINT, signal_cleanup);
114 0 : pqsignal(SIGTERM, signal_cleanup);
115 :
116 : /* the following are not valid on Windows */
117 : #ifndef WIN32
118 0 : pqsignal(SIGALRM, process_alarm);
119 0 : pqsignal(SIGHUP, signal_cleanup);
120 : #endif
121 :
122 0 : pg_prng_seed(&pg_global_prng_state, (uint64) time(NULL));
123 :
124 0 : prepare_buf();
125 :
126 0 : test_open();
127 :
128 : /* Test using 1 XLOG_BLCKSZ write */
129 0 : test_sync(1);
130 :
131 : /* Test using 2 XLOG_BLCKSZ writes */
132 0 : test_sync(2);
133 :
134 0 : test_open_syncs();
135 :
136 0 : test_file_descriptor_sync();
137 :
138 0 : test_non_sync();
139 :
140 0 : unlink(filename);
141 :
142 0 : return 0;
143 : }
144 :
145 : static void
146 10 : handle_args(int argc, char *argv[])
147 : {
148 : static struct option long_options[] = {
149 : {"filename", required_argument, NULL, 'f'},
150 : {"secs-per-test", required_argument, NULL, 's'},
151 : {NULL, 0, NULL, 0}
152 : };
153 :
154 : int option; /* Command line option */
155 10 : int optindex = 0; /* used by getopt_long */
156 : unsigned long optval; /* used for option parsing */
157 : char *endptr;
158 :
159 10 : if (argc > 1)
160 : {
161 10 : if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
162 : {
163 2 : printf(_("Usage: %s [-f FILENAME] [-s SECS-PER-TEST]\n"), progname);
164 2 : exit(0);
165 : }
166 8 : if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
167 : {
168 2 : puts("pg_test_fsync (PostgreSQL) " PG_VERSION);
169 2 : exit(0);
170 : }
171 : }
172 :
173 6 : while ((option = getopt_long(argc, argv, "f:s:",
174 : long_options, &optindex)) != -1)
175 : {
176 6 : switch (option)
177 : {
178 0 : case 'f':
179 0 : filename = pg_strdup(optarg);
180 0 : break;
181 :
182 4 : case 's':
183 4 : errno = 0;
184 4 : optval = strtoul(optarg, &endptr, 10);
185 :
186 4 : if (endptr == optarg || *endptr != '\0' ||
187 2 : errno != 0 || optval != (unsigned int) optval)
188 : {
189 2 : pg_log_error("invalid argument for option %s", "--secs-per-test");
190 2 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
191 2 : exit(1);
192 : }
193 :
194 2 : secs_per_test = (unsigned int) optval;
195 2 : if (secs_per_test == 0)
196 2 : pg_fatal("%s must be in range %u..%u",
197 : "--secs-per-test", 1, UINT_MAX);
198 0 : break;
199 :
200 2 : default:
201 : /* getopt_long already emitted a complaint */
202 2 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
203 2 : exit(1);
204 : }
205 : }
206 :
207 0 : if (argc > optind)
208 : {
209 0 : pg_log_error("too many command-line arguments (first is \"%s\")",
210 : argv[optind]);
211 0 : pg_log_error_hint("Try \"%s --help\" for more information.", progname);
212 0 : exit(1);
213 : }
214 :
215 0 : printf(ngettext("%u second per test\n",
216 : "%u seconds per test\n",
217 : secs_per_test),
218 : secs_per_test);
219 : #if defined(O_DIRECT)
220 0 : printf(_("O_DIRECT supported on this platform for open_datasync and open_sync.\n"));
221 : #elif defined(F_NOCACHE)
222 : printf(_("F_NOCACHE supported on this platform for open_datasync and open_sync.\n"));
223 : #else
224 : printf(_("Direct I/O is not supported on this platform.\n"));
225 : #endif
226 0 : }
227 :
228 : static void
229 0 : prepare_buf(void)
230 : {
231 : int ops;
232 :
233 : /* write random data into buffer */
234 0 : for (ops = 0; ops < DEFAULT_XLOG_SEG_SIZE; ops++)
235 0 : full_buf[ops] = (char) pg_prng_int32(&pg_global_prng_state);
236 :
237 0 : buf = (char *) TYPEALIGN(XLOG_BLCKSZ, full_buf);
238 0 : }
239 :
240 : static void
241 0 : test_open(void)
242 : {
243 : int tmpfile;
244 :
245 : /*
246 : * test if we can open the target file
247 : */
248 0 : if ((tmpfile = open(filename, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
249 0 : die("could not open output file");
250 0 : needs_unlink = 1;
251 0 : if (write(tmpfile, full_buf, DEFAULT_XLOG_SEG_SIZE) !=
252 : DEFAULT_XLOG_SEG_SIZE)
253 0 : die("write failed");
254 :
255 : /* fsync now so that dirty buffers don't skew later tests */
256 0 : if (fsync(tmpfile) != 0)
257 0 : die("fsync failed");
258 :
259 0 : close(tmpfile);
260 0 : }
261 :
/*
 * open() wrapper that requests direct I/O where the platform offers it.
 *
 * With O_DIRECT available the flag is simply added to the open() call.
 * Otherwise, if F_NOCACHE is available, it is applied with fcntl() after a
 * successful open; should that fail, the descriptor is closed and -1 is
 * returned with errno describing the fcntl() failure.  With neither
 * facility this is a plain open().
 *
 * Returns the file descriptor, or -1 on failure.
 */
static int
open_direct(const char *path, int flags, mode_t mode)
{
#if defined(O_DIRECT)
	return open(path, flags | O_DIRECT, mode);
#else
	int			fd = open(path, flags, mode);

#if defined(F_NOCACHE)
	if (fd >= 0 && fcntl(fd, F_NOCACHE, 1) < 0)
	{
		/* close() may overwrite errno, so preserve the fcntl() error */
		int			fcntl_errno = errno;

		close(fd);
		errno = fcntl_errno;
		fd = -1;
	}
#endif

	return fd;
#endif
}
286 :
287 : static void
288 0 : test_sync(int writes_per_op)
289 : {
290 : int tmpfile,
291 : ops,
292 : writes;
293 0 : bool fs_warning = false;
294 :
295 0 : if (writes_per_op == 1)
296 0 : printf(_("\nCompare file sync methods using one %dkB write:\n"), XLOG_BLCKSZ_K);
297 : else
298 0 : printf(_("\nCompare file sync methods using two %dkB writes:\n"), XLOG_BLCKSZ_K);
299 0 : printf(_("(in \"wal_sync_method\" preference order, except fdatasync is Linux's default)\n"));
300 :
301 : /*
302 : * Test open_datasync if available
303 : */
304 0 : printf(LABEL_FORMAT, "open_datasync");
305 0 : fflush(stdout);
306 :
307 : #ifdef O_DSYNC
308 0 : if ((tmpfile = open_direct(filename, O_RDWR | O_DSYNC | PG_BINARY, 0)) == -1)
309 : {
310 0 : printf(NA_FORMAT, _("n/a*"));
311 0 : fs_warning = true;
312 : }
313 : else
314 : {
315 0 : START_TIMER;
316 0 : for (ops = 0; alarm_triggered == false; ops++)
317 : {
318 0 : for (writes = 0; writes < writes_per_op; writes++)
319 0 : if (pg_pwrite(tmpfile,
320 : buf,
321 : XLOG_BLCKSZ,
322 0 : writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
323 0 : die("write failed");
324 : }
325 0 : STOP_TIMER;
326 0 : close(tmpfile);
327 : }
328 : #else
329 : printf(NA_FORMAT, _("n/a"));
330 : #endif
331 :
332 : /*
333 : * Test fdatasync if available
334 : */
335 0 : printf(LABEL_FORMAT, "fdatasync");
336 0 : fflush(stdout);
337 :
338 0 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
339 0 : die("could not open output file");
340 0 : START_TIMER;
341 0 : for (ops = 0; alarm_triggered == false; ops++)
342 : {
343 0 : for (writes = 0; writes < writes_per_op; writes++)
344 0 : if (pg_pwrite(tmpfile,
345 : buf,
346 : XLOG_BLCKSZ,
347 0 : writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
348 0 : die("write failed");
349 0 : fdatasync(tmpfile);
350 : }
351 0 : STOP_TIMER;
352 0 : close(tmpfile);
353 :
354 : /*
355 : * Test fsync
356 : */
357 0 : printf(LABEL_FORMAT, "fsync");
358 0 : fflush(stdout);
359 :
360 0 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
361 0 : die("could not open output file");
362 0 : START_TIMER;
363 0 : for (ops = 0; alarm_triggered == false; ops++)
364 : {
365 0 : for (writes = 0; writes < writes_per_op; writes++)
366 0 : if (pg_pwrite(tmpfile,
367 : buf,
368 : XLOG_BLCKSZ,
369 0 : writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
370 0 : die("write failed");
371 0 : if (fsync(tmpfile) != 0)
372 0 : die("fsync failed");
373 : }
374 0 : STOP_TIMER;
375 0 : close(tmpfile);
376 :
377 : /*
378 : * If fsync_writethrough is available, test as well
379 : */
380 0 : printf(LABEL_FORMAT, "fsync_writethrough");
381 0 : fflush(stdout);
382 :
383 : #ifdef HAVE_FSYNC_WRITETHROUGH
384 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
385 : die("could not open output file");
386 : START_TIMER;
387 : for (ops = 0; alarm_triggered == false; ops++)
388 : {
389 : for (writes = 0; writes < writes_per_op; writes++)
390 : if (pg_pwrite(tmpfile,
391 : buf,
392 : XLOG_BLCKSZ,
393 : writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
394 : die("write failed");
395 : if (pg_fsync_writethrough(tmpfile) != 0)
396 : die("fsync failed");
397 : }
398 : STOP_TIMER;
399 : close(tmpfile);
400 : #else
401 0 : printf(NA_FORMAT, _("n/a"));
402 : #endif
403 :
404 : /*
405 : * Test open_sync if available
406 : */
407 0 : printf(LABEL_FORMAT, "open_sync");
408 0 : fflush(stdout);
409 :
410 : #ifdef O_SYNC
411 0 : if ((tmpfile = open_direct(filename, O_RDWR | O_SYNC | PG_BINARY, 0)) == -1)
412 : {
413 0 : printf(NA_FORMAT, _("n/a*"));
414 0 : fs_warning = true;
415 : }
416 : else
417 : {
418 0 : START_TIMER;
419 0 : for (ops = 0; alarm_triggered == false; ops++)
420 : {
421 0 : for (writes = 0; writes < writes_per_op; writes++)
422 0 : if (pg_pwrite(tmpfile,
423 : buf,
424 : XLOG_BLCKSZ,
425 0 : writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
426 :
427 : /*
428 : * This can generate write failures if the filesystem has
429 : * a large block size, e.g. 4k, and there is no support
430 : * for O_DIRECT writes smaller than the file system block
431 : * size, e.g. XFS.
432 : */
433 0 : die("write failed");
434 : }
435 0 : STOP_TIMER;
436 0 : close(tmpfile);
437 : }
438 : #else
439 : printf(NA_FORMAT, _("n/a"));
440 : #endif
441 :
442 0 : if (fs_warning)
443 : {
444 0 : printf(_("* This file system and its mount options do not support direct\n"
445 : " I/O, e.g. ext4 in journaled mode.\n"));
446 : }
447 0 : }
448 :
449 : static void
450 0 : test_open_syncs(void)
451 : {
452 0 : printf(_("\nCompare open_sync with different write sizes:\n"));
453 0 : printf(_("(This is designed to compare the cost of writing 16kB in different write\n"
454 : "open_sync sizes.)\n"));
455 :
456 0 : test_open_sync(_(" 1 * 16kB open_sync write"), 16);
457 0 : test_open_sync(_(" 2 * 8kB open_sync writes"), 8);
458 0 : test_open_sync(_(" 4 * 4kB open_sync writes"), 4);
459 0 : test_open_sync(_(" 8 * 2kB open_sync writes"), 2);
460 0 : test_open_sync(_("16 * 1kB open_sync writes"), 1);
461 0 : }
462 :
463 : /*
464 : * Test open_sync with different size files
465 : */
466 : static void
467 0 : test_open_sync(const char *msg, int writes_size)
468 : {
469 : #ifdef O_SYNC
470 : int tmpfile,
471 : ops,
472 : writes;
473 : #endif
474 :
475 0 : printf(LABEL_FORMAT, msg);
476 0 : fflush(stdout);
477 :
478 : #ifdef O_SYNC
479 0 : if ((tmpfile = open_direct(filename, O_RDWR | O_SYNC | PG_BINARY, 0)) == -1)
480 0 : printf(NA_FORMAT, _("n/a*"));
481 : else
482 : {
483 0 : START_TIMER;
484 0 : for (ops = 0; alarm_triggered == false; ops++)
485 : {
486 0 : for (writes = 0; writes < 16 / writes_size; writes++)
487 0 : if (pg_pwrite(tmpfile,
488 : buf,
489 0 : writes_size * 1024,
490 0 : writes * writes_size * 1024) !=
491 0 : writes_size * 1024)
492 0 : die("write failed");
493 : }
494 0 : STOP_TIMER;
495 0 : close(tmpfile);
496 : }
497 : #else
498 : printf(NA_FORMAT, _("n/a"));
499 : #endif
500 0 : }
501 :
502 : static void
503 0 : test_file_descriptor_sync(void)
504 : {
505 : int tmpfile,
506 : ops;
507 :
508 : /*
509 : * Test whether fsync can sync data written on a different descriptor for
510 : * the same file. This checks the efficiency of multi-process fsyncs
511 : * against the same file. Possibly this should be done with writethrough
512 : * on platforms which support it.
513 : */
514 0 : printf(_("\nTest if fsync on non-write file descriptor is honored:\n"));
515 0 : printf(_("(If the times are similar, fsync() can sync data written on a different\n"
516 : "descriptor.)\n"));
517 :
518 : /*
519 : * first write, fsync and close, which is the normal behavior without
520 : * multiple descriptors
521 : */
522 0 : printf(LABEL_FORMAT, "write, fsync, close");
523 0 : fflush(stdout);
524 :
525 0 : START_TIMER;
526 0 : for (ops = 0; alarm_triggered == false; ops++)
527 : {
528 0 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
529 0 : die("could not open output file");
530 0 : if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
531 0 : die("write failed");
532 0 : if (fsync(tmpfile) != 0)
533 0 : die("fsync failed");
534 0 : close(tmpfile);
535 :
536 : /*
537 : * open and close the file again to be consistent with the following
538 : * test
539 : */
540 0 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
541 0 : die("could not open output file");
542 0 : close(tmpfile);
543 : }
544 0 : STOP_TIMER;
545 :
546 : /*
547 : * Now open, write, close, open again and fsync This simulates processes
548 : * fsyncing each other's writes.
549 : */
550 0 : printf(LABEL_FORMAT, "write, close, fsync");
551 0 : fflush(stdout);
552 :
553 0 : START_TIMER;
554 0 : for (ops = 0; alarm_triggered == false; ops++)
555 : {
556 0 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
557 0 : die("could not open output file");
558 0 : if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
559 0 : die("write failed");
560 0 : close(tmpfile);
561 : /* reopen file */
562 0 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
563 0 : die("could not open output file");
564 0 : if (fsync(tmpfile) != 0)
565 0 : die("fsync failed");
566 0 : close(tmpfile);
567 : }
568 0 : STOP_TIMER;
569 0 : }
570 :
571 : static void
572 0 : test_non_sync(void)
573 : {
574 : int tmpfile,
575 : ops;
576 :
577 : /*
578 : * Test a simple write without fsync
579 : */
580 0 : printf(_("\nNon-sync'ed %dkB writes:\n"), XLOG_BLCKSZ_K);
581 0 : printf(LABEL_FORMAT, "write");
582 0 : fflush(stdout);
583 :
584 0 : if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
585 0 : die("could not open output file");
586 0 : START_TIMER;
587 0 : for (ops = 0; alarm_triggered == false; ops++)
588 : {
589 0 : if (pg_pwrite(tmpfile, buf, XLOG_BLCKSZ, 0) != XLOG_BLCKSZ)
590 0 : die("write failed");
591 : }
592 0 : STOP_TIMER;
593 0 : close(tmpfile);
594 0 : }
595 :
596 : static void
597 0 : signal_cleanup(SIGNAL_ARGS)
598 : {
599 : int rc;
600 :
601 : /* Delete the file if it exists. Ignore errors */
602 0 : if (needs_unlink)
603 0 : unlink(filename);
604 : /* Finish incomplete line on stdout */
605 0 : rc = write(STDOUT_FILENO, "\n", 1);
606 : (void) rc; /* silence compiler warnings */
607 0 : _exit(1);
608 : }
609 :
#ifdef HAVE_FSYNC_WRITETHROUGH

/*
 * Sync fd using fcntl(F_FULLFSYNC) where that command exists.
 *
 * Returns 0 on success and -1 on failure.  When F_FULLFSYNC is not
 * defined, always fails with errno set to ENOSYS.
 */
static int
pg_fsync_writethrough(int fd)
{
#if defined(F_FULLFSYNC)
	if (fcntl(fd, F_FULLFSYNC, 0) == -1)
		return -1;
	return 0;
#else
	errno = ENOSYS;
	return -1;
#endif
}
#endif
623 :
624 : /*
625 : * print out the writes per second for tests
626 : */
627 : static void
628 0 : print_elapse(struct timeval start_t, struct timeval stop_t, int ops)
629 : {
630 0 : double total_time = (stop_t.tv_sec - start_t.tv_sec) +
631 0 : (stop_t.tv_usec - start_t.tv_usec) * 0.000001;
632 0 : double per_second = ops / total_time;
633 0 : double avg_op_time_us = (total_time / ops) * USECS_SEC;
634 :
635 0 : printf(_(OPS_FORMAT), per_second, avg_op_time_us);
636 0 : }
637 :
638 : #ifndef WIN32
639 : static void
640 0 : process_alarm(SIGNAL_ARGS)
641 : {
642 0 : alarm_triggered = true;
643 0 : }
644 : #else
645 : static DWORD WINAPI
646 : process_alarm(LPVOID param)
647 : {
648 : /* WIN32 doesn't support alarm, so we create a thread and sleep here */
649 : Sleep(secs_per_test * 1000);
650 : alarm_triggered = true;
651 : ExitThread(0);
652 : }
653 : #endif
|