Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * File-processing utility routines.
4 : *
5 : * Assorted utility functions to work on files.
6 : *
7 : *
8 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
9 : * Portions Copyright (c) 1994, Regents of the University of California
10 : *
11 : * src/common/file_utils.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #ifndef FRONTEND
17 : #include "postgres.h"
18 : #else
19 : #include "postgres_fe.h"
20 : #endif
21 :
22 : #include <dirent.h>
23 : #include <fcntl.h>
24 : #include <sys/stat.h>
25 : #include <unistd.h>
26 :
27 : #include "common/file_utils.h"
28 : #ifdef FRONTEND
29 : #include "common/logging.h"
30 : #endif
31 : #include "port/pg_iovec.h"
32 :
33 : #ifdef FRONTEND
34 :
35 : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
36 : #if defined(HAVE_SYNC_FILE_RANGE)
37 : #define PG_FLUSH_DATA_WORKS 1
38 : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
39 : #define PG_FLUSH_DATA_WORKS 1
40 : #endif
41 :
42 : /*
43 : * pg_xlog has been renamed to pg_wal in version 10.
44 : */
45 : #define MINIMUM_VERSION_FOR_PG_WAL 100000
46 :
47 : #ifdef PG_FLUSH_DATA_WORKS
48 : static int pre_sync_fname(const char *fname, bool isdir);
49 : #endif
50 : static void walkdir(const char *path,
51 : int (*action) (const char *fname, bool isdir),
52 : bool process_symlinks);
53 :
54 : #ifdef HAVE_SYNCFS
55 :
/*
 * do_syncfs -- ask the kernel to flush the file system that holds "path"
 *
 * If "path" cannot be opened, the problem is logged and we simply return.
 * If syncfs() itself fails, the problem is logged and the program exits.
 */
static void
do_syncfs(const char *path)
{
	int			fd = open(path, O_RDONLY, 0);

	if (fd < 0)
	{
		pg_log_error("could not open file \"%s\": %m", path);
		return;
	}

	if (syncfs(fd) >= 0)
	{
		(void) close(fd);
		return;
	}

	/* Log before closing, so that %m still reflects the syncfs() failure. */
	pg_log_error("could not synchronize file system for file \"%s\": %m", path);
	(void) close(fd);
	exit(EXIT_FAILURE);
}
83 :
84 : #endif /* HAVE_SYNCFS */
85 :
86 : /*
87 : * Synchronize PGDATA and all its contents.
88 : *
89 : * We sync regular files and directories wherever they are, but we follow
90 : * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc.
91 : * Other symlinks are presumed to point at files we're not responsible for
92 : * syncing, and might not have privileges to write at all.
93 : *
94 : * serverVersion indicates the version of the server to be sync'd.
95 : */
96 : void
97 10 : sync_pgdata(const char *pg_data,
98 : int serverVersion,
99 : DataDirSyncMethod sync_method)
100 : {
101 : bool xlog_is_symlink;
102 : char pg_wal[MAXPGPATH];
103 : char pg_tblspc[MAXPGPATH];
104 :
105 : /* handle renaming of pg_xlog to pg_wal in post-10 clusters */
106 10 : snprintf(pg_wal, MAXPGPATH, "%s/%s", pg_data,
107 : serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal");
108 10 : snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pg_data);
109 :
110 : /*
111 : * If pg_wal is a symlink, we'll need to recurse into it separately,
112 : * because the first walkdir below will ignore it.
113 : */
114 10 : xlog_is_symlink = false;
115 :
116 : {
117 : struct stat st;
118 :
119 10 : if (lstat(pg_wal, &st) < 0)
120 0 : pg_log_error("could not stat file \"%s\": %m", pg_wal);
121 10 : else if (S_ISLNK(st.st_mode))
122 4 : xlog_is_symlink = true;
123 : }
124 :
125 10 : switch (sync_method)
126 : {
127 2 : case DATA_DIR_SYNC_METHOD_SYNCFS:
128 : {
129 : #ifndef HAVE_SYNCFS
130 : pg_log_error("this build does not support sync method \"%s\"",
131 : "syncfs");
132 : exit(EXIT_FAILURE);
133 : #else
134 : DIR *dir;
135 : struct dirent *de;
136 :
137 : /*
138 : * On Linux, we don't have to open every single file one by
139 : * one. We can use syncfs() to sync whole filesystems. We
140 : * only expect filesystem boundaries to exist where we
141 : * tolerate symlinks, namely pg_wal and the tablespaces, so we
142 : * call syncfs() for each of those directories.
143 : */
144 :
145 : /* Sync the top level pgdata directory. */
146 2 : do_syncfs(pg_data);
147 :
148 : /* If any tablespaces are configured, sync each of those. */
149 2 : dir = opendir(pg_tblspc);
150 2 : if (dir == NULL)
151 0 : pg_log_error("could not open directory \"%s\": %m",
152 : pg_tblspc);
153 : else
154 : {
155 6 : while (errno = 0, (de = readdir(dir)) != NULL)
156 : {
157 : char subpath[MAXPGPATH * 2];
158 :
159 4 : if (strcmp(de->d_name, ".") == 0 ||
160 2 : strcmp(de->d_name, "..") == 0)
161 4 : continue;
162 :
163 0 : snprintf(subpath, sizeof(subpath), "%s/%s",
164 0 : pg_tblspc, de->d_name);
165 0 : do_syncfs(subpath);
166 : }
167 :
168 2 : if (errno)
169 0 : pg_log_error("could not read directory \"%s\": %m",
170 : pg_tblspc);
171 :
172 2 : (void) closedir(dir);
173 : }
174 :
175 : /* If pg_wal is a symlink, process that too. */
176 2 : if (xlog_is_symlink)
177 2 : do_syncfs(pg_wal);
178 : #endif /* HAVE_SYNCFS */
179 : }
180 2 : break;
181 :
182 8 : case DATA_DIR_SYNC_METHOD_FSYNC:
183 : {
184 : /*
185 : * If possible, hint to the kernel that we're soon going to
186 : * fsync the data directory and its contents.
187 : */
188 : #ifdef PG_FLUSH_DATA_WORKS
189 8 : walkdir(pg_data, pre_sync_fname, false);
190 8 : if (xlog_is_symlink)
191 2 : walkdir(pg_wal, pre_sync_fname, false);
192 8 : walkdir(pg_tblspc, pre_sync_fname, true);
193 : #endif
194 :
195 : /*
196 : * Now we do the fsync()s in the same order.
197 : *
198 : * The main call ignores symlinks, so in addition to specially
199 : * processing pg_wal if it's a symlink, pg_tblspc has to be
200 : * visited separately with process_symlinks = true. Note that
201 : * if there are any plain directories in pg_tblspc, they'll
202 : * get fsync'd twice. That's not an expected case so we don't
203 : * worry about optimizing it.
204 : */
205 8 : walkdir(pg_data, fsync_fname, false);
206 8 : if (xlog_is_symlink)
207 2 : walkdir(pg_wal, fsync_fname, false);
208 8 : walkdir(pg_tblspc, fsync_fname, true);
209 : }
210 8 : break;
211 : }
212 10 : }
213 :
214 : /*
215 : * Synchronize the given directory and all its contents.
216 : *
217 : * This is a convenient wrapper on top of walkdir() and do_syncfs().
218 : */
219 : void
220 8 : sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method)
221 : {
222 8 : switch (sync_method)
223 : {
224 0 : case DATA_DIR_SYNC_METHOD_SYNCFS:
225 : {
226 : #ifndef HAVE_SYNCFS
227 : pg_log_error("this build does not support sync method \"%s\"",
228 : "syncfs");
229 : exit(EXIT_FAILURE);
230 : #else
231 : /*
232 : * On Linux, we don't have to open every single file one by
233 : * one. We can use syncfs() to sync the whole filesystem.
234 : */
235 0 : do_syncfs(dir);
236 : #endif /* HAVE_SYNCFS */
237 : }
238 0 : break;
239 :
240 8 : case DATA_DIR_SYNC_METHOD_FSYNC:
241 : {
242 : /*
243 : * If possible, hint to the kernel that we're soon going to
244 : * fsync the data directory and its contents.
245 : */
246 : #ifdef PG_FLUSH_DATA_WORKS
247 8 : walkdir(dir, pre_sync_fname, false);
248 : #endif
249 :
250 8 : walkdir(dir, fsync_fname, false);
251 : }
252 8 : break;
253 : }
254 8 : }
255 :
256 : /*
257 : * walkdir: recursively walk a directory, applying the action to each
258 : * regular file and directory (including the named directory itself).
259 : *
260 : * If process_symlinks is true, the action and recursion are also applied
261 : * to regular files and directories that are pointed to by symlinks in the
262 : * given directory; otherwise symlinks are ignored. Symlinks are always
263 : * ignored in subdirectories, ie we intentionally don't pass down the
264 : * process_symlinks flag to recursive calls.
265 : *
266 : * Errors are reported but not considered fatal.
267 : *
268 : * See also walkdir in fd.c, which is a backend version of this logic.
269 : */
270 : static void
271 452 : walkdir(const char *path,
272 : int (*action) (const char *fname, bool isdir),
273 : bool process_symlinks)
274 : {
275 : DIR *dir;
276 : struct dirent *de;
277 :
278 452 : dir = opendir(path);
279 452 : if (dir == NULL)
280 : {
281 0 : pg_log_error("could not open directory \"%s\": %m", path);
282 0 : return;
283 : }
284 :
285 17828 : while (errno = 0, (de = readdir(dir)) != NULL)
286 : {
287 : char subpath[MAXPGPATH * 2];
288 :
289 17376 : if (strcmp(de->d_name, ".") == 0 ||
290 16924 : strcmp(de->d_name, "..") == 0)
291 904 : continue;
292 :
293 16472 : snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
294 :
295 16472 : switch (get_dirent_type(subpath, de, process_symlinks, PG_LOG_ERROR))
296 : {
297 16068 : case PGFILETYPE_REG:
298 16068 : (*action) (subpath, false);
299 16068 : break;
300 400 : case PGFILETYPE_DIR:
301 400 : walkdir(subpath, action, false);
302 400 : break;
303 4 : default:
304 :
305 : /*
306 : * Errors are already reported directly by get_dirent_type(),
307 : * and any remaining symlinks and unknown file types are
308 : * ignored.
309 : */
310 4 : break;
311 : }
312 : }
313 :
314 452 : if (errno)
315 0 : pg_log_error("could not read directory \"%s\": %m", path);
316 :
317 452 : (void) closedir(dir);
318 :
319 : /*
320 : * It's important to fsync the destination directory itself as individual
321 : * file fsyncs don't guarantee that the directory entry for the file is
322 : * synced. Recent versions of ext4 have made the window much wider but
323 : * it's been an issue for ext3 and other filesystems in the past.
324 : */
325 452 : (*action) (path, true);
326 : }
327 :
328 : /*
329 : * Hint to the OS that it should get ready to fsync() this file.
330 : *
331 : * Ignores errors trying to open unreadable files, and reports other errors
332 : * non-fatally.
333 : */
334 : #ifdef PG_FLUSH_DATA_WORKS
335 :
336 : static int
337 8260 : pre_sync_fname(const char *fname, bool isdir)
338 : {
339 : int fd;
340 :
341 8260 : fd = open(fname, O_RDONLY | PG_BINARY, 0);
342 :
343 8260 : if (fd < 0)
344 : {
345 0 : if (errno == EACCES || (isdir && errno == EISDIR))
346 0 : return 0;
347 0 : pg_log_error("could not open file \"%s\": %m", fname);
348 0 : return -1;
349 : }
350 :
351 : /*
352 : * We do what pg_flush_data() would do in the backend: prefer to use
353 : * sync_file_range, but fall back to posix_fadvise. We ignore errors
354 : * because this is only a hint.
355 : */
356 : #if defined(HAVE_SYNC_FILE_RANGE)
357 8260 : (void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
358 : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
359 : (void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
360 : #else
361 : #error PG_FLUSH_DATA_WORKS should not have been defined
362 : #endif
363 :
364 8260 : (void) close(fd);
365 8260 : return 0;
366 : }
367 :
368 : #endif /* PG_FLUSH_DATA_WORKS */
369 :
370 : /*
371 : * fsync_fname -- Try to fsync a file or directory
372 : *
373 : * Ignores errors trying to open unreadable files, or trying to fsync
374 : * directories on systems where that isn't allowed/required. All other errors
375 : * are fatal.
376 : */
377 : int
378 8366 : fsync_fname(const char *fname, bool isdir)
379 : {
380 : int fd;
381 : int flags;
382 : int returncode;
383 :
384 : /*
385 : * Some OSs require directories to be opened read-only whereas other
386 : * systems don't allow us to fsync files opened read-only; so we need both
387 : * cases here. Using O_RDWR will cause us to fail to fsync files that are
388 : * not writable by our userid, but we assume that's OK.
389 : */
390 8366 : flags = PG_BINARY;
391 8366 : if (!isdir)
392 8104 : flags |= O_RDWR;
393 : else
394 262 : flags |= O_RDONLY;
395 :
396 : /*
397 : * Open the file, silently ignoring errors about unreadable files (or
398 : * unsupported operations, e.g. opening a directory under Windows), and
399 : * logging others.
400 : */
401 8366 : fd = open(fname, flags, 0);
402 8366 : if (fd < 0)
403 : {
404 0 : if (errno == EACCES || (isdir && errno == EISDIR))
405 0 : return 0;
406 0 : pg_log_error("could not open file \"%s\": %m", fname);
407 0 : return -1;
408 : }
409 :
410 8366 : returncode = fsync(fd);
411 :
412 : /*
413 : * Some OSes don't allow us to fsync directories at all, so we can ignore
414 : * those errors. Anything else needs to be reported.
415 : */
416 8366 : if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
417 : {
418 0 : pg_log_error("could not fsync file \"%s\": %m", fname);
419 0 : (void) close(fd);
420 0 : exit(EXIT_FAILURE);
421 : }
422 :
423 8366 : (void) close(fd);
424 8366 : return 0;
425 : }
426 :
427 : /*
428 : * fsync_parent_path -- fsync the parent path of a file or directory
429 : *
430 : * This is aimed at making file operations persistent on disk in case of
431 : * an OS crash or power failure.
432 : */
433 : int
434 28 : fsync_parent_path(const char *fname)
435 : {
436 : char parentpath[MAXPGPATH];
437 :
438 28 : strlcpy(parentpath, fname, MAXPGPATH);
439 28 : get_parent_directory(parentpath);
440 :
441 : /*
442 : * get_parent_directory() returns an empty string if the input argument is
443 : * just a file name (see comments in path.c), so handle that as being the
444 : * current directory.
445 : */
446 28 : if (strlen(parentpath) == 0)
447 0 : strlcpy(parentpath, ".", MAXPGPATH);
448 :
449 28 : if (fsync_fname(parentpath, true) != 0)
450 0 : return -1;
451 :
452 28 : return 0;
453 : }
454 :
455 : /*
456 : * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
457 : *
458 : * Wrapper around rename, similar to the backend version.
459 : */
460 : int
461 6 : durable_rename(const char *oldfile, const char *newfile)
462 : {
463 : int fd;
464 :
465 : /*
466 : * First fsync the old and target path (if it exists), to ensure that they
467 : * are properly persistent on disk. Syncing the target file is not
468 : * strictly necessary, but it makes it easier to reason about crashes;
469 : * because it's then guaranteed that either source or target file exists
470 : * after a crash.
471 : */
472 6 : if (fsync_fname(oldfile, false) != 0)
473 0 : return -1;
474 :
475 6 : fd = open(newfile, PG_BINARY | O_RDWR, 0);
476 6 : if (fd < 0)
477 : {
478 6 : if (errno != ENOENT)
479 : {
480 0 : pg_log_error("could not open file \"%s\": %m", newfile);
481 0 : return -1;
482 : }
483 : }
484 : else
485 : {
486 0 : if (fsync(fd) != 0)
487 : {
488 0 : pg_log_error("could not fsync file \"%s\": %m", newfile);
489 0 : close(fd);
490 0 : exit(EXIT_FAILURE);
491 : }
492 0 : close(fd);
493 : }
494 :
495 : /* Time to do the real deal... */
496 6 : if (rename(oldfile, newfile) != 0)
497 : {
498 0 : pg_log_error("could not rename file \"%s\" to \"%s\": %m",
499 : oldfile, newfile);
500 0 : return -1;
501 : }
502 :
503 : /*
504 : * To guarantee renaming the file is persistent, fsync the file with its
505 : * new name, and its containing directory.
506 : */
507 6 : if (fsync_fname(newfile, false) != 0)
508 0 : return -1;
509 :
510 6 : if (fsync_parent_path(newfile) != 0)
511 0 : return -1;
512 :
513 6 : return 0;
514 : }
515 :
516 : #endif /* FRONTEND */
517 :
518 : /*
519 : * Return the type of a directory entry.
520 : *
521 : * In frontend code, elevel should be a level from logging.h; in backend code
522 : * it should be a level from elog.h.
523 : */
524 : PGFileType
525 319896 : get_dirent_type(const char *path,
526 : const struct dirent *de,
527 : bool look_through_symlinks,
528 : int elevel)
529 : {
530 : PGFileType result;
531 :
532 : /*
533 : * Some systems tell us the type directly in the dirent struct, but that's
534 : * a BSD and Linux extension not required by POSIX. Even when the
535 : * interface is present, sometimes the type is unknown, depending on the
536 : * filesystem.
537 : */
538 : #if defined(DT_REG) && defined(DT_DIR) && defined(DT_LNK)
539 319896 : if (de->d_type == DT_REG)
540 314366 : result = PGFILETYPE_REG;
541 5530 : else if (de->d_type == DT_DIR)
542 5492 : result = PGFILETYPE_DIR;
543 38 : else if (de->d_type == DT_LNK && !look_through_symlinks)
544 38 : result = PGFILETYPE_LNK;
545 : else
546 0 : result = PGFILETYPE_UNKNOWN;
547 : #else
548 : result = PGFILETYPE_UNKNOWN;
549 : #endif
550 :
551 319896 : if (result == PGFILETYPE_UNKNOWN)
552 : {
553 : struct stat fst;
554 : int sret;
555 :
556 :
557 0 : if (look_through_symlinks)
558 0 : sret = stat(path, &fst);
559 : else
560 0 : sret = lstat(path, &fst);
561 :
562 0 : if (sret < 0)
563 : {
564 0 : result = PGFILETYPE_ERROR;
565 : #ifdef FRONTEND
566 0 : pg_log_generic(elevel, PG_LOG_PRIMARY, "could not stat file \"%s\": %m", path);
567 : #else
568 0 : ereport(elevel,
569 : (errcode_for_file_access(),
570 : errmsg("could not stat file \"%s\": %m", path)));
571 : #endif
572 : }
573 0 : else if (S_ISREG(fst.st_mode))
574 0 : result = PGFILETYPE_REG;
575 0 : else if (S_ISDIR(fst.st_mode))
576 0 : result = PGFILETYPE_DIR;
577 0 : else if (S_ISLNK(fst.st_mode))
578 0 : result = PGFILETYPE_LNK;
579 : }
580 :
581 319896 : return result;
582 : }
583 :
584 : /*
585 : * pg_pwritev_with_retry
586 : *
587 : * Convenience wrapper for pg_pwritev() that retries on partial write. If an
588 : * error is returned, it is unspecified how much has been written.
589 : */
590 : ssize_t
591 393250 : pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
592 : {
593 : struct iovec iov_copy[PG_IOV_MAX];
594 393250 : ssize_t sum = 0;
595 : ssize_t part;
596 :
597 : /* We'd better have space to make a copy, in case we need to retry. */
598 393250 : if (iovcnt > PG_IOV_MAX)
599 : {
600 0 : errno = EINVAL;
601 0 : return -1;
602 : }
603 :
604 : for (;;)
605 : {
606 : /* Write as much as we can. */
607 393250 : part = pg_pwritev(fd, iov, iovcnt, offset);
608 393250 : if (part < 0)
609 0 : return -1;
610 :
611 : #ifdef SIMULATE_SHORT_WRITE
612 : part = Min(part, 4096);
613 : #endif
614 :
615 : /* Count our progress. */
616 393250 : sum += part;
617 393250 : offset += part;
618 :
619 : /* Step over iovecs that are done. */
620 2107788 : while (iovcnt > 0 && iov->iov_len <= part)
621 : {
622 1714538 : part -= iov->iov_len;
623 1714538 : ++iov;
624 1714538 : --iovcnt;
625 : }
626 :
627 : /* Are they all done? */
628 393250 : if (iovcnt == 0)
629 : {
630 : /* We don't expect the kernel to write more than requested. */
631 : Assert(part == 0);
632 393250 : break;
633 : }
634 :
635 : /*
636 : * Move whatever's left to the front of our mutable copy and adjust
637 : * the leading iovec.
638 : */
639 : Assert(iovcnt > 0);
640 0 : memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
641 : Assert(iov->iov_len > part);
642 0 : iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
643 0 : iov_copy[0].iov_len -= part;
644 0 : iov = iov_copy;
645 : }
646 :
647 393250 : return sum;
648 : }
649 :
650 : /*
651 : * pg_pwrite_zeros
652 : *
653 : * Writes zeros to file worth "size" bytes at "offset" (from the start of the
654 : * file), using vectored I/O.
655 : *
656 : * Returns the total amount of data written. On failure, a negative value
657 : * is returned with errno set.
658 : */
659 : ssize_t
660 352096 : pg_pwrite_zeros(int fd, size_t size, off_t offset)
661 : {
662 : static const PGIOAlignedBlock zbuffer = {{0}}; /* worth BLCKSZ */
663 352096 : void *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data;
664 : struct iovec iov[PG_IOV_MAX];
665 352096 : size_t remaining_size = size;
666 352096 : ssize_t total_written = 0;
667 :
668 : /* Loop, writing as many blocks as we can for each system call. */
669 745346 : while (remaining_size > 0)
670 : {
671 393250 : int iovcnt = 0;
672 : ssize_t written;
673 :
674 2107788 : for (; iovcnt < PG_IOV_MAX && remaining_size > 0; iovcnt++)
675 : {
676 : size_t this_iov_size;
677 :
678 1714538 : iov[iovcnt].iov_base = zerobuf_addr;
679 :
680 1714538 : if (remaining_size < BLCKSZ)
681 0 : this_iov_size = remaining_size;
682 : else
683 1714538 : this_iov_size = BLCKSZ;
684 :
685 1714538 : iov[iovcnt].iov_len = this_iov_size;
686 1714538 : remaining_size -= this_iov_size;
687 : }
688 :
689 393250 : written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
690 :
691 393250 : if (written < 0)
692 0 : return written;
693 :
694 393250 : offset += written;
695 393250 : total_written += written;
696 : }
697 :
698 : Assert(total_written == size);
699 :
700 352096 : return total_written;
701 : }
|