LCOV - code coverage report
Current view: top level - src/common - file_utils.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 157 216 72.7 %
Date: 2025-04-01 14:15:22 Functions: 12 12 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * File-processing utility routines.
       4             :  *
       5             :  * Assorted utility functions to work on files.
       6             :  *
       7             :  *
       8             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       9             :  * Portions Copyright (c) 1994, Regents of the University of California
      10             :  *
      11             :  * src/common/file_utils.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #ifndef FRONTEND
      17             : #include "postgres.h"
      18             : #else
      19             : #include "postgres_fe.h"
      20             : #endif
      21             : 
      22             : #include <dirent.h>
      23             : #include <fcntl.h>
      24             : #include <sys/stat.h>
      25             : #include <unistd.h>
      26             : 
      27             : #include "common/file_utils.h"
      28             : #ifdef FRONTEND
      29             : #include "common/logging.h"
      30             : #endif
      31             : #include "common/relpath.h"
      32             : #include "port/pg_iovec.h"
      33             : 
      34             : #ifdef FRONTEND
      35             : 
      36             : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
      37             : #if defined(HAVE_SYNC_FILE_RANGE)
      38             : #define PG_FLUSH_DATA_WORKS 1
      39             : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
      40             : #define PG_FLUSH_DATA_WORKS 1
      41             : #endif
      42             : 
      43             : /*
      44             :  * pg_xlog has been renamed to pg_wal in version 10.
      45             :  */
      46             : #define MINIMUM_VERSION_FOR_PG_WAL  100000
      47             : 
      48             : static void walkdir(const char *path,
      49             :                     int (*action) (const char *fname, bool isdir),
      50             :                     bool process_symlinks,
      51             :                     const char *exclude_dir);
      52             : 
      53             : #ifdef HAVE_SYNCFS
      54             : 
      55             : /*
      56             :  * do_syncfs -- Try to syncfs a file system
      57             :  *
      58             :  * Reports errors trying to open the path.  syncfs() errors are fatal.
      59             :  */
      60             : static void
      61           4 : do_syncfs(const char *path)
      62             : {
      63             :     int         fd;
      64             : 
      65           4 :     fd = open(path, O_RDONLY, 0);
      66             : 
      67           4 :     if (fd < 0)
      68             :     {
      69           0 :         pg_log_error("could not open file \"%s\": %m", path);
      70           0 :         return;
      71             :     }
      72             : 
      73           4 :     if (syncfs(fd) < 0)
      74             :     {
      75           0 :         pg_log_error("could not synchronize file system for file \"%s\": %m", path);
      76           0 :         (void) close(fd);
      77           0 :         exit(EXIT_FAILURE);
      78             :     }
      79             : 
      80           4 :     (void) close(fd);
      81             : }
      82             : 
      83             : #endif                          /* HAVE_SYNCFS */
      84             : 
      85             : /*
      86             :  * Synchronize PGDATA and all its contents.
      87             :  *
      88             :  * We sync regular files and directories wherever they are, but we follow
      89             :  * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc.
      90             :  * Other symlinks are presumed to point at files we're not responsible for
      91             :  * syncing, and might not have privileges to write at all.
      92             :  *
      93             :  * serverVersion indicates the version of the server to be sync'd.
      94             :  *
      95             :  * If sync_data_files is false, this function skips syncing "base/" and any
      96             :  * other tablespace directories.
      97             :  */
      98             : void
      99          32 : sync_pgdata(const char *pg_data,
     100             :             int serverVersion,
     101             :             DataDirSyncMethod sync_method,
     102             :             bool sync_data_files)
     103             : {
     104             :     bool        xlog_is_symlink;
     105             :     char        pg_wal[MAXPGPATH];
     106             :     char        pg_tblspc[MAXPGPATH];
     107             : 
     108             :     /* handle renaming of pg_xlog to pg_wal in post-10 clusters */
     109          32 :     snprintf(pg_wal, MAXPGPATH, "%s/%s", pg_data,
     110             :              serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal");
     111          32 :     snprintf(pg_tblspc, MAXPGPATH, "%s/%s", pg_data, PG_TBLSPC_DIR);
     112             : 
     113             :     /*
     114             :      * If pg_wal is a symlink, we'll need to recurse into it separately,
     115             :      * because the first walkdir below will ignore it.
     116             :      */
     117          32 :     xlog_is_symlink = false;
     118             : 
     119             :     {
     120             :         struct stat st;
     121             : 
     122          32 :         if (lstat(pg_wal, &st) < 0)
     123           0 :             pg_log_error("could not stat file \"%s\": %m", pg_wal);
     124          32 :         else if (S_ISLNK(st.st_mode))
     125           6 :             xlog_is_symlink = true;
     126             :     }
     127             : 
     128          32 :     switch (sync_method)
     129             :     {
     130           2 :         case DATA_DIR_SYNC_METHOD_SYNCFS:
     131             :             {
     132             : #ifndef HAVE_SYNCFS
     133             :                 pg_log_error("this build does not support sync method \"%s\"",
     134             :                              "syncfs");
     135             :                 exit(EXIT_FAILURE);
     136             : #else
     137             :                 DIR        *dir;
     138             :                 struct dirent *de;
     139             : 
     140             :                 /*
     141             :                  * On Linux, we don't have to open every single file one by
     142             :                  * one.  We can use syncfs() to sync whole filesystems.  We
     143             :                  * only expect filesystem boundaries to exist where we
     144             :                  * tolerate symlinks, namely pg_wal and the tablespaces, so we
     145             :                  * call syncfs() for each of those directories.
     146             :                  */
     147             : 
     148             :                 /* Sync the top level pgdata directory. */
     149           2 :                 do_syncfs(pg_data);
     150             : 
     151             :                 /* If any tablespaces are configured, sync each of those. */
     152           2 :                 if (sync_data_files)
     153             :                 {
     154           2 :                     dir = opendir(pg_tblspc);
     155           2 :                     if (dir == NULL)
     156           0 :                         pg_log_error("could not open directory \"%s\": %m",
     157             :                                      pg_tblspc);
     158             :                     else
     159             :                     {
     160           6 :                         while (errno = 0, (de = readdir(dir)) != NULL)
     161             :                         {
     162             :                             char        subpath[MAXPGPATH * 2];
     163             : 
     164           4 :                             if (strcmp(de->d_name, ".") == 0 ||
     165           2 :                                 strcmp(de->d_name, "..") == 0)
     166           4 :                                 continue;
     167             : 
     168           0 :                             snprintf(subpath, sizeof(subpath), "%s/%s",
     169           0 :                                      pg_tblspc, de->d_name);
     170           0 :                             do_syncfs(subpath);
     171             :                         }
     172             : 
     173           2 :                         if (errno)
     174           0 :                             pg_log_error("could not read directory \"%s\": %m",
     175             :                                          pg_tblspc);
     176             : 
     177           2 :                         (void) closedir(dir);
     178             :                     }
     179             :                 }
     180             : 
     181             :                 /* If pg_wal is a symlink, process that too. */
     182           2 :                 if (xlog_is_symlink)
     183           2 :                     do_syncfs(pg_wal);
     184             : #endif                          /* HAVE_SYNCFS */
     185             :             }
     186           2 :             break;
     187             : 
     188          30 :         case DATA_DIR_SYNC_METHOD_FSYNC:
     189             :             {
     190          30 :                 char       *exclude_dir = NULL;
     191             : 
     192          30 :                 if (!sync_data_files)
     193           2 :                     exclude_dir = psprintf("%s/base", pg_data);
     194             : 
     195             :                 /*
     196             :                  * If possible, hint to the kernel that we're soon going to
     197             :                  * fsync the data directory and its contents.
     198             :                  */
     199             : #ifdef PG_FLUSH_DATA_WORKS
     200          30 :                 walkdir(pg_data, pre_sync_fname, false, exclude_dir);
     201          30 :                 if (xlog_is_symlink)
     202           4 :                     walkdir(pg_wal, pre_sync_fname, false, NULL);
     203          30 :                 if (sync_data_files)
     204          28 :                     walkdir(pg_tblspc, pre_sync_fname, true, NULL);
     205             : #endif
     206             : 
     207             :                 /*
     208             :                  * Now we do the fsync()s in the same order.
     209             :                  *
     210             :                  * The main call ignores symlinks, so in addition to specially
     211             :                  * processing pg_wal if it's a symlink, pg_tblspc has to be
     212             :                  * visited separately with process_symlinks = true.  Note that
     213             :                  * if there are any plain directories in pg_tblspc, they'll
     214             :                  * get fsync'd twice. That's not an expected case so we don't
     215             :                  * worry about optimizing it.
     216             :                  */
     217          30 :                 walkdir(pg_data, fsync_fname, false, exclude_dir);
     218          30 :                 if (xlog_is_symlink)
     219           4 :                     walkdir(pg_wal, fsync_fname, false, NULL);
     220          30 :                 if (sync_data_files)
     221          28 :                     walkdir(pg_tblspc, fsync_fname, true, NULL);
     222             : 
     223          30 :                 if (exclude_dir)
     224           2 :                     pfree(exclude_dir);
     225             :             }
     226          30 :             break;
     227             :     }
     228          32 : }
     229             : 
     230             : /*
     231             :  * Synchronize the given directory and all its contents.
     232             :  *
     233             :  * This is a convenient wrapper on top of walkdir() and do_syncfs().
     234             :  */
     235             : void
     236           8 : sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method)
     237             : {
     238           8 :     switch (sync_method)
     239             :     {
     240           0 :         case DATA_DIR_SYNC_METHOD_SYNCFS:
     241             :             {
     242             : #ifndef HAVE_SYNCFS
     243             :                 pg_log_error("this build does not support sync method \"%s\"",
     244             :                              "syncfs");
     245             :                 exit(EXIT_FAILURE);
     246             : #else
     247             :                 /*
     248             :                  * On Linux, we don't have to open every single file one by
     249             :                  * one.  We can use syncfs() to sync the whole filesystem.
     250             :                  */
     251           0 :                 do_syncfs(dir);
     252             : #endif                          /* HAVE_SYNCFS */
     253             :             }
     254           0 :             break;
     255             : 
     256           8 :         case DATA_DIR_SYNC_METHOD_FSYNC:
     257             :             {
     258             :                 /*
     259             :                  * If possible, hint to the kernel that we're soon going to
     260             :                  * fsync the data directory and its contents.
     261             :                  */
     262             : #ifdef PG_FLUSH_DATA_WORKS
     263           8 :                 walkdir(dir, pre_sync_fname, false, NULL);
     264             : #endif
     265             : 
     266           8 :                 walkdir(dir, fsync_fname, false, NULL);
     267             :             }
     268           8 :             break;
     269             :     }
     270           8 : }
     271             : 
     272             : /*
     273             :  * walkdir: recursively walk a directory, applying the action to each
     274             :  * regular file and directory (including the named directory itself).
     275             :  *
     276             :  * If process_symlinks is true, the action and recursion are also applied
     277             :  * to regular files and directories that are pointed to by symlinks in the
     278             :  * given directory; otherwise symlinks are ignored.  Symlinks are always
     279             :  * ignored in subdirectories, ie we intentionally don't pass down the
     280             :  * process_symlinks flag to recursive calls.
     281             :  *
     282             :  * If exclude_dir is not NULL, it specifies a directory path to skip
     283             :  * processing.
     284             :  *
     285             :  * Errors are reported but not considered fatal.
     286             :  *
     287             :  * See also walkdir in fd.c, which is a backend version of this logic.
     288             :  */
     289             : static void
     290        1704 : walkdir(const char *path,
     291             :         int (*action) (const char *fname, bool isdir),
     292             :         bool process_symlinks,
     293             :         const char *exclude_dir)
     294             : {
     295             :     DIR        *dir;
     296             :     struct dirent *de;
     297             : 
     298        1704 :     if (exclude_dir && strcmp(exclude_dir, path) == 0)
     299           4 :         return;
     300             : 
     301        1700 :     dir = opendir(path);
     302        1700 :     if (dir == NULL)
     303             :     {
     304           0 :         pg_log_error("could not open directory \"%s\": %m", path);
     305           0 :         return;
     306             :     }
     307             : 
     308       64688 :     while (errno = 0, (de = readdir(dir)) != NULL)
     309             :     {
     310             :         char        subpath[MAXPGPATH * 2];
     311             : 
     312       62988 :         if (strcmp(de->d_name, ".") == 0 ||
     313       61288 :             strcmp(de->d_name, "..") == 0)
     314        3400 :             continue;
     315             : 
     316       59588 :         snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
     317             : 
     318       59588 :         switch (get_dirent_type(subpath, de, process_symlinks, PG_LOG_ERROR))
     319             :         {
     320       58012 :             case PGFILETYPE_REG:
     321       58012 :                 (*action) (subpath, false);
     322       58012 :                 break;
     323        1564 :             case PGFILETYPE_DIR:
     324        1564 :                 walkdir(subpath, action, false, exclude_dir);
     325        1564 :                 break;
     326          12 :             default:
     327             : 
     328             :                 /*
     329             :                  * Errors are already reported directly by get_dirent_type(),
     330             :                  * and any remaining symlinks and unknown file types are
     331             :                  * ignored.
     332             :                  */
     333          12 :                 break;
     334             :         }
     335             :     }
     336             : 
     337        1700 :     if (errno)
     338           0 :         pg_log_error("could not read directory \"%s\": %m", path);
     339             : 
     340        1700 :     (void) closedir(dir);
     341             : 
     342             :     /*
     343             :      * It's important to fsync the destination directory itself as individual
     344             :      * file fsyncs don't guarantee that the directory entry for the file is
     345             :      * synced.  Recent versions of ext4 have made the window much wider but
     346             :      * it's been an issue for ext3 and other filesystems in the past.
     347             :      */
     348        1700 :     (*action) (path, true);
     349             : }
     350             : 
     351             : /*
     352             :  * Hint to the OS that it should get ready to fsync() this file, if supported
     353             :  * by the platform.
     354             :  *
     355             :  * Ignores errors trying to open unreadable files, and reports other errors
     356             :  * non-fatally.
     357             :  */
     358             : int
     359       29856 : pre_sync_fname(const char *fname, bool isdir)
     360             : {
     361             : #ifdef PG_FLUSH_DATA_WORKS
     362             :     int         fd;
     363             : 
     364       29856 :     fd = open(fname, O_RDONLY | PG_BINARY, 0);
     365             : 
     366       29856 :     if (fd < 0)
     367             :     {
     368           0 :         if (errno == EACCES || (isdir && errno == EISDIR))
     369           0 :             return 0;
     370           0 :         pg_log_error("could not open file \"%s\": %m", fname);
     371           0 :         return -1;
     372             :     }
     373             : 
     374             :     /*
     375             :      * We do what pg_flush_data() would do in the backend: prefer to use
     376             :      * sync_file_range, but fall back to posix_fadvise.  We ignore errors
     377             :      * because this is only a hint.
     378             :      */
     379             : #if defined(HAVE_SYNC_FILE_RANGE)
     380       29856 :     (void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
     381             : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
     382             :     (void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
     383             : #else
     384             : #error PG_FLUSH_DATA_WORKS should not have been defined
     385             : #endif
     386             : 
     387       29856 :     (void) close(fd);
     388             : #endif                          /* PG_FLUSH_DATA_WORKS */
     389       29856 :     return 0;
     390             : }
     391             : 
     392             : /*
     393             :  * fsync_fname -- Try to fsync a file or directory
     394             :  *
     395             :  * Ignores errors trying to open unreadable files, or trying to fsync
     396             :  * directories on systems where that isn't allowed/required.  All other errors
     397             :  * are fatal.
     398             :  */
     399             : int
     400       29954 : fsync_fname(const char *fname, bool isdir)
     401             : {
     402             :     int         fd;
     403             :     int         flags;
     404             :     int         returncode;
     405             : 
     406             :     /*
     407             :      * Some OSs require directories to be opened read-only whereas other
     408             :      * systems don't allow us to fsync files opened read-only; so we need both
     409             :      * cases here.  Using O_RDWR will cause us to fail to fsync files that are
     410             :      * not writable by our userid, but we assume that's OK.
     411             :      */
     412       29954 :     flags = PG_BINARY;
     413       29954 :     if (!isdir)
     414       29056 :         flags |= O_RDWR;
     415             :     else
     416         898 :         flags |= O_RDONLY;
     417             : 
     418             :     /*
     419             :      * Open the file, silently ignoring errors about unreadable files (or
     420             :      * unsupported operations, e.g. opening a directory under Windows), and
     421             :      * logging others.
     422             :      */
     423       29954 :     fd = open(fname, flags, 0);
     424       29954 :     if (fd < 0)
     425             :     {
     426           0 :         if (errno == EACCES || (isdir && errno == EISDIR))
     427           0 :             return 0;
     428           0 :         pg_log_error("could not open file \"%s\": %m", fname);
     429           0 :         return -1;
     430             :     }
     431             : 
     432       29954 :     returncode = fsync(fd);
     433             : 
     434             :     /*
     435             :      * Some OSes don't allow us to fsync directories at all, so we can ignore
     436             :      * those errors. Anything else needs to be reported.
     437             :      */
     438       29954 :     if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
     439             :     {
     440           0 :         pg_log_error("could not fsync file \"%s\": %m", fname);
     441           0 :         (void) close(fd);
     442           0 :         exit(EXIT_FAILURE);
     443             :     }
     444             : 
     445       29954 :     (void) close(fd);
     446       29954 :     return 0;
     447             : }
     448             : 
     449             : /*
     450             :  * fsync_parent_path -- fsync the parent path of a file or directory
     451             :  *
     452             :  * This is aimed at making file operations persistent on disk in case of
     453             :  * an OS crash or power failure.
     454             :  */
     455             : int
     456          34 : fsync_parent_path(const char *fname)
     457             : {
     458             :     char        parentpath[MAXPGPATH];
     459             : 
     460          34 :     strlcpy(parentpath, fname, MAXPGPATH);
     461          34 :     get_parent_directory(parentpath);
     462             : 
     463             :     /*
     464             :      * get_parent_directory() returns an empty string if the input argument is
     465             :      * just a file name (see comments in path.c), so handle that as being the
     466             :      * current directory.
     467             :      */
     468          34 :     if (strlen(parentpath) == 0)
     469           0 :         strlcpy(parentpath, ".", MAXPGPATH);
     470             : 
     471          34 :     if (fsync_fname(parentpath, true) != 0)
     472           0 :         return -1;
     473             : 
     474          34 :     return 0;
     475             : }
     476             : 
     477             : /*
     478             :  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
     479             :  *
     480             :  * Wrapper around rename, similar to the backend version.
     481             :  */
     482             : int
     483           6 : durable_rename(const char *oldfile, const char *newfile)
     484             : {
     485             :     int         fd;
     486             : 
     487             :     /*
     488             :      * First fsync the old and target path (if it exists), to ensure that they
     489             :      * are properly persistent on disk. Syncing the target file is not
     490             :      * strictly necessary, but it makes it easier to reason about crashes;
     491             :      * because it's then guaranteed that either source or target file exists
     492             :      * after a crash.
     493             :      */
     494           6 :     if (fsync_fname(oldfile, false) != 0)
     495           0 :         return -1;
     496             : 
     497           6 :     fd = open(newfile, PG_BINARY | O_RDWR, 0);
     498           6 :     if (fd < 0)
     499             :     {
     500           6 :         if (errno != ENOENT)
     501             :         {
     502           0 :             pg_log_error("could not open file \"%s\": %m", newfile);
     503           0 :             return -1;
     504             :         }
     505             :     }
     506             :     else
     507             :     {
     508           0 :         if (fsync(fd) != 0)
     509             :         {
     510           0 :             pg_log_error("could not fsync file \"%s\": %m", newfile);
     511           0 :             close(fd);
     512           0 :             exit(EXIT_FAILURE);
     513             :         }
     514           0 :         close(fd);
     515             :     }
     516             : 
     517             :     /* Time to do the real deal... */
     518           6 :     if (rename(oldfile, newfile) != 0)
     519             :     {
     520           0 :         pg_log_error("could not rename file \"%s\" to \"%s\": %m",
     521             :                      oldfile, newfile);
     522           0 :         return -1;
     523             :     }
     524             : 
     525             :     /*
     526             :      * To guarantee renaming the file is persistent, fsync the file with its
     527             :      * new name, and its containing directory.
     528             :      */
     529           6 :     if (fsync_fname(newfile, false) != 0)
     530           0 :         return -1;
     531             : 
     532           6 :     if (fsync_parent_path(newfile) != 0)
     533           0 :         return -1;
     534             : 
     535           6 :     return 0;
     536             : }
     537             : 
     538             : #endif                          /* FRONTEND */
     539             : 
     540             : /*
     541             :  * Return the type of a directory entry.
     542             :  *
     543             :  * In frontend code, elevel should be a level from logging.h; in backend code
     544             :  * it should be a level from elog.h.
     545             :  */
     546             : PGFileType
     547      470614 : get_dirent_type(const char *path,
     548             :                 const struct dirent *de,
     549             :                 bool look_through_symlinks,
     550             :                 int elevel)
     551             : {
     552             :     PGFileType  result;
     553             : 
     554             :     /*
     555             :      * Some systems tell us the type directly in the dirent struct, but that's
     556             :      * a BSD and Linux extension not required by POSIX.  Even when the
     557             :      * interface is present, sometimes the type is unknown, depending on the
     558             :      * filesystem.
     559             :      */
     560             : #if defined(DT_REG) && defined(DT_DIR) && defined(DT_LNK)
     561      470614 :     if (de->d_type == DT_REG)
     562      462426 :         result = PGFILETYPE_REG;
     563        8188 :     else if (de->d_type == DT_DIR)
     564        8118 :         result = PGFILETYPE_DIR;
     565          70 :     else if (de->d_type == DT_LNK && !look_through_symlinks)
     566          66 :         result = PGFILETYPE_LNK;
     567             :     else
     568           4 :         result = PGFILETYPE_UNKNOWN;
     569             : #else
     570             :     result = PGFILETYPE_UNKNOWN;
     571             : #endif
     572             : 
     573      470614 :     if (result == PGFILETYPE_UNKNOWN)
     574             :     {
     575             :         struct stat fst;
     576             :         int         sret;
     577             : 
     578             : 
     579           4 :         if (look_through_symlinks)
     580           4 :             sret = stat(path, &fst);
     581             :         else
     582           0 :             sret = lstat(path, &fst);
     583             : 
     584           4 :         if (sret < 0)
     585             :         {
     586           0 :             result = PGFILETYPE_ERROR;
     587             : #ifdef FRONTEND
     588           0 :             pg_log_generic(elevel, PG_LOG_PRIMARY, "could not stat file \"%s\": %m", path);
     589             : #else
     590           0 :             ereport(elevel,
     591             :                     (errcode_for_file_access(),
     592             :                      errmsg("could not stat file \"%s\": %m", path)));
     593             : #endif
     594             :         }
     595           4 :         else if (S_ISREG(fst.st_mode))
     596           0 :             result = PGFILETYPE_REG;
     597           4 :         else if (S_ISDIR(fst.st_mode))
     598           4 :             result = PGFILETYPE_DIR;
     599           0 :         else if (S_ISLNK(fst.st_mode))
     600           0 :             result = PGFILETYPE_LNK;
     601             :     }
     602             : 
     603      470614 :     return result;
     604             : }
     605             : 
     606             : /*
     607             :  * Compute what remains to be done after a possibly partial vectored read or
     608             :  * write.  The part of 'source' beginning after 'transferred' bytes is copied
     609             :  * to 'destination', and its length in entries is returned ('iovcnt' must be
     610             :  * positive).  'source' and 'destination' may point to the same array, for
     611             :  * in-place adjustment.  Zero means completion (for callers with no cheaper test).
     612             :  */
     613             : int
     614      449690 : compute_remaining_iovec(struct iovec *destination,
     615             :                         const struct iovec *source,
     616             :                         int iovcnt,
     617             :                         size_t transferred)
     618             : {
     619             :     Assert(iovcnt > 0);
     620             : 
     621             :     /* Skip wholly transferred iovecs (zero-length ones are skipped too). */
     622     5674764 :     while (source->iov_len <= transferred)
     623             :     {
     624     5674764 :         transferred -= source->iov_len;
     625     5674764 :         source++;
     626     5674764 :         iovcnt--;
     627             : 
     628             :         /* All iovecs transferred?  Then 'destination' is left untouched. */
     629     5674764 :         if (iovcnt == 0)
     630             :         {
     631             :             /*
     632             :              * We don't expect the kernel to transfer more than we asked it
     633             :              * to, or something is out of sync.
     634             :              */
     635             :             Assert(transferred == 0);
     636      449690 :             return 0;
     637             :         }
     638             :     }
     639             : 
     640             :     /* Copy the remaining iovecs to the front of the array (ranges may overlap). */
     641           0 :     if (source != destination)
     642           0 :         memmove(destination, source, sizeof(*source) * iovcnt);
     643             : 
     644             :     /* Adjust leading iovec, which may have been partially transferred. */
     645             :     Assert(destination->iov_len > transferred);
     646           0 :     destination->iov_base = (char *) destination->iov_base + transferred;
     647           0 :     destination->iov_len -= transferred;
     648             : 
     649           0 :     return iovcnt;
     650             : }
     651             : 
     652             : /*
     653             :  * pg_pwritev_with_retry
     654             :  *
     655             :  * Convenience wrapper for pg_pwritev() that retries on partial write.  Returns
     656             :  * the total bytes written, or -1 on error (how much was written is then unspecified).
     657             :  */
     658             : ssize_t
     659      449690 : pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
     660             : {
     661             :     struct iovec iov_copy[PG_IOV_MAX];
     662      449690 :     ssize_t     sum = 0;
     663             :     ssize_t     part;
     664             : 
     665             :     /* We'd better have space to make a copy, in case we need to retry; else EINVAL. */
     666      449690 :     if (iovcnt > PG_IOV_MAX)
     667             :     {
     668           0 :         errno = EINVAL;
     669           0 :         return -1;
     670             :     }
     671             : 
     672             :     do
     673             :     {
     674             :         /* Write as much as we can; give up on the first failure. */
     675      449690 :         part = pg_pwritev(fd, iov, iovcnt, offset);
     676      449690 :         if (part < 0)
     677           0 :             return -1;
     678             : 
     679             : #ifdef SIMULATE_SHORT_WRITE
     680             :         part = Min(part, 4096);     /* count at most 4096 bytes per call, forcing retries */
     681             : #endif
     682             : 
     683             :         /* Count our progress.  NOTE(review): part == 0 is not treated as an error. */
     684      449690 :         sum += part;
     685      449690 :         offset += part;
     686             : 
     687             :         /*
     688             :          * See what is left.  On the first loop we used the caller's array,
     689             :          * but in later loops we'll use our local copy that we are allowed to
     690             :          * mutate.
     691             :          */
     692      449690 :         iovcnt = compute_remaining_iovec(iov_copy, iov, iovcnt, part);
     693      449690 :         iov = iov_copy;
     694      449690 :     } while (iovcnt > 0);
     695             : 
     696      449690 :     return sum;
     697             : }
     698             : 
     699             : /*
     700             :  * pg_pwrite_zeros
     701             :  *
     702             :  * Writes "size" bytes of zeros to the file at "offset" (from the start of the
     703             :  * file), using vectored I/O.
     704             :  *
     705             :  * Returns the total amount of data written.  On failure, a negative value is
     706             :  * returned with errno set; earlier batches may already have been written.
     707             :  */
     708             : ssize_t
     709      411530 : pg_pwrite_zeros(int fd, size_t size, off_t offset)
     710             : {
     711             :     static const PGIOAlignedBlock zbuffer = {0};    /* worth BLCKSZ */
     712      411530 :     void       *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data;
     713             :     struct iovec iov[PG_IOV_MAX];
     714      411530 :     size_t      remaining_size = size;
     715      411530 :     ssize_t     total_written = 0;
     716             : 
     717             :     /* Loop, writing up to PG_IOV_MAX blocks per call; all share the one zero buffer. */
     718      861220 :     while (remaining_size > 0)
     719             :     {
     720      449690 :         int         iovcnt = 0;
     721             :         ssize_t     written;
     722             : 
     723     6124454 :         for (; iovcnt < PG_IOV_MAX && remaining_size > 0; iovcnt++)
     724             :         {
     725             :             size_t      this_iov_size;
     726             : 
     727     5674764 :             iov[iovcnt].iov_base = zerobuf_addr;
     728             : 
     729     5674764 :             if (remaining_size < BLCKSZ)    /* final, partial block */
     730           0 :                 this_iov_size = remaining_size;
     731             :             else
     732     5674764 :                 this_iov_size = BLCKSZ;
     733             : 
     734     5674764 :             iov[iovcnt].iov_len = this_iov_size;
     735     5674764 :             remaining_size -= this_iov_size;
     736             :         }
     737             : 
     738      449690 :         written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
     739             : 
     740      449690 :         if (written < 0)
     741           0 :             return written;
     742             : 
     743      449690 :         offset += written;
     744      449690 :         total_written += written;
     745             :     }
     746             : 
     747             :     Assert(total_written == size);
     748             : 
     749      411530 :     return total_written;
     750             : }

Generated by: LCOV version 1.14