LCOV - code coverage report
Current view: top level - src/common - file_utils.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 147 206 71.4 %
Date: 2025-01-18 04:15:08 Functions: 12 12 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * File-processing utility routines.
       4             :  *
       5             :  * Assorted utility functions to work on files.
       6             :  *
       7             :  *
       8             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       9             :  * Portions Copyright (c) 1994, Regents of the University of California
      10             :  *
      11             :  * src/common/file_utils.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #ifndef FRONTEND
      17             : #include "postgres.h"
      18             : #else
      19             : #include "postgres_fe.h"
      20             : #endif
      21             : 
      22             : #include <dirent.h>
      23             : #include <fcntl.h>
      24             : #include <sys/stat.h>
      25             : #include <unistd.h>
      26             : 
      27             : #include "common/file_utils.h"
      28             : #ifdef FRONTEND
      29             : #include "common/logging.h"
      30             : #endif
      31             : #include "common/relpath.h"
      32             : #include "port/pg_iovec.h"
      33             : 
      34             : #ifdef FRONTEND
      35             : 
      36             : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data: sync_file_range(), or posix_fadvise() with POSIX_FADV_DONTNEED */
      37             : #if defined(HAVE_SYNC_FILE_RANGE)
      38             : #define PG_FLUSH_DATA_WORKS 1
      39             : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
      40             : #define PG_FLUSH_DATA_WORKS 1
      41             : #endif
      42             : 
      43             : /*
      44             :  * pg_xlog has been renamed to pg_wal in version 10.  sync_pgdata() compares its serverVersion argument against this value to pick the directory name.
      45             :  */
      46             : #define MINIMUM_VERSION_FOR_PG_WAL  100000
      47             : 
      48             : #ifdef PG_FLUSH_DATA_WORKS
      49             : static int  pre_sync_fname(const char *fname, bool isdir);	/* only built when a flush hint is available */
      50             : #endif
      51             : static void walkdir(const char *path,
      52             :                     int (*action) (const char *fname, bool isdir),
      53             :                     bool process_symlinks);
      54             : 
      55             : #ifdef HAVE_SYNCFS
      56             : 
      57             : /*
      58             :  * do_syncfs -- Try to syncfs a file system, given a path that is opened read-only to obtain a descriptor
      59             :  *
      60             :  * Reports errors trying to open the path and then returns without syncing.  syncfs() errors are fatal: they are reported and the program exits.
      61             :  */
      62             : static void
      63           4 : do_syncfs(const char *path)
      64             : {
      65             :     int         fd;
      66             : 
      67           4 :     fd = open(path, O_RDONLY, 0);
      68             : 
      69           4 :     if (fd < 0)
      70             :     {
      71           0 :         pg_log_error("could not open file \"%s\": %m", path);
      72           0 :         return;					/* non-fatal: nothing is synced for this path */
      73             :     }
      74             : 
      75           4 :     if (syncfs(fd) < 0)
      76             :     {
      77           0 :         pg_log_error("could not synchronize file system for file \"%s\": %m", path);
      78           0 :         (void) close(fd);
      79           0 :         exit(EXIT_FAILURE);
      80             :     }
      81             : 
      82           4 :     (void) close(fd);		/* close result deliberately ignored */
      83             : }
      84             : 
      85             : #endif                          /* HAVE_SYNCFS */
      86             : 
      87             : /*
      88             :  * Synchronize PGDATA and all its contents.
      89             :  *
      90             :  * We sync regular files and directories wherever they are, but we follow
      91             :  * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc.
      92             :  * Other symlinks are presumed to point at files we're not responsible for
      93             :  * syncing, and might not have privileges to write at all.
      94             :  *
      95             :  * serverVersion indicates the version of the server to be sync'd; it decides whether the WAL directory is named pg_xlog or pg_wal.  sync_method selects between syncfs() on each directory (DATA_DIR_SYNC_METHOD_SYNCFS) and a recursive walk applying fsync_fname() (DATA_DIR_SYNC_METHOD_FSYNC).
      96             :  */
      97             : void
      98          30 : sync_pgdata(const char *pg_data,
      99             :             int serverVersion,
     100             :             DataDirSyncMethod sync_method)
     101             : {
     102             :     bool        xlog_is_symlink;
     103             :     char        pg_wal[MAXPGPATH];
     104             :     char        pg_tblspc[MAXPGPATH];
     105             : 
     106             :     /* handle renaming of pg_xlog to pg_wal in post-10 clusters */
     107          30 :     snprintf(pg_wal, MAXPGPATH, "%s/%s", pg_data,
     108             :              serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal");
     109          30 :     snprintf(pg_tblspc, MAXPGPATH, "%s/%s", pg_data, PG_TBLSPC_DIR);
     110             : 
     111             :     /*
     112             :      * If pg_wal is a symlink, we'll need to recurse into it separately,
     113             :      * because the first walkdir below will ignore it.
     114             :      */
     115          30 :     xlog_is_symlink = false;
     116             : 
     117             :     {
     118             :         struct stat st;
     119             : 
     120          30 :         if (lstat(pg_wal, &st) < 0)
     121           0 :             pg_log_error("could not stat file \"%s\": %m", pg_wal);	/* reported only; xlog_is_symlink stays false */
     122          30 :         else if (S_ISLNK(st.st_mode))
     123           4 :             xlog_is_symlink = true;
     124             :     }
     125             : 
     126          30 :     switch (sync_method)		/* no default case: any other value does nothing */
     127             :     {
     128           2 :         case DATA_DIR_SYNC_METHOD_SYNCFS:
     129             :             {
     130             : #ifndef HAVE_SYNCFS
     131             :                 pg_log_error("this build does not support sync method \"%s\"",
     132             :                              "syncfs");
     133             :                 exit(EXIT_FAILURE);
     134             : #else
     135             :                 DIR        *dir;
     136             :                 struct dirent *de;
     137             : 
     138             :                 /*
     139             :                  * On Linux, we don't have to open every single file one by
     140             :                  * one.  We can use syncfs() to sync whole filesystems.  We
     141             :                  * only expect filesystem boundaries to exist where we
     142             :                  * tolerate symlinks, namely pg_wal and the tablespaces, so we
     143             :                  * call syncfs() for each of those directories.
     144             :                  */
     145             : 
     146             :                 /* Sync the top level pgdata directory. */
     147           2 :                 do_syncfs(pg_data);
     148             : 
     149             :                 /* If any tablespaces are configured, sync each of those. */
     150           2 :                 dir = opendir(pg_tblspc);
     151           2 :                 if (dir == NULL)
     152           0 :                     pg_log_error("could not open directory \"%s\": %m",
     153             :                                  pg_tblspc);
     154             :                 else
     155             :                 {
     156           6 :                     while (errno = 0, (de = readdir(dir)) != NULL)	/* errno is cleared before each readdir() so the check after the loop only sees a read error */
     157             :                     {
     158             :                         char        subpath[MAXPGPATH * 2];
     159             : 
     160           4 :                         if (strcmp(de->d_name, ".") == 0 ||
     161           2 :                             strcmp(de->d_name, "..") == 0)
     162           4 :                             continue;
     163             : 
     164           0 :                         snprintf(subpath, sizeof(subpath), "%s/%s",
     165           0 :                                  pg_tblspc, de->d_name);
     166           0 :                         do_syncfs(subpath);
     167             :                     }
     168             : 
     169           2 :                     if (errno)
     170           0 :                         pg_log_error("could not read directory \"%s\": %m",
     171             :                                      pg_tblspc);
     172             : 
     173           2 :                     (void) closedir(dir);
     174             :                 }
     175             : 
     176             :                 /* If pg_wal is a symlink, process that too. */
     177           2 :                 if (xlog_is_symlink)
     178           2 :                     do_syncfs(pg_wal);
     179             : #endif                          /* HAVE_SYNCFS */
     180             :             }
     181           2 :             break;
     182             : 
     183          28 :         case DATA_DIR_SYNC_METHOD_FSYNC:
     184             :             {
     185             :                 /*
     186             :                  * If possible, hint to the kernel that we're soon going to
     187             :                  * fsync the data directory and its contents.
     188             :                  */
     189             : #ifdef PG_FLUSH_DATA_WORKS
     190          28 :                 walkdir(pg_data, pre_sync_fname, false);
     191          28 :                 if (xlog_is_symlink)
     192           2 :                     walkdir(pg_wal, pre_sync_fname, false);
     193          28 :                 walkdir(pg_tblspc, pre_sync_fname, true);
     194             : #endif
     195             : 
     196             :                 /*
     197             :                  * Now we do the fsync()s in the same order.
     198             :                  *
     199             :                  * The main call ignores symlinks, so in addition to specially
     200             :                  * processing pg_wal if it's a symlink, pg_tblspc has to be
     201             :                  * visited separately with process_symlinks = true.  Note that
     202             :                  * if there are any plain directories in pg_tblspc, they'll
     203             :                  * get fsync'd twice. That's not an expected case so we don't
     204             :                  * worry about optimizing it.
     205             :                  */
     206          28 :                 walkdir(pg_data, fsync_fname, false);
     207          28 :                 if (xlog_is_symlink)
     208           2 :                     walkdir(pg_wal, fsync_fname, false);
     209          28 :                 walkdir(pg_tblspc, fsync_fname, true);
     210             :             }
     211          28 :             break;
     212             :     }
     213          30 : }
     214             : 
     215             : /*
     216             :  * Synchronize the given directory and all its contents.
     217             :  *
     218             :  * This is a convenient wrapper on top of walkdir() and do_syncfs(); sync_method picks which of the two is used.  In the fsync case symlinks inside dir are not followed.
     219             :  */
     220             : void
     221           8 : sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method)
     222             : {
     223           8 :     switch (sync_method)		/* no default case: any other value does nothing */
     224             :     {
     225           0 :         case DATA_DIR_SYNC_METHOD_SYNCFS:
     226             :             {
     227             : #ifndef HAVE_SYNCFS
     228             :                 pg_log_error("this build does not support sync method \"%s\"",
     229             :                              "syncfs");
     230             :                 exit(EXIT_FAILURE);
     231             : #else
     232             :                 /*
     233             :                  * On Linux, we don't have to open every single file one by
     234             :                  * one.  We can use syncfs() to sync the whole filesystem.
     235             :                  */
     236           0 :                 do_syncfs(dir);
     237             : #endif                          /* HAVE_SYNCFS */
     238             :             }
     239           0 :             break;
     240             : 
     241           8 :         case DATA_DIR_SYNC_METHOD_FSYNC:
     242             :             {
     243             :                 /*
     244             :                  * If possible, hint to the kernel that we're soon going to
     245             :                  * fsync the data directory and its contents.
     246             :                  */
     247             : #ifdef PG_FLUSH_DATA_WORKS
     248           8 :                 walkdir(dir, pre_sync_fname, false);
     249             : #endif
     250             : 
     251           8 :                 walkdir(dir, fsync_fname, false);	/* second pass over the same tree performs the actual fsync()s */
     252             :             }
     253           8 :             break;
     254             :     }
     255           8 : }
     256             : 
     257             : /*
     258             :  * walkdir: recursively walk a directory, applying the action to each
     259             :  * regular file and directory (including the named directory itself).
     260             :  *
     261             :  * If process_symlinks is true, the action and recursion are also applied
     262             :  * to regular files and directories that are pointed to by symlinks in the
     263             :  * given directory; otherwise symlinks are ignored.  Symlinks are always
     264             :  * ignored in subdirectories, ie we intentionally don't pass down the
     265             :  * process_symlinks flag to recursive calls.
     266             :  *
     267             :  * Errors opening or reading a directory are reported but not considered fatal, and the return value of action is ignored (the action itself may still exit, as fsync_fname does on an fsync failure).
     268             :  *
     269             :  * See also walkdir in fd.c, which is a backend version of this logic.
     270             :  */
     271             : static void
     272        1608 : walkdir(const char *path,
     273             :         int (*action) (const char *fname, bool isdir),
     274             :         bool process_symlinks)
     275             : {
     276             :     DIR        *dir;
     277             :     struct dirent *de;
     278             : 
     279        1608 :     dir = opendir(path);
     280        1608 :     if (dir == NULL)
     281             :     {
     282           0 :         pg_log_error("could not open directory \"%s\": %m", path);
     283           0 :         return;					/* the action is not applied to path itself in this case */
     284             :     }
     285             : 
     286       63992 :     while (errno = 0, (de = readdir(dir)) != NULL)	/* errno is cleared before each readdir() so the check after the loop only sees a read error */
     287             :     {
     288             :         char        subpath[MAXPGPATH * 2];
     289             : 
     290       62384 :         if (strcmp(de->d_name, ".") == 0 ||
     291       60776 :             strcmp(de->d_name, "..") == 0)
     292        3216 :             continue;
     293             : 
     294       59168 :         snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
     295             : 
     296       59168 :         switch (get_dirent_type(subpath, de, process_symlinks, PG_LOG_ERROR))
     297             :         {
     298       57684 :             case PGFILETYPE_REG:
     299       57684 :                 (*action) (subpath, false);
     300       57684 :                 break;
     301        1476 :             case PGFILETYPE_DIR:
     302        1476 :                 walkdir(subpath, action, false);	/* process_symlinks is never propagated to subdirectories */
     303        1476 :                 break;
     304           8 :             default:
     305             : 
     306             :                 /*
     307             :                  * Errors are already reported directly by get_dirent_type(),
     308             :                  * and any remaining symlinks and unknown file types are
     309             :                  * ignored.
     310             :                  */
     311           8 :                 break;
     312             :         }
     313             :     }
     314             : 
     315        1608 :     if (errno)
     316           0 :         pg_log_error("could not read directory \"%s\": %m", path);
     317             : 
     318        1608 :     (void) closedir(dir);
     319             : 
     320             :     /*
     321             :      * It's important to fsync the destination directory itself as individual
     322             :      * file fsyncs don't guarantee that the directory entry for the file is
     323             :      * synced.  Recent versions of ext4 have made the window much wider but
     324             :      * it's been an issue for ext3 and other filesystems in the past.
     325             :      */
     326        1608 :     (*action) (path, true);
     327             : }
     328             : 
     329             : /*
     330             :  * Hint to the OS that it should get ready to fsync() this file.
     331             :  *
     332             :  * Ignores errors trying to open unreadable files, and reports other errors
     333             :  * non-fatally.  Returns -1 if the file could not be opened for a reason that is not ignored, and 0 otherwise (errors from the hint call itself are ignored).
     334             :  */
     335             : #ifdef PG_FLUSH_DATA_WORKS
     336             : 
     337             : static int
     338       29646 : pre_sync_fname(const char *fname, bool isdir)
     339             : {
     340             :     int         fd;
     341             : 
     342       29646 :     fd = open(fname, O_RDONLY | PG_BINARY, 0);
     343             : 
     344       29646 :     if (fd < 0)
     345             :     {
     346           0 :         if (errno == EACCES || (isdir && errno == EISDIR))	/* EISDIR is tolerated only when the caller said this is a directory */
     347           0 :             return 0;
     348           0 :         pg_log_error("could not open file \"%s\": %m", fname);
     349           0 :         return -1;
     350             :     }
     351             : 
     352             :     /*
     353             :      * We do what pg_flush_data() would do in the backend: prefer to use
     354             :      * sync_file_range, but fall back to posix_fadvise.  We ignore errors
     355             :      * because this is only a hint.
     356             :      */
     357             : #if defined(HAVE_SYNC_FILE_RANGE)
     358       29646 :     (void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
     359             : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
     360             :     (void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
     361             : #else
     362             : #error PG_FLUSH_DATA_WORKS should not have been defined
     363             : #endif
     364             : 
     365       29646 :     (void) close(fd);
     366       29646 :     return 0;
     367             : }
     368             : 
     369             : #endif                          /* PG_FLUSH_DATA_WORKS */
     370             : 
     371             : /*
     372             :  * fsync_fname -- Try to fsync a file or directory
     373             :  *
     374             :  * Ignores errors trying to open unreadable files, or trying to fsync
     375             :  * directories on systems where that isn't allowed/required.  Other errors from open() are logged and -1 is returned.
     376             :  * All other fsync() errors are fatal: they are logged and the program exits.  Returns 0 on success or when the error is ignored.
     377             :  */
     378             : int
     379       29732 : fsync_fname(const char *fname, bool isdir)
     380             : {
     381             :     int         fd;
     382             :     int         flags;
     383             :     int         returncode;
     384             : 
     385             :     /*
     386             :      * Some OSs require directories to be opened read-only whereas other
     387             :      * systems don't allow us to fsync files opened read-only; so we need both
     388             :      * cases here.  Using O_RDWR will cause us to fail to fsync files that are
     389             :      * not writable by our userid, but we assume that's OK.
     390             :      */
     391       29732 :     flags = PG_BINARY;
     392       29732 :     if (!isdir)
     393       28892 :         flags |= O_RDWR;
     394             :     else
     395         840 :         flags |= O_RDONLY;
     396             : 
     397             :     /*
     398             :      * Open the file, silently ignoring errors about unreadable files (or
     399             :      * unsupported operations, e.g. opening a directory under Windows), and
     400             :      * logging others.
     401             :      */
     402       29732 :     fd = open(fname, flags, 0);
     403       29732 :     if (fd < 0)
     404             :     {
     405           0 :         if (errno == EACCES || (isdir && errno == EISDIR))
     406           0 :             return 0;			/* ignored failure is reported to the caller as success */
     407           0 :         pg_log_error("could not open file \"%s\": %m", fname);
     408           0 :         return -1;
     409             :     }
     410             : 
     411       29732 :     returncode = fsync(fd);
     412             : 
     413             :     /*
     414             :      * Some OSes don't allow us to fsync directories at all, so we can ignore
     415             :      * those errors. Anything else needs to be reported.
     416             :      */
     417       29732 :     if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))	/* EBADF and EINVAL are tolerated for directories only */
     418             :     {
     419           0 :         pg_log_error("could not fsync file \"%s\": %m", fname);
     420           0 :         (void) close(fd);
     421           0 :         exit(EXIT_FAILURE);
     422             :     }
     423             : 
     424       29732 :     (void) close(fd);
     425       29732 :     return 0;
     426             : }
     427             : 
     428             : /*
     429             :  * fsync_parent_path -- fsync the parent path of a file or directory
     430             :  *
     431             :  * This is aimed at making file operations persistent on disk in case of
     432             :  * an OS crash or power failure.  Returns 0 on success and -1 if fsync_fname() fails on the parent directory.
     433             :  */
     434             : int
     435          28 : fsync_parent_path(const char *fname)
     436             : {
     437             :     char        parentpath[MAXPGPATH];
     438             : 
     439          28 :     strlcpy(parentpath, fname, MAXPGPATH);	/* work on a local copy, which get_parent_directory() then edits in place */
     440          28 :     get_parent_directory(parentpath);
     441             : 
     442             :     /*
     443             :      * get_parent_directory() returns an empty string if the input argument is
     444             :      * just a file name (see comments in path.c), so handle that as being the
     445             :      * current directory.
     446             :      */
     447          28 :     if (strlen(parentpath) == 0)
     448           0 :         strlcpy(parentpath, ".", MAXPGPATH);
     449             : 
     450          28 :     if (fsync_fname(parentpath, true) != 0)
     451           0 :         return -1;
     452             : 
     453          28 :     return 0;
     454             : }
     455             : 
     456             : /*
     457             :  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
     458             :  *
     459             :  * Wrapper around rename, similar to the backend version.  Returns 0 on success and -1 after a logged, non-fatal failure (open or rename); an fsync() failure is logged and exits the program.
     460             :  */
     461             : int
     462           6 : durable_rename(const char *oldfile, const char *newfile)
     463             : {
     464             :     int         fd;
     465             : 
     466             :     /*
     467             :      * First fsync the old and target path (if it exists), to ensure that they
     468             :      * are properly persistent on disk. Syncing the target file is not
     469             :      * strictly necessary, but it makes it easier to reason about crashes;
     470             :      * because it's then guaranteed that either source or target file exists
     471             :      * after a crash.
     472             :      */
     473           6 :     if (fsync_fname(oldfile, false) != 0)
     474           0 :         return -1;
     475             : 
     476           6 :     fd = open(newfile, PG_BINARY | O_RDWR, 0);
     477           6 :     if (fd < 0)
     478             :     {
     479           6 :         if (errno != ENOENT)	/* a missing target is expected: there is nothing to sync before the rename */
     480             :         {
     481           0 :             pg_log_error("could not open file \"%s\": %m", newfile);
     482           0 :             return -1;
     483             :         }
     484             :     }
     485             :     else
     486             :     {
     487           0 :         if (fsync(fd) != 0)
     488             :         {
     489           0 :             pg_log_error("could not fsync file \"%s\": %m", newfile);
     490           0 :             close(fd);
     491           0 :             exit(EXIT_FAILURE);
     492             :         }
     493           0 :         close(fd);
     494             :     }
     495             : 
     496             :     /* Time to do the real deal... */
     497           6 :     if (rename(oldfile, newfile) != 0)
     498             :     {
     499           0 :         pg_log_error("could not rename file \"%s\" to \"%s\": %m",
     500             :                      oldfile, newfile);
     501           0 :         return -1;
     502             :     }
     503             : 
     504             :     /*
     505             :      * To guarantee renaming the file is persistent, fsync the file with its
     506             :      * new name, and its containing directory.
     507             :      */
     508           6 :     if (fsync_fname(newfile, false) != 0)
     509           0 :         return -1;
     510             : 
     511           6 :     if (fsync_parent_path(newfile) != 0)
     512           0 :         return -1;
     513             : 
     514           6 :     return 0;
     515             : }
     516             : 
     517             : #endif                          /* FRONTEND */
     518             : 
     519             : /*
     520             :  * Return the type of a directory entry: PGFILETYPE_REG, PGFILETYPE_DIR, PGFILETYPE_LNK, or PGFILETYPE_UNKNOWN for anything else.
     521             :  *
     522             :  * In frontend code, elevel should be a level from logging.h; in backend code
     523             :  * it should be a level from elog.h.  It is used only to report a failing stat()/lstat(), for which the result is set to PGFILETYPE_ERROR.  If look_through_symlinks is true, a symlink is resolved with stat() and the type of its target is reported.
     524             :  */
     525             : PGFileType
     526      445408 : get_dirent_type(const char *path,
     527             :                 const struct dirent *de,
     528             :                 bool look_through_symlinks,
     529             :                 int elevel)
     530             : {
     531             :     PGFileType  result;
     532             : 
     533             :     /*
     534             :      * Some systems tell us the type directly in the dirent struct, but that's
     535             :      * a BSD and Linux extension not required by POSIX.  Even when the
     536             :      * interface is present, sometimes the type is unknown, depending on the
     537             :      * filesystem.
     538             :      */
     539             : #if defined(DT_REG) && defined(DT_DIR) && defined(DT_LNK)
     540      445408 :     if (de->d_type == DT_REG)
     541      437488 :         result = PGFILETYPE_REG;
     542        7920 :     else if (de->d_type == DT_DIR)
     543        7854 :         result = PGFILETYPE_DIR;
     544          66 :     else if (de->d_type == DT_LNK && !look_through_symlinks)	/* a symlink we must look through falls to the stat() path below */
     545          62 :         result = PGFILETYPE_LNK;
     546             :     else
     547           4 :         result = PGFILETYPE_UNKNOWN;
     548             : #else
     549             :     result = PGFILETYPE_UNKNOWN;
     550             : #endif
     551             : 
     552      445408 :     if (result == PGFILETYPE_UNKNOWN)
     553             :     {
     554             :         struct stat fst;
     555             :         int         sret;
     556             : 
     557             : 
     558           4 :         if (look_through_symlinks)
     559           4 :             sret = stat(path, &fst);
     560             :         else
     561           0 :             sret = lstat(path, &fst);
     562             : 
     563           4 :         if (sret < 0)
     564             :         {
     565           0 :             result = PGFILETYPE_ERROR;
     566             : #ifdef FRONTEND
     567           0 :             pg_log_generic(elevel, PG_LOG_PRIMARY, "could not stat file \"%s\": %m", path);
     568             : #else
     569           0 :             ereport(elevel,
     570             :                     (errcode_for_file_access(),
     571             :                      errmsg("could not stat file \"%s\": %m", path)));
     572             : #endif
     573             :         }
     574           4 :         else if (S_ISREG(fst.st_mode))
     575           0 :             result = PGFILETYPE_REG;
     576           4 :         else if (S_ISDIR(fst.st_mode))
     577           4 :             result = PGFILETYPE_DIR;
     578           0 :         else if (S_ISLNK(fst.st_mode))
     579           0 :             result = PGFILETYPE_LNK;	/* any other st_mode leaves result as PGFILETYPE_UNKNOWN */
     580             :     }
     581             : 
     582      445408 :     return result;
     583             : }
     584             : 
     585             : /*
     586             :  * Compute what remains to be done after a possibly partial vectored read or
     587             :  * write.  The part of 'source' beginning after 'transferred' bytes is copied
     588             :  * to 'destination', and its length is returned.  'source' and 'destination'
     589             :  * may point to the same array, for in-place adjustment.  A return value of
     590             :  * zero indicates completion (for callers without a cheaper way to know that); in that case 'destination' is left untouched.
     591             :  */
     592             : int
     593      558608 : compute_remaining_iovec(struct iovec *destination,
     594             :                         const struct iovec *source,
     595             :                         int iovcnt,
     596             :                         size_t transferred)
     597             : {
     598             :     Assert(iovcnt > 0);			/* the loop below reads the first iovec unconditionally */
     599             : 
     600             :     /* Skip wholly transferred iovecs. */
     601     5739492 :     while (source->iov_len <= transferred)	/* <= also steps over zero-length iovecs */
     602             :     {
     603     5739492 :         transferred -= source->iov_len;
     604     5739492 :         source++;
     605     5739492 :         iovcnt--;
     606             : 
     607             :         /* All iovecs transferred? */
     608     5739492 :         if (iovcnt == 0)
     609             :         {
     610             :             /*
     611             :              * We don't expect the kernel to transfer more than we asked it
     612             :              * to, or something is out of sync.
     613             :              */
     614             :             Assert(transferred == 0);
     615      558608 :             return 0;
     616             :         }
     617             :     }
     618             : 
     619             :     /* Copy the remaining iovecs to the front of the array. */
     620           0 :     if (source != destination)
     621           0 :         memmove(destination, source, sizeof(*source) * iovcnt);	/* memmove because the two ranges can overlap when adjusting in place */
     622             : 
     623             :     /* Adjust leading iovec, which may have been partially transferred. */
     624             :     Assert(destination->iov_len > transferred);
     625           0 :     destination->iov_base = (char *) destination->iov_base + transferred;
     626           0 :     destination->iov_len -= transferred;
     627             : 
     628           0 :     return iovcnt;
     629             : }
     630             : 
     631             : /*
     632             :  * pg_pwritev_with_retry
     633             :  *
     634             :  * Convenience wrapper for pg_pwritev() that retries on partial write.  If an
     635             :  * error is returned, it is unspecified how much has been written.
     636             :  */
     637             : ssize_t     /* total bytes written, or -1 on failure */
     638      558608 : pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
     639             : {
     640             :     struct iovec iov_copy[PG_IOV_MAX];  /* mutable copy of the unwritten tail */
     641      558608 :     ssize_t     sum = 0;
     642             :     ssize_t     part;
     643             : 
     644             :     /* We'd better have space to make a copy, in case we need to retry. */
     645      558608 :     if (iovcnt > PG_IOV_MAX)
     646             :     {
     647           0 :         errno = EINVAL;     /* more entries than iov_copy[] can hold */
     648           0 :         return -1;
     649             :     }
     650             : 
     651             :     do
     652             :     {
     653             :         /* Write as much as we can. */
     654      558608 :         part = pg_pwritev(fd, iov, iovcnt, offset);
     655      558608 :         if (part < 0)
     656           0 :             return -1;      /* errno is left as pg_pwritev() set it */
     657             :         /* NOTE(review): part == 0 with data remaining would retry forever. */
     658             : #ifdef SIMULATE_SHORT_WRITE
     659             :         part = Min(part, 4096);     /* pretend at most 4096 bytes were written */
     660             : #endif
     661             : 
     662             :         /* Count our progress. */
     663      558608 :         sum += part;
     664      558608 :         offset += part;
     665             : 
     666             :         /*
     667             :          * See what is left.  On the first loop we used the caller's array,
     668             :          * but in later loops we'll use our local copy that we are allowed to
     669             :          * mutate.
     670             :          */
     671      558608 :         iovcnt = compute_remaining_iovec(iov_copy, iov, iovcnt, part);
     672      558608 :         iov = iov_copy;
     673      558608 :     } while (iovcnt > 0);   /* compute_remaining_iovec() returns 0 once all is written */
     674             : 
     675      558608 :     return sum;
     676             : }
     677             : 
     678             : /*
     679             :  * pg_pwrite_zeros
     680             :  *
     681             :  * Writes "size" bytes of zeros to the file at "offset" (from the start of the
     682             :  * file), using vectored I/O in batches of up to PG_IOV_MAX iovecs.
     683             :  *
     684             :  * Returns the total amount of data written.  On failure, a negative value
     685             :  * is returned with errno set.
     686             :  */
     687             : ssize_t
     688      394922 : pg_pwrite_zeros(int fd, size_t size, off_t offset)
     689             : {
     690             :     static const PGIOAlignedBlock zbuffer = {{0}};  /* worth BLCKSZ */
     691      394922 :     void       *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data;    /* iov_base is non-const */
     692             :     struct iovec iov[PG_IOV_MAX];
     693      394922 :     size_t      remaining_size = size;
     694      394922 :     ssize_t     total_written = 0;
     695             : 
     696             :     /* Loop, writing as many blocks as we can for each system call. */
     697      953530 :     while (remaining_size > 0)
     698             :     {
     699      558608 :         int         iovcnt = 0;
     700             :         ssize_t     written;
     701             :         /* Fill iov[]; every entry points at the same zero buffer. */
     702     6298100 :         for (; iovcnt < PG_IOV_MAX && remaining_size > 0; iovcnt++)
     703             :         {
     704             :             size_t      this_iov_size;
     705             : 
     706     5739492 :             iov[iovcnt].iov_base = zerobuf_addr;
     707             : 
     708     5739492 :             if (remaining_size < BLCKSZ)
     709           0 :                 this_iov_size = remaining_size;     /* final, partial block */
     710             :             else
     711     5739492 :                 this_iov_size = BLCKSZ;
     712             : 
     713     5739492 :             iov[iovcnt].iov_len = this_iov_size;
     714     5739492 :             remaining_size -= this_iov_size;
     715             :         }
     716             : 
     717      558608 :         written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
     718             : 
     719      558608 :         if (written < 0)
     720           0 :             return written;     /* bytes written by earlier batches are not reported */
     721             : 
     722      558608 :         offset += written;
     723      558608 :         total_written += written;
     724             :     }
     725             : 
     726             :     Assert(total_written == size);  /* each batch either fails or is written in full */
     727             : 
     728      394922 :     return total_written;
     729             : }

Generated by: LCOV version 1.14