LCOV - code coverage report
Current view: top level - src/common - file_utils.c (source / functions) Hit Total Coverage
Test: PostgreSQL 17devel Lines: 140 206 68.0 %
Date: 2024-04-19 04:11:42 Functions: 12 12 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * File-processing utility routines.
       4             :  *
       5             :  * Assorted utility functions to work on files.
       6             :  *
       7             :  *
       8             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
       9             :  * Portions Copyright (c) 1994, Regents of the University of California
      10             :  *
      11             :  * src/common/file_utils.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #ifndef FRONTEND
      17             : #include "postgres.h"
      18             : #else
      19             : #include "postgres_fe.h"
      20             : #endif
      21             : 
      22             : #include <dirent.h>
      23             : #include <fcntl.h>
      24             : #include <sys/stat.h>
      25             : #include <unistd.h>
      26             : 
      27             : #include "common/file_utils.h"
      28             : #ifdef FRONTEND
      29             : #include "common/logging.h"
      30             : #endif
      31             : #include "port/pg_iovec.h"
      32             : 
      33             : #ifdef FRONTEND
      34             : 
      35             : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
      36             : #if defined(HAVE_SYNC_FILE_RANGE)
      37             : #define PG_FLUSH_DATA_WORKS 1
      38             : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
      39             : #define PG_FLUSH_DATA_WORKS 1
      40             : #endif
      41             : 
      42             : /*
      43             :  * pg_xlog has been renamed to pg_wal in version 10.
      44             :  */
      45             : #define MINIMUM_VERSION_FOR_PG_WAL  100000
      46             : 
      47             : #ifdef PG_FLUSH_DATA_WORKS
      48             : static int  pre_sync_fname(const char *fname, bool isdir);
      49             : #endif
      50             : static void walkdir(const char *path,
      51             :                     int (*action) (const char *fname, bool isdir),
      52             :                     bool process_symlinks);
      53             : 
      54             : #ifdef HAVE_SYNCFS
      55             : 
      56             : /*
      57             :  * do_syncfs -- Try to syncfs a file system
      58             :  *
      59             :  * Reports errors trying to open the path.  syncfs() errors are fatal.
      60             :  */
      61             : static void
      62           4 : do_syncfs(const char *path)
      63             : {
      64             :     int         fd;
      65             : 
      66           4 :     fd = open(path, O_RDONLY, 0);
      67             : 
      68           4 :     if (fd < 0)
      69             :     {
      70           0 :         pg_log_error("could not open file \"%s\": %m", path);
      71           0 :         return;
      72             :     }
      73             : 
      74           4 :     if (syncfs(fd) < 0)
      75             :     {
      76           0 :         pg_log_error("could not synchronize file system for file \"%s\": %m", path);
      77           0 :         (void) close(fd);
      78           0 :         exit(EXIT_FAILURE);
      79             :     }
      80             : 
      81           4 :     (void) close(fd);
      82             : }
      83             : 
      84             : #endif                          /* HAVE_SYNCFS */
      85             : 
      86             : /*
      87             :  * Synchronize PGDATA and all its contents.
      88             :  *
      89             :  * We sync regular files and directories wherever they are, but we follow
      90             :  * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc.
      91             :  * Other symlinks are presumed to point at files we're not responsible for
      92             :  * syncing, and might not have privileges to write at all.
      93             :  *
      94             :  * serverVersion indicates the version of the server to be sync'd.
      95             :  */
      96             : void
      97          26 : sync_pgdata(const char *pg_data,
      98             :             int serverVersion,
      99             :             DataDirSyncMethod sync_method)
     100             : {
     101             :     bool        xlog_is_symlink;
     102             :     char        pg_wal[MAXPGPATH];
     103             :     char        pg_tblspc[MAXPGPATH];
     104             : 
     105             :     /* handle renaming of pg_xlog to pg_wal in post-10 clusters */
     106          26 :     snprintf(pg_wal, MAXPGPATH, "%s/%s", pg_data,
     107             :              serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal");
     108          26 :     snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pg_data);
     109             : 
     110             :     /*
     111             :      * If pg_wal is a symlink, we'll need to recurse into it separately,
     112             :      * because the first walkdir below will ignore it.
     113             :      */
     114          26 :     xlog_is_symlink = false;
     115             : 
     116             :     {
     117             :         struct stat st;
     118             : 
     119          26 :         if (lstat(pg_wal, &st) < 0)
     120           0 :             pg_log_error("could not stat file \"%s\": %m", pg_wal);
     121          26 :         else if (S_ISLNK(st.st_mode))
     122           4 :             xlog_is_symlink = true;
     123             :     }
     124             : 
     125          26 :     switch (sync_method)
     126             :     {
     127           2 :         case DATA_DIR_SYNC_METHOD_SYNCFS:
     128             :             {
     129             : #ifndef HAVE_SYNCFS
     130             :                 pg_log_error("this build does not support sync method \"%s\"",
     131             :                              "syncfs");
     132             :                 exit(EXIT_FAILURE);
     133             : #else
     134             :                 DIR        *dir;
     135             :                 struct dirent *de;
     136             : 
     137             :                 /*
     138             :                  * On Linux, we don't have to open every single file one by
     139             :                  * one.  We can use syncfs() to sync whole filesystems.  We
     140             :                  * only expect filesystem boundaries to exist where we
     141             :                  * tolerate symlinks, namely pg_wal and the tablespaces, so we
     142             :                  * call syncfs() for each of those directories.
     143             :                  */
     144             : 
     145             :                 /* Sync the top level pgdata directory. */
     146           2 :                 do_syncfs(pg_data);
     147             : 
     148             :                 /* If any tablespaces are configured, sync each of those. */
     149           2 :                 dir = opendir(pg_tblspc);
     150           2 :                 if (dir == NULL)
     151           0 :                     pg_log_error("could not open directory \"%s\": %m",
     152             :                                  pg_tblspc);
     153             :                 else
     154             :                 {
     155           6 :                     while (errno = 0, (de = readdir(dir)) != NULL)
     156             :                     {
     157             :                         char        subpath[MAXPGPATH * 2];
     158             : 
     159           4 :                         if (strcmp(de->d_name, ".") == 0 ||
     160           2 :                             strcmp(de->d_name, "..") == 0)
     161           4 :                             continue;
     162             : 
     163           0 :                         snprintf(subpath, sizeof(subpath), "%s/%s",
     164           0 :                                  pg_tblspc, de->d_name);
     165           0 :                         do_syncfs(subpath);
     166             :                     }
     167             : 
     168           2 :                     if (errno)
     169           0 :                         pg_log_error("could not read directory \"%s\": %m",
     170             :                                      pg_tblspc);
     171             : 
     172           2 :                     (void) closedir(dir);
     173             :                 }
     174             : 
     175             :                 /* If pg_wal is a symlink, process that too. */
     176           2 :                 if (xlog_is_symlink)
     177           2 :                     do_syncfs(pg_wal);
     178             : #endif                          /* HAVE_SYNCFS */
     179             :             }
     180           2 :             break;
     181             : 
     182          24 :         case DATA_DIR_SYNC_METHOD_FSYNC:
     183             :             {
     184             :                 /*
     185             :                  * If possible, hint to the kernel that we're soon going to
     186             :                  * fsync the data directory and its contents.
     187             :                  */
     188             : #ifdef PG_FLUSH_DATA_WORKS
     189          24 :                 walkdir(pg_data, pre_sync_fname, false);
     190          24 :                 if (xlog_is_symlink)
     191           2 :                     walkdir(pg_wal, pre_sync_fname, false);
     192          24 :                 walkdir(pg_tblspc, pre_sync_fname, true);
     193             : #endif
     194             : 
     195             :                 /*
     196             :                  * Now we do the fsync()s in the same order.
     197             :                  *
     198             :                  * The main call ignores symlinks, so in addition to specially
     199             :                  * processing pg_wal if it's a symlink, pg_tblspc has to be
     200             :                  * visited separately with process_symlinks = true.  Note that
     201             :                  * if there are any plain directories in pg_tblspc, they'll
     202             :                  * get fsync'd twice. That's not an expected case so we don't
     203             :                  * worry about optimizing it.
     204             :                  */
     205          24 :                 walkdir(pg_data, fsync_fname, false);
     206          24 :                 if (xlog_is_symlink)
     207           2 :                     walkdir(pg_wal, fsync_fname, false);
     208          24 :                 walkdir(pg_tblspc, fsync_fname, true);
     209             :             }
     210          24 :             break;
     211             :     }
     212          26 : }
     213             : 
     214             : /*
     215             :  * Synchronize the given directory and all its contents.
     216             :  *
     217             :  * This is a convenient wrapper on top of walkdir() and do_syncfs().
     218             :  */
     219             : void
     220           8 : sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method)
     221             : {
     222           8 :     switch (sync_method)
     223             :     {
     224           0 :         case DATA_DIR_SYNC_METHOD_SYNCFS:
     225             :             {
     226             : #ifndef HAVE_SYNCFS
     227             :                 pg_log_error("this build does not support sync method \"%s\"",
     228             :                              "syncfs");
     229             :                 exit(EXIT_FAILURE);
     230             : #else
     231             :                 /*
     232             :                  * On Linux, we don't have to open every single file one by
     233             :                  * one.  We can use syncfs() to sync the whole filesystem.
     234             :                  */
     235           0 :                 do_syncfs(dir);
     236             : #endif                          /* HAVE_SYNCFS */
     237             :             }
     238           0 :             break;
     239             : 
     240           8 :         case DATA_DIR_SYNC_METHOD_FSYNC:
     241             :             {
     242             :                 /*
     243             :                  * If possible, hint to the kernel that we're soon going to
     244             :                  * fsync the data directory and its contents.
     245             :                  */
     246             : #ifdef PG_FLUSH_DATA_WORKS
     247           8 :                 walkdir(dir, pre_sync_fname, false);
     248             : #endif
     249             : 
     250           8 :                 walkdir(dir, fsync_fname, false);
     251             :             }
     252           8 :             break;
     253             :     }
     254           8 : }
     255             : 
     256             : /*
     257             :  * walkdir: recursively walk a directory, applying the action to each
     258             :  * regular file and directory (including the named directory itself).
     259             :  *
     260             :  * If process_symlinks is true, the action and recursion are also applied
     261             :  * to regular files and directories that are pointed to by symlinks in the
     262             :  * given directory; otherwise symlinks are ignored.  Symlinks are always
     263             :  * ignored in subdirectories, ie we intentionally don't pass down the
     264             :  * process_symlinks flag to recursive calls.
     265             :  *
     266             :  * Errors are reported but not considered fatal.
     267             :  *
     268             :  * See also walkdir in fd.c, which is a backend version of this logic.
     269             :  */
     270             : static void
     271        1372 : walkdir(const char *path,
     272             :         int (*action) (const char *fname, bool isdir),
     273             :         bool process_symlinks)
     274             : {
     275             :     DIR        *dir;
     276             :     struct dirent *de;
     277             : 
     278        1372 :     dir = opendir(path);
     279        1372 :     if (dir == NULL)
     280             :     {
     281           0 :         pg_log_error("could not open directory \"%s\": %m", path);
     282           0 :         return;
     283             :     }
     284             : 
     285       55020 :     while (errno = 0, (de = readdir(dir)) != NULL)
     286             :     {
     287             :         char        subpath[MAXPGPATH * 2];
     288             : 
     289       53648 :         if (strcmp(de->d_name, ".") == 0 ||
     290       52276 :             strcmp(de->d_name, "..") == 0)
     291        2744 :             continue;
     292             : 
     293       50904 :         snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
     294             : 
     295       50904 :         switch (get_dirent_type(subpath, de, process_symlinks, PG_LOG_ERROR))
     296             :         {
     297       49644 :             case PGFILETYPE_REG:
     298       49644 :                 (*action) (subpath, false);
     299       49644 :                 break;
     300        1256 :             case PGFILETYPE_DIR:
     301        1256 :                 walkdir(subpath, action, false);
     302        1256 :                 break;
     303           4 :             default:
     304             : 
     305             :                 /*
     306             :                  * Errors are already reported directly by get_dirent_type(),
     307             :                  * and any remaining symlinks and unknown file types are
     308             :                  * ignored.
     309             :                  */
     310           4 :                 break;
     311             :         }
     312             :     }
     313             : 
     314        1372 :     if (errno)
     315           0 :         pg_log_error("could not read directory \"%s\": %m", path);
     316             : 
     317        1372 :     (void) closedir(dir);
     318             : 
     319             :     /*
     320             :      * It's important to fsync the destination directory itself as individual
     321             :      * file fsyncs don't guarantee that the directory entry for the file is
     322             :      * synced.  Recent versions of ext4 have made the window much wider but
     323             :      * it's been an issue for ext3 and other filesystems in the past.
     324             :      */
     325        1372 :     (*action) (path, true);
     326             : }
     327             : 
     328             : /*
     329             :  * Hint to the OS that it should get ready to fsync() this file.
     330             :  *
     331             :  * Ignores errors trying to open unreadable files, and reports other errors
     332             :  * non-fatally.
     333             :  */
     334             : #ifdef PG_FLUSH_DATA_WORKS
     335             : 
     336             : static int
     337       25508 : pre_sync_fname(const char *fname, bool isdir)
     338             : {
     339             :     int         fd;
     340             : 
     341       25508 :     fd = open(fname, O_RDONLY | PG_BINARY, 0);
     342             : 
     343       25508 :     if (fd < 0)
     344             :     {
     345           0 :         if (errno == EACCES || (isdir && errno == EISDIR))
     346           0 :             return 0;
     347           0 :         pg_log_error("could not open file \"%s\": %m", fname);
     348           0 :         return -1;
     349             :     }
     350             : 
     351             :     /*
     352             :      * We do what pg_flush_data() would do in the backend: prefer to use
     353             :      * sync_file_range, but fall back to posix_fadvise.  We ignore errors
     354             :      * because this is only a hint.
     355             :      */
     356             : #if defined(HAVE_SYNC_FILE_RANGE)
     357       25508 :     (void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
     358             : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
     359             :     (void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
     360             : #else
     361             : #error PG_FLUSH_DATA_WORKS should not have been defined
     362             : #endif
     363             : 
     364       25508 :     (void) close(fd);
     365       25508 :     return 0;
     366             : }
     367             : 
     368             : #endif                          /* PG_FLUSH_DATA_WORKS */
     369             : 
     370             : /*
     371             :  * fsync_fname -- Try to fsync a file or directory
     372             :  *
     373             :  * Ignores errors trying to open unreadable files, or trying to fsync
     374             :  * directories on systems where that isn't allowed/required.  All other errors
     375             :  * are fatal.
     376             :  */
     377             : int
     378       25626 : fsync_fname(const char *fname, bool isdir)
     379             : {
     380             :     int         fd;
     381             :     int         flags;
     382             :     int         returncode;
     383             : 
     384             :     /*
     385             :      * Some OSs require directories to be opened read-only whereas other
     386             :      * systems don't allow us to fsync files opened read-only; so we need both
     387             :      * cases here.  Using O_RDWR will cause us to fail to fsync files that are
     388             :      * not writable by our userid, but we assume that's OK.
     389             :      */
     390       25626 :     flags = PG_BINARY;
     391       25626 :     if (!isdir)
     392       24904 :         flags |= O_RDWR;
     393             :     else
     394         722 :         flags |= O_RDONLY;
     395             : 
     396             :     /*
     397             :      * Open the file, silently ignoring errors about unreadable files (or
     398             :      * unsupported operations, e.g. opening a directory under Windows), and
     399             :      * logging others.
     400             :      */
     401       25626 :     fd = open(fname, flags, 0);
     402       25626 :     if (fd < 0)
     403             :     {
     404           0 :         if (errno == EACCES || (isdir && errno == EISDIR))
     405           0 :             return 0;
     406           0 :         pg_log_error("could not open file \"%s\": %m", fname);
     407           0 :         return -1;
     408             :     }
     409             : 
     410       25626 :     returncode = fsync(fd);
     411             : 
     412             :     /*
     413             :      * Some OSes don't allow us to fsync directories at all, so we can ignore
     414             :      * those errors. Anything else needs to be reported.
     415             :      */
     416       25626 :     if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
     417             :     {
     418           0 :         pg_log_error("could not fsync file \"%s\": %m", fname);
     419           0 :         (void) close(fd);
     420           0 :         exit(EXIT_FAILURE);
     421             :     }
     422             : 
     423       25626 :     (void) close(fd);
     424       25626 :     return 0;
     425             : }
     426             : 
     427             : /*
     428             :  * fsync_parent_path -- fsync the parent path of a file or directory
     429             :  *
     430             :  * This is aimed at making file operations persistent on disk in case of
     431             :  * an OS crash or power failure.
     432             :  */
     433             : int
     434          28 : fsync_parent_path(const char *fname)
     435             : {
     436             :     char        parentpath[MAXPGPATH];
     437             : 
     438          28 :     strlcpy(parentpath, fname, MAXPGPATH);
     439          28 :     get_parent_directory(parentpath);
     440             : 
     441             :     /*
     442             :      * get_parent_directory() returns an empty string if the input argument is
     443             :      * just a file name (see comments in path.c), so handle that as being the
     444             :      * current directory.
     445             :      */
     446          28 :     if (strlen(parentpath) == 0)
     447           0 :         strlcpy(parentpath, ".", MAXPGPATH);
     448             : 
     449          28 :     if (fsync_fname(parentpath, true) != 0)
     450           0 :         return -1;
     451             : 
     452          28 :     return 0;
     453             : }
     454             : 
     455             : /*
     456             :  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
     457             :  *
     458             :  * Wrapper around rename, similar to the backend version.
     459             :  */
     460             : int
     461           6 : durable_rename(const char *oldfile, const char *newfile)
     462             : {
     463             :     int         fd;
     464             : 
     465             :     /*
     466             :      * First fsync the old and target path (if it exists), to ensure that they
     467             :      * are properly persistent on disk. Syncing the target file is not
     468             :      * strictly necessary, but it makes it easier to reason about crashes;
     469             :      * because it's then guaranteed that either source or target file exists
     470             :      * after a crash.
     471             :      */
     472           6 :     if (fsync_fname(oldfile, false) != 0)
     473           0 :         return -1;
     474             : 
     475           6 :     fd = open(newfile, PG_BINARY | O_RDWR, 0);
     476           6 :     if (fd < 0)
     477             :     {
     478           6 :         if (errno != ENOENT)
     479             :         {
     480           0 :             pg_log_error("could not open file \"%s\": %m", newfile);
     481           0 :             return -1;
     482             :         }
     483             :     }
     484             :     else
     485             :     {
     486           0 :         if (fsync(fd) != 0)
     487             :         {
     488           0 :             pg_log_error("could not fsync file \"%s\": %m", newfile);
     489           0 :             close(fd);
     490           0 :             exit(EXIT_FAILURE);
     491             :         }
     492           0 :         close(fd);
     493             :     }
     494             : 
     495             :     /* Time to do the real deal... */
     496           6 :     if (rename(oldfile, newfile) != 0)
     497             :     {
     498           0 :         pg_log_error("could not rename file \"%s\" to \"%s\": %m",
     499             :                      oldfile, newfile);
     500           0 :         return -1;
     501             :     }
     502             : 
     503             :     /*
     504             :      * To guarantee renaming the file is persistent, fsync the file with its
     505             :      * new name, and its containing directory.
     506             :      */
     507           6 :     if (fsync_fname(newfile, false) != 0)
     508           0 :         return -1;
     509             : 
     510           6 :     if (fsync_parent_path(newfile) != 0)
     511           0 :         return -1;
     512             : 
     513           6 :     return 0;
     514             : }
     515             : 
     516             : #endif                          /* FRONTEND */
     517             : 
     518             : /*
     519             :  * Return the type of a directory entry.
     520             :  *
     521             :  * In frontend code, elevel should be a level from logging.h; in backend code
     522             :  * it should be a level from elog.h.
     523             :  */
     524             : PGFileType
     525      395856 : get_dirent_type(const char *path,
     526             :                 const struct dirent *de,
     527             :                 bool look_through_symlinks,
     528             :                 int elevel)
     529             : {
     530             :     PGFileType  result;
     531             : 
     532             :     /*
     533             :      * Some systems tell us the type directly in the dirent struct, but that's
     534             :      * a BSD and Linux extension not required by POSIX.  Even when the
     535             :      * interface is present, sometimes the type is unknown, depending on the
     536             :      * filesystem.
     537             :      */
     538             : #if defined(DT_REG) && defined(DT_DIR) && defined(DT_LNK)
     539      395856 :     if (de->d_type == DT_REG)
     540      388622 :         result = PGFILETYPE_REG;
     541        7234 :     else if (de->d_type == DT_DIR)
     542        7194 :         result = PGFILETYPE_DIR;
     543          40 :     else if (de->d_type == DT_LNK && !look_through_symlinks)
     544          40 :         result = PGFILETYPE_LNK;
     545             :     else
     546           0 :         result = PGFILETYPE_UNKNOWN;
     547             : #else
     548             :     result = PGFILETYPE_UNKNOWN;
     549             : #endif
     550             : 
     551      395856 :     if (result == PGFILETYPE_UNKNOWN)
     552             :     {
     553             :         struct stat fst;
     554             :         int         sret;
     555             : 
     556             : 
     557           0 :         if (look_through_symlinks)
     558           0 :             sret = stat(path, &fst);
     559             :         else
     560           0 :             sret = lstat(path, &fst);
     561             : 
     562           0 :         if (sret < 0)
     563             :         {
     564           0 :             result = PGFILETYPE_ERROR;
     565             : #ifdef FRONTEND
     566           0 :             pg_log_generic(elevel, PG_LOG_PRIMARY, "could not stat file \"%s\": %m", path);
     567             : #else
     568           0 :             ereport(elevel,
     569             :                     (errcode_for_file_access(),
     570             :                      errmsg("could not stat file \"%s\": %m", path)));
     571             : #endif
     572             :         }
     573           0 :         else if (S_ISREG(fst.st_mode))
     574           0 :             result = PGFILETYPE_REG;
     575           0 :         else if (S_ISDIR(fst.st_mode))
     576           0 :             result = PGFILETYPE_DIR;
     577           0 :         else if (S_ISLNK(fst.st_mode))
     578           0 :             result = PGFILETYPE_LNK;
     579             :     }
     580             : 
     581      395856 :     return result;
     582             : }
     583             : 
     /*
      * compute_remaining_iovec
      *
      * Work out which part of a vectored transfer is still outstanding after a
      * read or write that may have been only partially completed.
      *
      * 'source' holds 'iovcnt' entries describing the original request and
      * 'transferred' is the number of bytes already moved.  Entries that were
      * consumed entirely are dropped; the first entry that was not fully consumed
      * is trimmed so that it starts right after the transferred bytes.  The
      * result is stored in 'destination', which is allowed to be the very same
      * array as 'source' (in-place adjustment).
      *
      * Returns the number of entries written to 'destination'.  Zero means that
      * everything has been transferred and nothing is left to do.
      */
     int
     compute_remaining_iovec(struct iovec *destination,
                             const struct iovec *source,
                             int iovcnt,
                             size_t transferred)
     {
         const struct iovec *cur = source;
         int         left = iovcnt;
         size_t      carry = transferred;

         Assert(iovcnt > 0);

         /* Step over every entry that is completely covered by 'carry'. */
         for (;;)
         {
             if (cur->iov_len > carry)
                 break;

             carry -= cur->iov_len;
             cur++;

             if (--left == 0)
             {
                 /*
                  * Ran off the end of the array: the whole request is done.
                  * Any leftover byte count would mean more was transferred
                  * than was asked for, i.e. something is out of sync.
                  */
                 Assert(carry == 0);
                 return 0;
             }
         }

         /* Slide the unfinished entries down to the start of the output. */
         if (cur != destination)
             memmove(destination, cur, sizeof(struct iovec) * left);

         /*
          * The first surviving entry may have been partly transferred; advance
          * its base pointer and shrink its length by the bytes already done.
          */
         Assert(destination->iov_len > carry);
         destination->iov_len -= carry;
         destination->iov_base = (char *) destination->iov_base + carry;

         return left;
     }
     629             : 
     630             : /*
     631             :  * pg_pwritev_with_retry
     632             :  *
     633             :  * Convenience wrapper for pg_pwritev() that retries on partial write.  If an
     634             :  * error is returned, it is unspecified how much has been written.
     635             :  */
     636             : ssize_t
     637      421698 : pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
     638             : {
     639             :     struct iovec iov_copy[PG_IOV_MAX];
     640      421698 :     ssize_t     sum = 0;
     641             :     ssize_t     part;
     642             : 
     643             :     /* We'd better have space to make a copy, in case we need to retry. */
     644      421698 :     if (iovcnt > PG_IOV_MAX)
     645             :     {
     646           0 :         errno = EINVAL;
     647           0 :         return -1;
     648             :     }
     649             : 
     650             :     do
     651             :     {
     652             :         /* Write as much as we can. */
     653      421698 :         part = pg_pwritev(fd, iov, iovcnt, offset);
     654      421698 :         if (part < 0)
     655           0 :             return -1;
     656             : 
     657             : #ifdef SIMULATE_SHORT_WRITE
     658             :         part = Min(part, 4096);
     659             : #endif
     660             : 
     661             :         /* Count our progress. */
     662      421698 :         sum += part;
     663      421698 :         offset += part;
     664             : 
     665             :         /*
     666             :          * See what is left.  On the first loop we used the caller's array,
     667             :          * but in later loops we'll use our local copy that we are allowed to
     668             :          * mutate.
     669             :          */
     670      421698 :         iovcnt = compute_remaining_iovec(iov_copy, iov, iovcnt, part);
     671      421698 :         iov = iov_copy;
     672      421698 :     } while (iovcnt > 0);
     673             : 
     674      421698 :     return sum;
     675             : }
     676             : 
     677             : /*
     678             :  * pg_pwrite_zeros
     679             :  *
     680             :  * Writes zeros to file worth "size" bytes at "offset" (from the start of the
     681             :  * file), using vectored I/O.
     682             :  *
     683             :  * Returns the total amount of data written.  On failure, a negative value
     684             :  * is returned with errno set.
     685             :  */
     686             : ssize_t
     687      374190 : pg_pwrite_zeros(int fd, size_t size, off_t offset)
     688             : {
     689             :     static const PGIOAlignedBlock zbuffer = {{0}};  /* worth BLCKSZ */
     690      374190 :     void       *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data;
     691             :     struct iovec iov[PG_IOV_MAX];
     692      374190 :     size_t      remaining_size = size;
     693      374190 :     ssize_t     total_written = 0;
     694             : 
     695             :     /* Loop, writing as many blocks as we can for each system call. */
     696      795888 :     while (remaining_size > 0)
     697             :     {
     698      421698 :         int         iovcnt = 0;
     699             :         ssize_t     written;
     700             : 
     701     2365456 :         for (; iovcnt < PG_IOV_MAX && remaining_size > 0; iovcnt++)
     702             :         {
     703             :             size_t      this_iov_size;
     704             : 
     705     1943758 :             iov[iovcnt].iov_base = zerobuf_addr;
     706             : 
     707     1943758 :             if (remaining_size < BLCKSZ)
     708           0 :                 this_iov_size = remaining_size;
     709             :             else
     710     1943758 :                 this_iov_size = BLCKSZ;
     711             : 
     712     1943758 :             iov[iovcnt].iov_len = this_iov_size;
     713     1943758 :             remaining_size -= this_iov_size;
     714             :         }
     715             : 
     716      421698 :         written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
     717             : 
     718      421698 :         if (written < 0)
     719           0 :             return written;
     720             : 
     721      421698 :         offset += written;
     722      421698 :         total_written += written;
     723             :     }
     724             : 
     725             :     Assert(total_written == size);
     726             : 
     727      374190 :     return total_written;
     728             : }

Generated by: LCOV version 1.14