LCOV - code coverage report
Current view: top level - src/common - file_utils.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 72.7 % 216 157
Test Date: 2026-03-04 08:14:57 Functions: 100.0 % 12 12
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * File-processing utility routines.
       4              :  *
       5              :  * Assorted utility functions to work on files.
       6              :  *
       7              :  *
       8              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       9              :  * Portions Copyright (c) 1994, Regents of the University of California
      10              :  *
      11              :  * src/common/file_utils.c
      12              :  *
      13              :  *-------------------------------------------------------------------------
      14              :  */
      15              : 
      16              : #ifndef FRONTEND
      17              : #include "postgres.h"
      18              : #else
      19              : #include "postgres_fe.h"
      20              : #endif
      21              : 
      22              : #include <dirent.h>
      23              : #include <fcntl.h>
      24              : #include <sys/stat.h>
      25              : #include <unistd.h>
      26              : 
      27              : #include "common/file_utils.h"
      28              : #ifdef FRONTEND
      29              : #include "common/logging.h"
      30              : #endif
      31              : #include "common/relpath.h"
      32              : #include "port/pg_iovec.h"
      33              : 
      34              : #ifdef FRONTEND
      35              : 
      36              : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
      37              : #if defined(HAVE_SYNC_FILE_RANGE)
      38              : #define PG_FLUSH_DATA_WORKS 1
      39              : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
      40              : #define PG_FLUSH_DATA_WORKS 1
      41              : #endif
      42              : 
      43              : /*
      44              :  * pg_xlog has been renamed to pg_wal in version 10.
      45              :  */
      46              : #define MINIMUM_VERSION_FOR_PG_WAL  100000
      47              : 
      48              : static void walkdir(const char *path,
      49              :                     int (*action) (const char *fname, bool isdir),
      50              :                     bool process_symlinks,
      51              :                     const char *exclude_dir);
      52              : 
      53              : #ifdef HAVE_SYNCFS
      54              : 
      55              : /*
      56              :  * do_syncfs -- Try to syncfs a file system
      57              :  *
      58              :  * Opens 'path' read-only and calls syncfs() on the descriptor.  Errors trying to open the path are reported and the function returns without syncing; syncfs() errors are fatal (the program exits with EXIT_FAILURE).
      59              :  */
      60              : static void
      61            2 : do_syncfs(const char *path)
      62              : {
      63              :     int         fd;
      64              : 
      65            2 :     fd = open(path, O_RDONLY, 0);
      66              : 
      67            2 :     if (fd < 0)
      68              :     {
      69            0 :         pg_log_error("could not open file \"%s\": %m", path);
      70            0 :         return;             /* open failure is not fatal: report and give up on this path */
      71              :     }
      72              : 
      73            2 :     if (syncfs(fd) < 0)
      74              :     {
      75            0 :         pg_log_error("could not synchronize file system for file \"%s\": %m", path);
      76            0 :         (void) close(fd);   /* close result deliberately ignored, we are exiting anyway */
      77            0 :         exit(EXIT_FAILURE);
      78              :     }
      79              : 
      80            2 :     (void) close(fd);
      81              : }
      82              : 
      83              : #endif                          /* HAVE_SYNCFS */
      84              : 
      85              : /*
      86              :  * Synchronize PGDATA and all its contents.
      87              :  *
      88              :  * We sync regular files and directories wherever they are, but we follow
      89              :  * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc.
      90              :  * Other symlinks are presumed to point at files we're not responsible for
      91              :  * syncing, and might not have privileges to write at all.
      92              :  *
      93              :  * serverVersion indicates the version of the server to be sync'd; here it is only used to pick "pg_xlog" (below MINIMUM_VERSION_FOR_PG_WAL) or "pg_wal" as the WAL directory name.
      94              :  *
      95              :  * If sync_data_files is false, this function skips syncing "base/" and any
      96              :  * other tablespace directories.  (With the syncfs method only the pg_tblspc scan is skipped; the syncfs() call on pg_data itself is unconditional.)
      97              :  */
      98              : void
      99           17 : sync_pgdata(const char *pg_data,
     100              :             int serverVersion,
     101              :             DataDirSyncMethod sync_method,
     102              :             bool sync_data_files)
     103              : {
     104              :     bool        xlog_is_symlink;
     105              :     char        pg_wal[MAXPGPATH];
     106              :     char        pg_tblspc[MAXPGPATH];
     107              : 
     108              :     /* handle renaming of pg_xlog to pg_wal in post-10 clusters */
     109           17 :     snprintf(pg_wal, MAXPGPATH, "%s/%s", pg_data,
     110              :              serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal");
     111           17 :     snprintf(pg_tblspc, MAXPGPATH, "%s/%s", pg_data, PG_TBLSPC_DIR);
     112              : 
     113              :     /*
     114              :      * If pg_wal is a symlink, we'll need to recurse into it separately,
     115              :      * because the first walkdir below will ignore it.
     116              :      */
     117           17 :     xlog_is_symlink = false;
     118              : 
     119              :     {
     120              :         struct stat st;
     121              : 
     122           17 :         if (lstat(pg_wal, &st) < 0)
     123            0 :             pg_log_error("could not stat file \"%s\": %m", pg_wal);  /* non-fatal: xlog_is_symlink stays false */
     124           17 :         else if (S_ISLNK(st.st_mode))
     125            3 :             xlog_is_symlink = true;
     126              :     }
     127              : 
     128           17 :     switch (sync_method)
     129              :     {
     130            1 :         case DATA_DIR_SYNC_METHOD_SYNCFS:
     131              :             {
     132              : #ifndef HAVE_SYNCFS
     133              :                 pg_log_error("this build does not support sync method \"%s\"",
     134              :                              "syncfs");
     135              :                 exit(EXIT_FAILURE);
     136              : #else
     137              :                 DIR        *dir;
     138              :                 struct dirent *de;
     139              : 
     140              :                 /*
     141              :                  * On Linux, we don't have to open every single file one by
     142              :                  * one.  We can use syncfs() to sync whole filesystems.  We
     143              :                  * only expect filesystem boundaries to exist where we
     144              :                  * tolerate symlinks, namely pg_wal and the tablespaces, so we
     145              :                  * call syncfs() for each of those directories.
     146              :                  */
     147              : 
     148              :                 /* Sync the top level pgdata directory. */
     149            1 :                 do_syncfs(pg_data);
     150              : 
     151              :                 /* If any tablespaces are configured, sync each of those. */
     152            1 :                 if (sync_data_files)
     153              :                 {
     154            1 :                     dir = opendir(pg_tblspc);
     155            1 :                     if (dir == NULL)
     156            0 :                         pg_log_error("could not open directory \"%s\": %m",
     157              :                                      pg_tblspc);
     158              :                     else
     159              :                     {
     160            3 :                         while (errno = 0, (de = readdir(dir)) != NULL)  /* errno is cleared before each readdir() so the check after the loop can tell end-of-directory from an error */
     161              :                         {
     162              :                             char        subpath[MAXPGPATH * 2];
     163              : 
     164            2 :                             if (strcmp(de->d_name, ".") == 0 ||
     165            1 :                                 strcmp(de->d_name, "..") == 0)
     166            2 :                                 continue;
     167              : 
     168            0 :                             snprintf(subpath, sizeof(subpath), "%s/%s",
     169            0 :                                      pg_tblspc, de->d_name);
     170            0 :                             do_syncfs(subpath);     /* every entry is passed on, whatever its file type */
     171              :                         }
     172              : 
     173            1 :                         if (errno)
     174            0 :                             pg_log_error("could not read directory \"%s\": %m",
     175              :                                          pg_tblspc);
     176              : 
     177            1 :                         (void) closedir(dir);
     178              :                     }
     179              :                 }
     180              : 
     181              :                 /* If pg_wal is a symlink, process that too. */
     182            1 :                 if (xlog_is_symlink)
     183            1 :                     do_syncfs(pg_wal);
     184              : #endif                          /* HAVE_SYNCFS */
     185              :             }
     186            1 :             break;
     187              : 
     188           16 :         case DATA_DIR_SYNC_METHOD_FSYNC:
     189              :             {
     190           16 :                 char       *exclude_dir = NULL;
     191              : 
     192           16 :                 if (!sync_data_files)
     193            1 :                     exclude_dir = psprintf("%s/base", pg_data);  /* freed at the end of this case */
     194              : 
     195              :                 /*
     196              :                  * If possible, hint to the kernel that we're soon going to
     197              :                  * fsync the data directory and its contents.
     198              :                  */
     199              : #ifdef PG_FLUSH_DATA_WORKS
     200           16 :                 walkdir(pg_data, pre_sync_fname, false, exclude_dir);
     201           16 :                 if (xlog_is_symlink)
     202            2 :                     walkdir(pg_wal, pre_sync_fname, false, NULL);
     203           16 :                 if (sync_data_files)
     204           15 :                     walkdir(pg_tblspc, pre_sync_fname, true, NULL);
     205              : #endif
     206              : 
     207              :                 /*
     208              :                  * Now we do the fsync()s in the same order.
     209              :                  *
     210              :                  * The main call ignores symlinks, so in addition to specially
     211              :                  * processing pg_wal if it's a symlink, pg_tblspc has to be
     212              :                  * visited separately with process_symlinks = true.  Note that
     213              :                  * if there are any plain directories in pg_tblspc, they'll
     214              :                  * get fsync'd twice. That's not an expected case so we don't
     215              :                  * worry about optimizing it.
     216              :                  */
     217           16 :                 walkdir(pg_data, fsync_fname, false, exclude_dir);
     218           16 :                 if (xlog_is_symlink)
     219            2 :                     walkdir(pg_wal, fsync_fname, false, NULL);
     220           16 :                 if (sync_data_files)
     221           15 :                     walkdir(pg_tblspc, fsync_fname, true, NULL);
     222              : 
     223           16 :                 if (exclude_dir)
     224            1 :                     pfree(exclude_dir);
     225              :             }
     226           16 :             break;
     227              :     }                           /* no default case: any other sync_method value syncs nothing */
     228           17 : }
     229              : 
     230              : /*
     231              :  * Synchronize the given directory and all its contents.
     232              :  *
     233              :  * This is a convenient wrapper on top of walkdir() and do_syncfs().  With the fsync method, walkdir() is called with process_symlinks = false and no excluded directory, so symlinks found under 'dir' are not followed.
     234              :  */
     235              : void
     236           49 : sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method)
     237              : {
     238           49 :     switch (sync_method)
     239              :     {
     240            0 :         case DATA_DIR_SYNC_METHOD_SYNCFS:
     241              :             {
     242              : #ifndef HAVE_SYNCFS
     243              :                 pg_log_error("this build does not support sync method \"%s\"",
     244              :                              "syncfs");
     245              :                 exit(EXIT_FAILURE);
     246              : #else
     247              :                 /*
     248              :                  * On Linux, we don't have to open every single file one by
     249              :                  * one.  We can use syncfs() to sync the whole filesystem.
     250              :                  */
     251            0 :                 do_syncfs(dir);
     252              : #endif                          /* HAVE_SYNCFS */
     253              :             }
     254            0 :             break;
     255              : 
     256           49 :         case DATA_DIR_SYNC_METHOD_FSYNC:
     257              :             {
     258              :                 /*
     259              :                  * If possible, hint to the kernel that we're soon going to
     260              :                  * fsync the data directory and its contents.
     261              :                  */
     262              : #ifdef PG_FLUSH_DATA_WORKS
     263           49 :                 walkdir(dir, pre_sync_fname, false, NULL);
     264              : #endif
     265              : 
     266           49 :                 walkdir(dir, fsync_fname, false, NULL);  /* second pass over the same tree does the actual fsync()s */
     267              :             }
     268           49 :             break;
     269              :     }                           /* no default case: any other sync_method value syncs nothing */
     270           49 : }
     271              : 
     272              : /*
     273              :  * walkdir: recursively walk a directory, applying the action to each
     274              :  * regular file and directory (including the named directory itself).
     275              :  *
     276              :  * If process_symlinks is true, the action and recursion are also applied
     277              :  * to regular files and directories that are pointed to by symlinks in the
     278              :  * given directory; otherwise symlinks are ignored.  Symlinks are always
     279              :  * ignored in subdirectories, ie we intentionally don't pass down the
     280              :  * process_symlinks flag to recursive calls.
     281              :  *
     282              :  * If exclude_dir is not NULL, it specifies a directory path to skip
     283              :  * processing.  The match is an exact string comparison against the path being walked, and exclude_dir is passed down to recursive calls.
     284              :  *
     285              :  * Errors opening or reading a directory are reported but not considered fatal.  The value returned by 'action' is discarded.
     286              :  *
     287              :  * See also walkdir in fd.c, which is a backend version of this logic.
     288              :  */
     289              : static void
     290          998 : walkdir(const char *path,
     291              :         int (*action) (const char *fname, bool isdir),
     292              :         bool process_symlinks,
     293              :         const char *exclude_dir)
     294              : {
     295              :     DIR        *dir;
     296              :     struct dirent *de;
     297              : 
     298          998 :     if (exclude_dir && strcmp(exclude_dir, path) == 0)
     299            2 :         return;                 /* excluded: neither its contents nor the directory itself is processed */
     300              : 
     301          996 :     dir = opendir(path);
     302          996 :     if (dir == NULL)
     303              :     {
     304            0 :         pg_log_error("could not open directory \"%s\": %m", path);
     305            0 :         return;
     306              :     }
     307              : 
     308        34874 :     while (errno = 0, (de = readdir(dir)) != NULL)  /* errno is cleared before each readdir() so the check after the loop can detect a read error */
     309              :     {
     310              :         char        subpath[MAXPGPATH * 2];
     311              : 
     312        33878 :         if (strcmp(de->d_name, ".") == 0 ||
     313        32882 :             strcmp(de->d_name, "..") == 0)
     314         1992 :             continue;
     315              : 
     316        31886 :         snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
     317              : 
     318        31886 :         switch (get_dirent_type(subpath, de, process_symlinks, PG_LOG_ERROR))
     319              :         {
     320        31046 :             case PGFILETYPE_REG:
     321        31046 :                 (*action) (subpath, false);
     322        31046 :                 break;
     323          834 :             case PGFILETYPE_DIR:
     324          834 :                 walkdir(subpath, action, false, exclude_dir);  /* process_symlinks is deliberately false below the top level */
     325          834 :                 break;
     326            6 :             default:
     327              : 
     328              :                 /*
     329              :                  * Errors are already reported directly by get_dirent_type(),
     330              :                  * and any remaining symlinks and unknown file types are
     331              :                  * ignored.
     332              :                  */
     333            6 :                 break;
     334              :         }
     335              :     }
     336              : 
     337          996 :     if (errno)
     338            0 :         pg_log_error("could not read directory \"%s\": %m", path);
     339              : 
     340          996 :     (void) closedir(dir);
     341              : 
     342              :     /*
     343              :      * It's important to fsync the destination directory itself as individual
     344              :      * file fsyncs don't guarantee that the directory entry for the file is
     345              :      * synced.  Recent versions of ext4 have made the window much wider but
     346              :      * it's been an issue for ext3 and other filesystems in the past.
     347              :      */
     348          996 :     (*action) (path, true);
     349              : }
     350              : 
     351              : /*
     352              :  * Hint to the OS that it should get ready to fsync() this file, if supported
     353              :  * by the platform.
     354              :  *
     355              :  * Ignores errors trying to open unreadable files, and reports other errors
     356              :  * non-fatally.  Returns -1 only when open() fails for a reason other than EACCES (or EISDIR when isdir is true); returns 0 otherwise, including when PG_FLUSH_DATA_WORKS is not defined and nothing is done.
     357              :  */
     358              : int
     359        16021 : pre_sync_fname(const char *fname, bool isdir)
     360              : {
     361              : #ifdef PG_FLUSH_DATA_WORKS
     362              :     int         fd;
     363              : 
     364        16021 :     fd = open(fname, O_RDONLY | PG_BINARY, 0);
     365              : 
     366        16021 :     if (fd < 0)
     367              :     {
     368            0 :         if (errno == EACCES || (isdir && errno == EISDIR))
     369            0 :             return 0;           /* silently skipped, counted as success */
     370            0 :         pg_log_error("could not open file \"%s\": %m", fname);
     371            0 :         return -1;
     372              :     }
     373              : 
     374              :     /*
     375              :      * We do what pg_flush_data() would do in the backend: prefer to use
     376              :      * sync_file_range, but fall back to posix_fadvise.  We ignore errors
     377              :      * because this is only a hint.
     378              :      */
     379              : #if defined(HAVE_SYNC_FILE_RANGE)
     380        16021 :     (void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
     381              : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
     382              :     (void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
     383              : #else
     384              : #error PG_FLUSH_DATA_WORKS should not have been defined
     385              : #endif
     386              : 
     387        16021 :     (void) close(fd);
     388              : #endif                          /* PG_FLUSH_DATA_WORKS */
     389        16021 :     return 0;
     390              : }
     391              : 
     392              : /*
     393              :  * fsync_fname -- Try to fsync a file or directory
     394              :  *
     395              :  * Ignores errors trying to open unreadable files, or trying to fsync
     396              :  * directories on systems where that isn't allowed/required (returning 0 in both cases).  Any other open() error
     397              :  * is logged and reported by returning -1.  Any other fsync() error is fatal: it is logged and the program exits.
     398              :  */
     399              : int
     400        16101 : fsync_fname(const char *fname, bool isdir)
     401              : {
     402              :     int         fd;
     403              :     int         flags;
     404              :     int         returncode;
     405              : 
     406              :     /*
     407              :      * Some OSs require directories to be opened read-only whereas other
     408              :      * systems don't allow us to fsync files opened read-only; so we need both
     409              :      * cases here.  Using O_RDWR will cause us to fail to fsync files that are
     410              :      * not writable by our userid, but we assume that's OK.
     411              :      */
     412        16101 :     flags = PG_BINARY;
     413        16101 :     if (!isdir)
     414        15574 :         flags |= O_RDWR;
     415              :     else
     416          527 :         flags |= O_RDONLY;
     417              : 
     418              :     /*
     419              :      * Open the file, silently ignoring errors about unreadable files (or
     420              :      * unsupported operations, e.g. opening a directory under Windows), and
     421              :      * logging others.
     422              :      */
     423        16101 :     fd = open(fname, flags, 0);
     424        16101 :     if (fd < 0)
     425              :     {
     426            0 :         if (errno == EACCES || (isdir && errno == EISDIR))
     427            0 :             return 0;           /* silently skipped, counted as success */
     428            0 :         pg_log_error("could not open file \"%s\": %m", fname);
     429            0 :         return -1;              /* non-fatal: the caller decides what to do */
     430              :     }
     431              : 
     432        16101 :     returncode = fsync(fd);
     433              : 
     434              :     /*
     435              :      * Some OSes don't allow us to fsync directories at all, so we can ignore
     436              :      * those errors. Anything else needs to be reported.
     437              :      */
     438        16101 :     if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
     439              :     {
     440            0 :         pg_log_error("could not fsync file \"%s\": %m", fname);
     441            0 :         (void) close(fd);
     442            0 :         exit(EXIT_FAILURE);
     443              :     }
     444              : 
     445        16101 :     (void) close(fd);
     446        16101 :     return 0;
     447              : }
     448              : 
     449              : /*
     450              :  * fsync_parent_path -- fsync the parent path of a file or directory
     451              :  *
     452              :  * This is aimed at making file operations persistent on disk in case of
     453              :  * an OS crash or power failure.  Returns 0 on success and -1 if fsync_fname() on the parent directory returned non-zero.
     454              :  */
     455              : int
     456           20 : fsync_parent_path(const char *fname)
     457              : {
     458              :     char        parentpath[MAXPGPATH];
     459              : 
     460           20 :     strlcpy(parentpath, fname, MAXPGPATH);  /* work on a local copy; get_parent_directory() is applied to this buffer */
     461           20 :     get_parent_directory(parentpath);
     462              : 
     463              :     /*
     464              :      * get_parent_directory() returns an empty string if the input argument is
     465              :      * just a file name (see comments in path.c), so handle that as being the
     466              :      * current directory.
     467              :      */
     468           20 :     if (strlen(parentpath) == 0)
     469            0 :         strlcpy(parentpath, ".", MAXPGPATH);
     470              : 
     471           20 :     if (fsync_fname(parentpath, true) != 0)
     472            0 :         return -1;
     473              : 
     474           20 :     return 0;
     475              : }
     476              : 
     477              : /*
     478              :  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
     479              :  *
     480              :  * Wrapper around rename, similar to the backend version.  Returns 0 on success and -1 after logging if opening the target, the rename itself, or one of the fsync_fname()/fsync_parent_path() calls fails; a failed fsync() of an already existing target file makes the program exit.
     481              :  */
     482              : int
     483            4 : durable_rename(const char *oldfile, const char *newfile)
     484              : {
     485              :     int         fd;
     486              : 
     487              :     /*
     488              :      * First fsync the old and target path (if it exists), to ensure that they
     489              :      * are properly persistent on disk. Syncing the target file is not
     490              :      * strictly necessary, but it makes it easier to reason about crashes;
     491              :      * because it's then guaranteed that either source or target file exists
     492              :      * after a crash.
     493              :      */
     494            4 :     if (fsync_fname(oldfile, false) != 0)
     495            0 :         return -1;
     496              : 
     497            4 :     fd = open(newfile, PG_BINARY | O_RDWR, 0);
     498            4 :     if (fd < 0)
     499              :     {
     500            4 :         if (errno != ENOENT)    /* a missing target is expected and simply skipped */
     501              :         {
     502            0 :             pg_log_error("could not open file \"%s\": %m", newfile);
     503            0 :             return -1;
     504              :         }
     505              :     }
     506              :     else
     507              :     {
     508            0 :         if (fsync(fd) != 0)
     509              :         {
     510            0 :             pg_log_error("could not fsync file \"%s\": %m", newfile);
     511            0 :             close(fd);
     512            0 :             exit(EXIT_FAILURE);
     513              :         }
     514            0 :         close(fd);
     515              :     }
     516              : 
     517              :     /* Time to do the real deal... */
     518            4 :     if (rename(oldfile, newfile) != 0)
     519              :     {
     520            0 :         pg_log_error("could not rename file \"%s\" to \"%s\": %m",
     521              :                      oldfile, newfile);
     522            0 :         return -1;
     523              :     }
     524              : 
     525              :     /*
     526              :      * To guarantee renaming the file is persistent, fsync the file with its
     527              :      * new name, and its containing directory.
     528              :      */
     529            4 :     if (fsync_fname(newfile, false) != 0)
     530            0 :         return -1;
     531              : 
     532            4 :     if (fsync_parent_path(newfile) != 0)
     533            0 :         return -1;
     534              : 
     535            4 :     return 0;
     536              : }
     537              : 
     538              : #endif                          /* FRONTEND */
     539              : 
     540              : /*
     541              :  * Return the type of a directory entry.
     542              :  *
     543              :  * In frontend code, elevel should be a level from logging.h; in backend code
     544              :  * it should be a level from elog.h.  The type is taken from de->d_type when available and conclusive; otherwise 'path' is examined with stat() (if look_through_symlinks) or lstat().  If that call fails, the failure is reported at elevel and the result is PGFILETYPE_ERROR; anything that is not a regular file, directory or symlink yields PGFILETYPE_UNKNOWN.
     545              :  */
     546              : PGFileType
     547       263169 : get_dirent_type(const char *path,
     548              :                 const struct dirent *de,
     549              :                 bool look_through_symlinks,
     550              :                 int elevel)
     551              : {
     552              :     PGFileType  result;
     553              : 
     554              :     /*
     555              :      * Some systems tell us the type directly in the dirent struct, but that's
     556              :      * a BSD and Linux extension not required by POSIX.  Even when the
     557              :      * interface is present, sometimes the type is unknown, depending on the
     558              :      * filesystem.
     559              :      */
     560              : #if defined(DT_REG) && defined(DT_DIR) && defined(DT_LNK)
     561       263169 :     if (de->d_type == DT_REG)
     562       258655 :         result = PGFILETYPE_REG;
     563         4514 :     else if (de->d_type == DT_DIR)
     564         4478 :         result = PGFILETYPE_DIR;
     565           36 :     else if (de->d_type == DT_LNK && !look_through_symlinks)
     566           34 :         result = PGFILETYPE_LNK;
     567              :     else
     568            2 :         result = PGFILETYPE_UNKNOWN;    /* also reached for a symlink when look_through_symlinks is true, so the stat() below resolves it */
     569              : #else
     570              :     result = PGFILETYPE_UNKNOWN;
     571              : #endif
     572              : 
     573       263169 :     if (result == PGFILETYPE_UNKNOWN)
     574              :     {
     575              :         struct stat fst;
     576              :         int         sret;
     577              : 
     578              : 
     579            2 :         if (look_through_symlinks)
     580            2 :             sret = stat(path, &fst);
     581              :         else
     582            0 :             sret = lstat(path, &fst);
     583              : 
     584            2 :         if (sret < 0)
     585              :         {
     586            0 :             result = PGFILETYPE_ERROR;
     587              : #ifdef FRONTEND
     588            0 :             pg_log_generic(elevel, PG_LOG_PRIMARY, "could not stat file \"%s\": %m", path);
     589              : #else
     590            0 :             ereport(elevel,
     591              :                     (errcode_for_file_access(),
     592              :                      errmsg("could not stat file \"%s\": %m", path)));
     593              : #endif
     594              :         }
     595            2 :         else if (S_ISREG(fst.st_mode))
     596            0 :             result = PGFILETYPE_REG;
     597            2 :         else if (S_ISDIR(fst.st_mode))
     598            2 :             result = PGFILETYPE_DIR;
     599            0 :         else if (S_ISLNK(fst.st_mode))
     600            0 :             result = PGFILETYPE_LNK;
     601              :     }                           /* if no branch matched, result is still PGFILETYPE_UNKNOWN */
     602              : 
     603       263169 :     return result;
     604              : }
     605              : 
     606              : /*
     607              :  * Compute what remains to be done after a possibly partial vectored read or
     608              :  * write.  The part of 'source' beginning after 'transferred' bytes is copied
     609              :  * to 'destination', and its length is returned.  'source' and 'destination'
     610              :  * may point to the same array, for in-place adjustment.  A return value of
     611              :  * zero indicates completion (for callers without a cheaper way to know that).
     612              :  */
     613              : int
     614       237907 : compute_remaining_iovec(struct iovec *destination,
     615              :                         const struct iovec *source,
     616              :                         int iovcnt,
     617              :                         size_t transferred)
     618              : {
     619              :     Assert(iovcnt > 0);
     620              : 
     621              :     /* Skip wholly transferred iovecs. */
     622      2904290 :     while (source->iov_len <= transferred)
     623              :     {
     624      2904290 :         transferred -= source->iov_len;
     625      2904290 :         source++;
     626      2904290 :         iovcnt--;
     627              : 
     628              :         /* All iovecs transferred? */
     629      2904290 :         if (iovcnt == 0)
     630              :         {
     631              :             /*
     632              :              * We don't expect the kernel to transfer more than we asked it
     633              :              * to, or something is out of sync.
     634              :              */
     635              :             Assert(transferred == 0);
     636       237907 :             return 0;
     637              :         }
     638              :     }
     639              : 
     640              :     /* Copy the remaining iovecs to the front of the array. */
     641            0 :     if (source != destination)
     642            0 :         memmove(destination, source, sizeof(*source) * iovcnt);
     643              : 
     644              :     /* Adjust leading iovec, which may have been partially transferred. */
     645              :     Assert(destination->iov_len > transferred);
     646            0 :     destination->iov_base = (char *) destination->iov_base + transferred;
     647            0 :     destination->iov_len -= transferred;
     648              : 
     649            0 :     return iovcnt;
     650              : }
     651              : 
     652              : /*
     653              :  * pg_pwritev_with_retry
     654              :  *
     655              :  * Convenience wrapper for pg_pwritev() that retries on partial write.  If an
     656              :  * error is returned, it is unspecified how much has been written.
     657              :  */
     658              : ssize_t
     659       237907 : pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, pgoff_t offset)
     660              : {
     661              :     struct iovec iov_copy[PG_IOV_MAX];
     662       237907 :     ssize_t     sum = 0;
     663              :     ssize_t     part;
     664              : 
     665              :     /* We'd better have space to make a copy, in case we need to retry. */
     666       237907 :     if (iovcnt > PG_IOV_MAX)
     667              :     {
     668            0 :         errno = EINVAL;
     669            0 :         return -1;
     670              :     }
     671              : 
     672              :     do
     673              :     {
     674              :         /* Write as much as we can. */
     675       237907 :         part = pg_pwritev(fd, iov, iovcnt, offset);
     676       237907 :         if (part < 0)
     677            0 :             return -1;
     678              : 
     679              : #ifdef SIMULATE_SHORT_WRITE
     680              :         part = Min(part, 4096);
     681              : #endif
     682              : 
     683              :         /* Count our progress. */
     684       237907 :         sum += part;
     685       237907 :         offset += part;
     686              : 
     687              :         /*
     688              :          * See what is left.  On the first loop we used the caller's array,
     689              :          * but in later loops we'll use our local copy that we are allowed to
     690              :          * mutate.
     691              :          */
     692       237907 :         iovcnt = compute_remaining_iovec(iov_copy, iov, iovcnt, part);
     693       237907 :         iov = iov_copy;
     694       237907 :     } while (iovcnt > 0);
     695              : 
     696       237907 :     return sum;
     697              : }
     698              : 
     699              : /*
     700              :  * pg_pwrite_zeros
     701              :  *
     702              :  * Writes zeros to file worth "size" bytes at "offset" (from the start of the
     703              :  * file), using vectored I/O.
     704              :  *
     705              :  * Returns the total amount of data written.  On failure, a negative value
     706              :  * is returned with errno set.
     707              :  */
     708              : ssize_t
     709       218482 : pg_pwrite_zeros(int fd, size_t size, pgoff_t offset)
     710              : {
     711              :     static const PGIOAlignedBlock zbuffer = {0};    /* worth BLCKSZ */
     712       218482 :     void       *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data;
     713              :     struct iovec iov[PG_IOV_MAX];
     714       218482 :     size_t      remaining_size = size;
     715       218482 :     ssize_t     total_written = 0;
     716              : 
     717              :     /* Loop, writing as many blocks as we can for each system call. */
     718       456389 :     while (remaining_size > 0)
     719              :     {
     720       237907 :         int         iovcnt = 0;
     721              :         ssize_t     written;
     722              : 
     723      3142197 :         for (; iovcnt < PG_IOV_MAX && remaining_size > 0; iovcnt++)
     724              :         {
     725              :             size_t      this_iov_size;
     726              : 
     727      2904290 :             iov[iovcnt].iov_base = zerobuf_addr;
     728              : 
     729      2904290 :             if (remaining_size < BLCKSZ)
     730            0 :                 this_iov_size = remaining_size;
     731              :             else
     732      2904290 :                 this_iov_size = BLCKSZ;
     733              : 
     734      2904290 :             iov[iovcnt].iov_len = this_iov_size;
     735      2904290 :             remaining_size -= this_iov_size;
     736              :         }
     737              : 
     738       237907 :         written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
     739              : 
     740       237907 :         if (written < 0)
     741            0 :             return written;
     742              : 
     743       237907 :         offset += written;
     744       237907 :         total_written += written;
     745              :     }
     746              : 
     747              :     Assert(total_written == size);
     748              : 
     749       218482 :     return total_written;
     750              : }
        

Generated by: LCOV version 2.0-1