Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * File-processing utility routines.
4 : *
5 : * Assorted utility functions to work on files.
6 : *
7 : *
8 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
9 : * Portions Copyright (c) 1994, Regents of the University of California
10 : *
11 : * src/common/file_utils.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #ifndef FRONTEND
17 : #include "postgres.h"
18 : #else
19 : #include "postgres_fe.h"
20 : #endif
21 :
22 : #include <dirent.h>
23 : #include <fcntl.h>
24 : #include <sys/stat.h>
25 : #include <unistd.h>
26 :
27 : #include "common/file_utils.h"
28 : #ifdef FRONTEND
29 : #include "common/logging.h"
30 : #endif
31 : #include "common/relpath.h"
32 : #include "port/pg_iovec.h"
33 :
34 : #ifdef FRONTEND
35 :
/*
 * PG_FLUSH_DATA_WORKS is defined when this build has some way to hint the
 * kernel about upcoming fsyncs: either sync_file_range(), or posix_fadvise()
 * with POSIX_FADV_DONTNEED.
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif

/*
 * pg_xlog was renamed to pg_wal in version 10; server versions below this
 * number are treated as still using pg_xlog.
 */
#define MINIMUM_VERSION_FOR_PG_WAL 100000

/* Recursive directory walker, defined further down in this file. */
static void walkdir(const char *path,
					int (*action) (const char *fname, bool isdir),
					bool process_symlinks,
					const char *exclude_dir);
52 :
53 : #ifdef HAVE_SYNCFS
54 :
/*
 * do_syncfs -- flush the file system that contains "path" using syncfs()
 *
 * If "path" cannot be opened, the error is logged and we just return.  If
 * syncfs() itself fails, the error is logged and the program exits with
 * EXIT_FAILURE.
 */
static void
do_syncfs(const char *path)
{
	int			sync_fd = open(path, O_RDONLY, 0);

	if (sync_fd < 0)
	{
		pg_log_error("could not open file \"%s\": %m", path);
		return;
	}

	/* Common case: the sync worked, so just release the descriptor. */
	if (syncfs(sync_fd) >= 0)
	{
		(void) close(sync_fd);
		return;
	}

	/* Log before close(), so that %m still reflects the syncfs() failure. */
	pg_log_error("could not synchronize file system for file \"%s\": %m", path);
	(void) close(sync_fd);
	exit(EXIT_FAILURE);
}
82 :
83 : #endif /* HAVE_SYNCFS */
84 :
85 : /*
86 : * Synchronize PGDATA and all its contents.
87 : *
88 : * We sync regular files and directories wherever they are, but we follow
89 : * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc.
90 : * Other symlinks are presumed to point at files we're not responsible for
91 : * syncing, and might not have privileges to write at all.
92 : *
93 : * serverVersion indicates the version of the server to be sync'd.
94 : *
95 : * If sync_data_files is false, this function skips syncing "base/" and any
96 : * other tablespace directories.
97 : */
98 : void
99 32 : sync_pgdata(const char *pg_data,
100 : int serverVersion,
101 : DataDirSyncMethod sync_method,
102 : bool sync_data_files)
103 : {
104 : bool xlog_is_symlink;
105 : char pg_wal[MAXPGPATH];
106 : char pg_tblspc[MAXPGPATH];
107 :
108 : /* handle renaming of pg_xlog to pg_wal in post-10 clusters */
109 32 : snprintf(pg_wal, MAXPGPATH, "%s/%s", pg_data,
110 : serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal");
111 32 : snprintf(pg_tblspc, MAXPGPATH, "%s/%s", pg_data, PG_TBLSPC_DIR);
112 :
113 : /*
114 : * If pg_wal is a symlink, we'll need to recurse into it separately,
115 : * because the first walkdir below will ignore it.
116 : */
117 32 : xlog_is_symlink = false;
118 :
119 : {
120 : struct stat st;
121 :
122 32 : if (lstat(pg_wal, &st) < 0)
123 0 : pg_log_error("could not stat file \"%s\": %m", pg_wal);
124 32 : else if (S_ISLNK(st.st_mode))
125 6 : xlog_is_symlink = true;
126 : }
127 :
128 32 : switch (sync_method)
129 : {
130 2 : case DATA_DIR_SYNC_METHOD_SYNCFS:
131 : {
132 : #ifndef HAVE_SYNCFS
133 : pg_log_error("this build does not support sync method \"%s\"",
134 : "syncfs");
135 : exit(EXIT_FAILURE);
136 : #else
137 : DIR *dir;
138 : struct dirent *de;
139 :
140 : /*
141 : * On Linux, we don't have to open every single file one by
142 : * one. We can use syncfs() to sync whole filesystems. We
143 : * only expect filesystem boundaries to exist where we
144 : * tolerate symlinks, namely pg_wal and the tablespaces, so we
145 : * call syncfs() for each of those directories.
146 : */
147 :
148 : /* Sync the top level pgdata directory. */
149 2 : do_syncfs(pg_data);
150 :
151 : /* If any tablespaces are configured, sync each of those. */
152 2 : if (sync_data_files)
153 : {
154 2 : dir = opendir(pg_tblspc);
155 2 : if (dir == NULL)
156 0 : pg_log_error("could not open directory \"%s\": %m",
157 : pg_tblspc);
158 : else
159 : {
160 6 : while (errno = 0, (de = readdir(dir)) != NULL)
161 : {
162 : char subpath[MAXPGPATH * 2];
163 :
164 4 : if (strcmp(de->d_name, ".") == 0 ||
165 2 : strcmp(de->d_name, "..") == 0)
166 4 : continue;
167 :
168 0 : snprintf(subpath, sizeof(subpath), "%s/%s",
169 0 : pg_tblspc, de->d_name);
170 0 : do_syncfs(subpath);
171 : }
172 :
173 2 : if (errno)
174 0 : pg_log_error("could not read directory \"%s\": %m",
175 : pg_tblspc);
176 :
177 2 : (void) closedir(dir);
178 : }
179 : }
180 :
181 : /* If pg_wal is a symlink, process that too. */
182 2 : if (xlog_is_symlink)
183 2 : do_syncfs(pg_wal);
184 : #endif /* HAVE_SYNCFS */
185 : }
186 2 : break;
187 :
188 30 : case DATA_DIR_SYNC_METHOD_FSYNC:
189 : {
190 30 : char *exclude_dir = NULL;
191 :
192 30 : if (!sync_data_files)
193 2 : exclude_dir = psprintf("%s/base", pg_data);
194 :
195 : /*
196 : * If possible, hint to the kernel that we're soon going to
197 : * fsync the data directory and its contents.
198 : */
199 : #ifdef PG_FLUSH_DATA_WORKS
200 30 : walkdir(pg_data, pre_sync_fname, false, exclude_dir);
201 30 : if (xlog_is_symlink)
202 4 : walkdir(pg_wal, pre_sync_fname, false, NULL);
203 30 : if (sync_data_files)
204 28 : walkdir(pg_tblspc, pre_sync_fname, true, NULL);
205 : #endif
206 :
207 : /*
208 : * Now we do the fsync()s in the same order.
209 : *
210 : * The main call ignores symlinks, so in addition to specially
211 : * processing pg_wal if it's a symlink, pg_tblspc has to be
212 : * visited separately with process_symlinks = true. Note that
213 : * if there are any plain directories in pg_tblspc, they'll
214 : * get fsync'd twice. That's not an expected case so we don't
215 : * worry about optimizing it.
216 : */
217 30 : walkdir(pg_data, fsync_fname, false, exclude_dir);
218 30 : if (xlog_is_symlink)
219 4 : walkdir(pg_wal, fsync_fname, false, NULL);
220 30 : if (sync_data_files)
221 28 : walkdir(pg_tblspc, fsync_fname, true, NULL);
222 :
223 30 : if (exclude_dir)
224 2 : pfree(exclude_dir);
225 : }
226 30 : break;
227 : }
228 32 : }
229 :
230 : /*
231 : * Synchronize the given directory and all its contents.
232 : *
233 : * This is a convenient wrapper on top of walkdir() and do_syncfs().
234 : */
235 : void
236 8 : sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method)
237 : {
238 8 : switch (sync_method)
239 : {
240 0 : case DATA_DIR_SYNC_METHOD_SYNCFS:
241 : {
242 : #ifndef HAVE_SYNCFS
243 : pg_log_error("this build does not support sync method \"%s\"",
244 : "syncfs");
245 : exit(EXIT_FAILURE);
246 : #else
247 : /*
248 : * On Linux, we don't have to open every single file one by
249 : * one. We can use syncfs() to sync the whole filesystem.
250 : */
251 0 : do_syncfs(dir);
252 : #endif /* HAVE_SYNCFS */
253 : }
254 0 : break;
255 :
256 8 : case DATA_DIR_SYNC_METHOD_FSYNC:
257 : {
258 : /*
259 : * If possible, hint to the kernel that we're soon going to
260 : * fsync the data directory and its contents.
261 : */
262 : #ifdef PG_FLUSH_DATA_WORKS
263 8 : walkdir(dir, pre_sync_fname, false, NULL);
264 : #endif
265 :
266 8 : walkdir(dir, fsync_fname, false, NULL);
267 : }
268 8 : break;
269 : }
270 8 : }
271 :
272 : /*
273 : * walkdir: recursively walk a directory, applying the action to each
274 : * regular file and directory (including the named directory itself).
275 : *
276 : * If process_symlinks is true, the action and recursion are also applied
277 : * to regular files and directories that are pointed to by symlinks in the
278 : * given directory; otherwise symlinks are ignored. Symlinks are always
279 : * ignored in subdirectories, ie we intentionally don't pass down the
280 : * process_symlinks flag to recursive calls.
281 : *
282 : * If exclude_dir is not NULL, it specifies a directory path to skip
283 : * processing.
284 : *
285 : * Errors are reported but not considered fatal.
286 : *
287 : * See also walkdir in fd.c, which is a backend version of this logic.
288 : */
289 : static void
290 1704 : walkdir(const char *path,
291 : int (*action) (const char *fname, bool isdir),
292 : bool process_symlinks,
293 : const char *exclude_dir)
294 : {
295 : DIR *dir;
296 : struct dirent *de;
297 :
298 1704 : if (exclude_dir && strcmp(exclude_dir, path) == 0)
299 4 : return;
300 :
301 1700 : dir = opendir(path);
302 1700 : if (dir == NULL)
303 : {
304 0 : pg_log_error("could not open directory \"%s\": %m", path);
305 0 : return;
306 : }
307 :
308 64688 : while (errno = 0, (de = readdir(dir)) != NULL)
309 : {
310 : char subpath[MAXPGPATH * 2];
311 :
312 62988 : if (strcmp(de->d_name, ".") == 0 ||
313 61288 : strcmp(de->d_name, "..") == 0)
314 3400 : continue;
315 :
316 59588 : snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
317 :
318 59588 : switch (get_dirent_type(subpath, de, process_symlinks, PG_LOG_ERROR))
319 : {
320 58012 : case PGFILETYPE_REG:
321 58012 : (*action) (subpath, false);
322 58012 : break;
323 1564 : case PGFILETYPE_DIR:
324 1564 : walkdir(subpath, action, false, exclude_dir);
325 1564 : break;
326 12 : default:
327 :
328 : /*
329 : * Errors are already reported directly by get_dirent_type(),
330 : * and any remaining symlinks and unknown file types are
331 : * ignored.
332 : */
333 12 : break;
334 : }
335 : }
336 :
337 1700 : if (errno)
338 0 : pg_log_error("could not read directory \"%s\": %m", path);
339 :
340 1700 : (void) closedir(dir);
341 :
342 : /*
343 : * It's important to fsync the destination directory itself as individual
344 : * file fsyncs don't guarantee that the directory entry for the file is
345 : * synced. Recent versions of ext4 have made the window much wider but
346 : * it's been an issue for ext3 and other filesystems in the past.
347 : */
348 1700 : (*action) (path, true);
349 : }
350 :
351 : /*
352 : * Hint to the OS that it should get ready to fsync() this file, if supported
353 : * by the platform.
354 : *
355 : * Ignores errors trying to open unreadable files, and reports other errors
356 : * non-fatally.
357 : */
358 : int
359 29856 : pre_sync_fname(const char *fname, bool isdir)
360 : {
361 : #ifdef PG_FLUSH_DATA_WORKS
362 : int fd;
363 :
364 29856 : fd = open(fname, O_RDONLY | PG_BINARY, 0);
365 :
366 29856 : if (fd < 0)
367 : {
368 0 : if (errno == EACCES || (isdir && errno == EISDIR))
369 0 : return 0;
370 0 : pg_log_error("could not open file \"%s\": %m", fname);
371 0 : return -1;
372 : }
373 :
374 : /*
375 : * We do what pg_flush_data() would do in the backend: prefer to use
376 : * sync_file_range, but fall back to posix_fadvise. We ignore errors
377 : * because this is only a hint.
378 : */
379 : #if defined(HAVE_SYNC_FILE_RANGE)
380 29856 : (void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
381 : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
382 : (void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
383 : #else
384 : #error PG_FLUSH_DATA_WORKS should not have been defined
385 : #endif
386 :
387 29856 : (void) close(fd);
388 : #endif /* PG_FLUSH_DATA_WORKS */
389 29856 : return 0;
390 : }
391 :
392 : /*
393 : * fsync_fname -- Try to fsync a file or directory
394 : *
395 : * Ignores errors trying to open unreadable files, or trying to fsync
396 : * directories on systems where that isn't allowed/required. All other errors
397 : * are fatal.
398 : */
399 : int
400 29954 : fsync_fname(const char *fname, bool isdir)
401 : {
402 : int fd;
403 : int flags;
404 : int returncode;
405 :
406 : /*
407 : * Some OSs require directories to be opened read-only whereas other
408 : * systems don't allow us to fsync files opened read-only; so we need both
409 : * cases here. Using O_RDWR will cause us to fail to fsync files that are
410 : * not writable by our userid, but we assume that's OK.
411 : */
412 29954 : flags = PG_BINARY;
413 29954 : if (!isdir)
414 29056 : flags |= O_RDWR;
415 : else
416 898 : flags |= O_RDONLY;
417 :
418 : /*
419 : * Open the file, silently ignoring errors about unreadable files (or
420 : * unsupported operations, e.g. opening a directory under Windows), and
421 : * logging others.
422 : */
423 29954 : fd = open(fname, flags, 0);
424 29954 : if (fd < 0)
425 : {
426 0 : if (errno == EACCES || (isdir && errno == EISDIR))
427 0 : return 0;
428 0 : pg_log_error("could not open file \"%s\": %m", fname);
429 0 : return -1;
430 : }
431 :
432 29954 : returncode = fsync(fd);
433 :
434 : /*
435 : * Some OSes don't allow us to fsync directories at all, so we can ignore
436 : * those errors. Anything else needs to be reported.
437 : */
438 29954 : if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
439 : {
440 0 : pg_log_error("could not fsync file \"%s\": %m", fname);
441 0 : (void) close(fd);
442 0 : exit(EXIT_FAILURE);
443 : }
444 :
445 29954 : (void) close(fd);
446 29954 : return 0;
447 : }
448 :
449 : /*
450 : * fsync_parent_path -- fsync the parent path of a file or directory
451 : *
452 : * This is aimed at making file operations persistent on disk in case of
453 : * an OS crash or power failure.
454 : */
455 : int
456 34 : fsync_parent_path(const char *fname)
457 : {
458 : char parentpath[MAXPGPATH];
459 :
460 34 : strlcpy(parentpath, fname, MAXPGPATH);
461 34 : get_parent_directory(parentpath);
462 :
463 : /*
464 : * get_parent_directory() returns an empty string if the input argument is
465 : * just a file name (see comments in path.c), so handle that as being the
466 : * current directory.
467 : */
468 34 : if (strlen(parentpath) == 0)
469 0 : strlcpy(parentpath, ".", MAXPGPATH);
470 :
471 34 : if (fsync_fname(parentpath, true) != 0)
472 0 : return -1;
473 :
474 34 : return 0;
475 : }
476 :
477 : /*
478 : * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
479 : *
480 : * Wrapper around rename, similar to the backend version.
481 : */
482 : int
483 6 : durable_rename(const char *oldfile, const char *newfile)
484 : {
485 : int fd;
486 :
487 : /*
488 : * First fsync the old and target path (if it exists), to ensure that they
489 : * are properly persistent on disk. Syncing the target file is not
490 : * strictly necessary, but it makes it easier to reason about crashes;
491 : * because it's then guaranteed that either source or target file exists
492 : * after a crash.
493 : */
494 6 : if (fsync_fname(oldfile, false) != 0)
495 0 : return -1;
496 :
497 6 : fd = open(newfile, PG_BINARY | O_RDWR, 0);
498 6 : if (fd < 0)
499 : {
500 6 : if (errno != ENOENT)
501 : {
502 0 : pg_log_error("could not open file \"%s\": %m", newfile);
503 0 : return -1;
504 : }
505 : }
506 : else
507 : {
508 0 : if (fsync(fd) != 0)
509 : {
510 0 : pg_log_error("could not fsync file \"%s\": %m", newfile);
511 0 : close(fd);
512 0 : exit(EXIT_FAILURE);
513 : }
514 0 : close(fd);
515 : }
516 :
517 : /* Time to do the real deal... */
518 6 : if (rename(oldfile, newfile) != 0)
519 : {
520 0 : pg_log_error("could not rename file \"%s\" to \"%s\": %m",
521 : oldfile, newfile);
522 0 : return -1;
523 : }
524 :
525 : /*
526 : * To guarantee renaming the file is persistent, fsync the file with its
527 : * new name, and its containing directory.
528 : */
529 6 : if (fsync_fname(newfile, false) != 0)
530 0 : return -1;
531 :
532 6 : if (fsync_parent_path(newfile) != 0)
533 0 : return -1;
534 :
535 6 : return 0;
536 : }
537 :
538 : #endif /* FRONTEND */
539 :
540 : /*
541 : * Return the type of a directory entry.
542 : *
543 : * In frontend code, elevel should be a level from logging.h; in backend code
544 : * it should be a level from elog.h.
545 : */
546 : PGFileType
547 470614 : get_dirent_type(const char *path,
548 : const struct dirent *de,
549 : bool look_through_symlinks,
550 : int elevel)
551 : {
552 : PGFileType result;
553 :
554 : /*
555 : * Some systems tell us the type directly in the dirent struct, but that's
556 : * a BSD and Linux extension not required by POSIX. Even when the
557 : * interface is present, sometimes the type is unknown, depending on the
558 : * filesystem.
559 : */
560 : #if defined(DT_REG) && defined(DT_DIR) && defined(DT_LNK)
561 470614 : if (de->d_type == DT_REG)
562 462426 : result = PGFILETYPE_REG;
563 8188 : else if (de->d_type == DT_DIR)
564 8118 : result = PGFILETYPE_DIR;
565 70 : else if (de->d_type == DT_LNK && !look_through_symlinks)
566 66 : result = PGFILETYPE_LNK;
567 : else
568 4 : result = PGFILETYPE_UNKNOWN;
569 : #else
570 : result = PGFILETYPE_UNKNOWN;
571 : #endif
572 :
573 470614 : if (result == PGFILETYPE_UNKNOWN)
574 : {
575 : struct stat fst;
576 : int sret;
577 :
578 :
579 4 : if (look_through_symlinks)
580 4 : sret = stat(path, &fst);
581 : else
582 0 : sret = lstat(path, &fst);
583 :
584 4 : if (sret < 0)
585 : {
586 0 : result = PGFILETYPE_ERROR;
587 : #ifdef FRONTEND
588 0 : pg_log_generic(elevel, PG_LOG_PRIMARY, "could not stat file \"%s\": %m", path);
589 : #else
590 0 : ereport(elevel,
591 : (errcode_for_file_access(),
592 : errmsg("could not stat file \"%s\": %m", path)));
593 : #endif
594 : }
595 4 : else if (S_ISREG(fst.st_mode))
596 0 : result = PGFILETYPE_REG;
597 4 : else if (S_ISDIR(fst.st_mode))
598 4 : result = PGFILETYPE_DIR;
599 0 : else if (S_ISLNK(fst.st_mode))
600 0 : result = PGFILETYPE_LNK;
601 : }
602 :
603 470614 : return result;
604 : }
605 :
606 : /*
607 : * Compute what remains to be done after a possibly partial vectored read or
608 : * write. The part of 'source' beginning after 'transferred' bytes is copied
609 : * to 'destination', and its length is returned. 'source' and 'destination'
610 : * may point to the same array, for in-place adjustment. A return value of
611 : * zero indicates completion (for callers without a cheaper way to know that).
612 : */
613 : int
614 449690 : compute_remaining_iovec(struct iovec *destination,
615 : const struct iovec *source,
616 : int iovcnt,
617 : size_t transferred)
618 : {
619 : Assert(iovcnt > 0);
620 :
621 : /* Skip wholly transferred iovecs. */
622 5674764 : while (source->iov_len <= transferred)
623 : {
624 5674764 : transferred -= source->iov_len;
625 5674764 : source++;
626 5674764 : iovcnt--;
627 :
628 : /* All iovecs transferred? */
629 5674764 : if (iovcnt == 0)
630 : {
631 : /*
632 : * We don't expect the kernel to transfer more than we asked it
633 : * to, or something is out of sync.
634 : */
635 : Assert(transferred == 0);
636 449690 : return 0;
637 : }
638 : }
639 :
640 : /* Copy the remaining iovecs to the front of the array. */
641 0 : if (source != destination)
642 0 : memmove(destination, source, sizeof(*source) * iovcnt);
643 :
644 : /* Adjust leading iovec, which may have been partially transferred. */
645 : Assert(destination->iov_len > transferred);
646 0 : destination->iov_base = (char *) destination->iov_base + transferred;
647 0 : destination->iov_len -= transferred;
648 :
649 0 : return iovcnt;
650 : }
651 :
652 : /*
653 : * pg_pwritev_with_retry
654 : *
655 : * Convenience wrapper for pg_pwritev() that retries on partial write. If an
656 : * error is returned, it is unspecified how much has been written.
657 : */
658 : ssize_t
659 449690 : pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
660 : {
661 : struct iovec iov_copy[PG_IOV_MAX];
662 449690 : ssize_t sum = 0;
663 : ssize_t part;
664 :
665 : /* We'd better have space to make a copy, in case we need to retry. */
666 449690 : if (iovcnt > PG_IOV_MAX)
667 : {
668 0 : errno = EINVAL;
669 0 : return -1;
670 : }
671 :
672 : do
673 : {
674 : /* Write as much as we can. */
675 449690 : part = pg_pwritev(fd, iov, iovcnt, offset);
676 449690 : if (part < 0)
677 0 : return -1;
678 :
679 : #ifdef SIMULATE_SHORT_WRITE
680 : part = Min(part, 4096);
681 : #endif
682 :
683 : /* Count our progress. */
684 449690 : sum += part;
685 449690 : offset += part;
686 :
687 : /*
688 : * See what is left. On the first loop we used the caller's array,
689 : * but in later loops we'll use our local copy that we are allowed to
690 : * mutate.
691 : */
692 449690 : iovcnt = compute_remaining_iovec(iov_copy, iov, iovcnt, part);
693 449690 : iov = iov_copy;
694 449690 : } while (iovcnt > 0);
695 :
696 449690 : return sum;
697 : }
698 :
699 : /*
700 : * pg_pwrite_zeros
701 : *
702 : * Writes zeros to file worth "size" bytes at "offset" (from the start of the
703 : * file), using vectored I/O.
704 : *
705 : * Returns the total amount of data written. On failure, a negative value
706 : * is returned with errno set.
707 : */
708 : ssize_t
709 411530 : pg_pwrite_zeros(int fd, size_t size, off_t offset)
710 : {
711 : static const PGIOAlignedBlock zbuffer = {0}; /* worth BLCKSZ */
712 411530 : void *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data;
713 : struct iovec iov[PG_IOV_MAX];
714 411530 : size_t remaining_size = size;
715 411530 : ssize_t total_written = 0;
716 :
717 : /* Loop, writing as many blocks as we can for each system call. */
718 861220 : while (remaining_size > 0)
719 : {
720 449690 : int iovcnt = 0;
721 : ssize_t written;
722 :
723 6124454 : for (; iovcnt < PG_IOV_MAX && remaining_size > 0; iovcnt++)
724 : {
725 : size_t this_iov_size;
726 :
727 5674764 : iov[iovcnt].iov_base = zerobuf_addr;
728 :
729 5674764 : if (remaining_size < BLCKSZ)
730 0 : this_iov_size = remaining_size;
731 : else
732 5674764 : this_iov_size = BLCKSZ;
733 :
734 5674764 : iov[iovcnt].iov_len = this_iov_size;
735 5674764 : remaining_size -= this_iov_size;
736 : }
737 :
738 449690 : written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
739 :
740 449690 : if (written < 0)
741 0 : return written;
742 :
743 449690 : offset += written;
744 449690 : total_written += written;
745 : }
746 :
747 : Assert(total_written == size);
748 :
749 411530 : return total_written;
750 : }
|