LCOV - code coverage report
Current view: top level - src/backend/storage/file - fd.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 706 994 71.0 %
Date: 2025-07-04 04:18:14 Functions: 90 99 90.9 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * fd.c
       4             :  *    Virtual file descriptor code.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/storage/file/fd.c
      11             :  *
      12             :  * NOTES:
      13             :  *
      14             :  * This code manages a cache of 'virtual' file descriptors (VFDs).
      15             :  * The server opens many file descriptors for a variety of reasons,
      16             :  * including base tables, scratch files (e.g., sort and hash spool
      17             :  * files), and random calls to C library routines like system(3); it
      18             :  * is quite easy to exceed system limits on the number of open files a
      19             :  * single process can have.  (This is around 1024 on many modern
      20             :  * operating systems, but may be lower on others.)
      21             :  *
      22             :  * VFDs are managed as an LRU pool, with actual OS file descriptors
      23             :  * being opened and closed as needed.  Obviously, if a file is
      24             :  * opened using these interfaces, all subsequent operations must also
      25             :  * be through these interfaces (the File type is not a real file
      26             :  * descriptor).
      27             :  *
      28             :  * For this scheme to work, most (if not all) routines throughout the
      29             :  * server should use these interfaces instead of calling the C library
      30             :  * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
      31             :  * may find ourselves short of real file descriptors anyway.
      32             :  *
      33             :  * INTERFACE ROUTINES
      34             :  *
      35             :  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
      36             :  * A File opened with OpenTemporaryFile is automatically deleted when the
      37             :  * File is closed, either explicitly or implicitly at end of transaction or
      38             :  * process exit. PathNameOpenFile is intended for files that are held open
      39             :  * for a long time, like relation files. It is the caller's responsibility
      40             :  * to close them; there is no automatic mechanism in fd.c for that.
      41             :  *
      42             :  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
      43             :  * temporary files that have names so that they can be shared between
      44             :  * backends.  Such files are automatically closed and count against the
      45             :  * temporary file limit of the backend that creates them, but unlike anonymous
      46             :  * files they are not automatically deleted.  See sharedfileset.c for a shared
      47             :  * ownership mechanism that provides automatic cleanup for shared files when
      48             :  * the last of a group of backends detaches.
      49             :  *
      50             :  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
      51             :  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
      52             :  * They behave like the corresponding native functions, except that the handle
      53             :  * is registered with the current subtransaction, and will be automatically
      54             :  * closed at abort. These are intended mainly for short operations like
      55             :  * reading a configuration file; there is a limit on the number of files that
      56             :  * can be opened using these functions at any one time.
      57             :  *
      58             :  * Finally, BasicOpenFile is just a thin wrapper around open() that can
      59             :  * release file descriptors in use by the virtual file descriptors if
      60             :  * necessary. There is no automatic cleanup of file descriptors returned by
      61             :  * BasicOpenFile; it is solely the caller's responsibility to close the file
      62             :  * descriptor by calling close(2).
      63             :  *
      64             :  * If a non-virtual file descriptor needs to be held open for any length of
      65             :  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
      66             :  * (and eventually ReleaseExternalFD), so that we can take it into account
      67             :  * while deciding how many VFDs can be open.  This applies to FDs obtained
      68             :  * with BasicOpenFile as well as those obtained without use of any fd.c API.
      69             :  *
      70             :  *-------------------------------------------------------------------------
      71             :  */
      72             : 
      73             : #include "postgres.h"
      74             : 
      75             : #include <dirent.h>
      76             : #include <sys/file.h>
      77             : #include <sys/param.h>
      78             : #include <sys/resource.h>     /* for getrlimit */
      79             : #include <sys/stat.h>
      80             : #include <sys/types.h>
      81             : #ifndef WIN32
      82             : #include <sys/mman.h>
      83             : #endif
      84             : #include <limits.h>
      85             : #include <unistd.h>
      86             : #include <fcntl.h>
      87             : 
      88             : #include "access/xact.h"
      89             : #include "access/xlog.h"
      90             : #include "catalog/pg_tablespace.h"
      91             : #include "common/file_perm.h"
      92             : #include "common/file_utils.h"
      93             : #include "common/pg_prng.h"
      94             : #include "miscadmin.h"
      95             : #include "pgstat.h"
      96             : #include "postmaster/startup.h"
      97             : #include "storage/aio.h"
      98             : #include "storage/fd.h"
      99             : #include "storage/ipc.h"
     100             : #include "utils/guc.h"
     101             : #include "utils/guc_hooks.h"
     102             : #include "utils/resowner.h"
     103             : #include "utils/varlena.h"
     104             : 
     105             : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
     106             : #if defined(HAVE_SYNC_FILE_RANGE)
     107             : #define PG_FLUSH_DATA_WORKS 1
     108             : #elif !defined(WIN32) && defined(MS_ASYNC)
     109             : #define PG_FLUSH_DATA_WORKS 1
     110             : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
     111             : #define PG_FLUSH_DATA_WORKS 1
     112             : #endif
     113             : 
     114             : /*
     115             :  * We must leave some file descriptors free for system(), the dynamic loader,
     116             :  * and other code that tries to open files without consulting fd.c.  This
     117             :  * is the number left free.  (While we try fairly hard to prevent EMFILE
     118             :  * errors, there's never any guarantee that we won't get ENFILE due to
     119             :  * other processes chewing up FDs.  So it's a bad idea to try to open files
     120             :  * without consulting fd.c.  Nonetheless we cannot control all code.)
     121             :  *
     122             :  * Because this is just a fixed setting, we are effectively assuming that
     123             :  * no such code will leave FDs open over the long term; otherwise the slop
     124             :  * is likely to be insufficient.  Note in particular that we expect that
     125             :  * loading a shared library does not result in any permanent increase in
     126             :  * the number of open files.  (This appears to be true on most if not
     127             :  * all platforms as of Feb 2004.)
     128             :  */
     129             : #define NUM_RESERVED_FDS        10
     130             : 
     131             : /*
     132             :  * If we have fewer than this many usable FDs after allowing for the reserved
     133             :  * ones, choke.  (This value is chosen to work with "ulimit -n 64", but not
     134             :  * much less than that.  Note that this value ensures numExternalFDs can be
     135             :  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
     136             :  * will not pass unless that can grow to at least 14.)
     137             :  */
     138             : #define FD_MINFREE              48
     139             : 
     140             : /*
     141             :  * A number of platforms allow individual processes to open many more files
     142             :  * than they can really support when *many* processes do the same thing.
     143             :  * This GUC parameter lets the DBA limit max_safe_fds to something less than
     144             :  * what the postmaster's initial probe suggests will work.
     145             :  */
     146             : int         max_files_per_process = 1000;
     147             : 
     148             : /*
     149             :  * Maximum number of file descriptors to open for operations that fd.c knows
     150             :  * about (VFDs, AllocateFile etc, or "external" FDs).  This is initialized
     151             :  * to a conservative value, and remains that way indefinitely in bootstrap or
     152             :  * standalone-backend cases.  In normal postmaster operation, the postmaster
     153             :  * calls set_max_safe_fds() late in initialization to update the value, and
     154             :  * that value is then inherited by forked subprocesses.
     155             :  *
     156             :  * Note: the value of max_files_per_process is taken into account while
     157             :  * setting this variable, and so need not be tested separately.
     158             :  */
     159             : int         max_safe_fds = FD_MINFREE;  /* default if not changed */
     160             : 
     161             : /* Whether it is safe to continue running after fsync() fails. */
     162             : bool        data_sync_retry = false;
     163             : 
     164             : /* How SyncDataDirectory() should do its job. */
     165             : int         recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
     166             : 
     167             : /* Which kinds of files should be opened with PG_O_DIRECT. */
     168             : int         io_direct_flags;
     169             : 
     170             : /* Debugging: with FDDEBUG defined, DO_DB(A) runs A and then restores the errno value saved beforehand; otherwise it is a no-op. */
     171             : 
     172             : #ifdef FDDEBUG
     173             : #define DO_DB(A) \
     174             :     do { \
     175             :         int         _do_db_save_errno = errno; \
     176             :         A; \
     177             :         errno = _do_db_save_errno; \
     178             :     } while (0)
     179             : #else
     180             : #define DO_DB(A) \
     181             :     ((void) 0)
     182             : #endif
     183             : 
     184             : #define VFD_CLOSED (-1)     /* value of a VFD's fd field when no FD is currently assigned */
     185             : 
     186             : #define FileIsValid(file) \
     187             :     ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)  /* NB: evaluates "file" more than once */
     188             : 
     189             : #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)  /* true if the VFD has no FD right now */
     190             : 
     191             : /* these are the assigned bits in fdstate (a field of struct vfd, below): */
     192             : #define FD_DELETE_AT_CLOSE  (1 << 0)  /* T = delete when closed */
     193             : #define FD_CLOSE_AT_EOXACT  (1 << 1)  /* T = close at eoXact (end of transaction) */
     194             : #define FD_TEMP_FILE_LIMIT  (1 << 2)  /* T = respect temp_file_limit */
     195             : 
     196             : typedef struct vfd      /* one entry of VfdCache; a File value is an index into that array */
     197             : {
     198             :     int         fd;             /* current FD, or VFD_CLOSED if none */
     199             :     unsigned short fdstate;     /* bitflags for VFD's state (the FD_xxx bits) */
     200             :     ResourceOwner resowner;     /* owner, for automatic cleanup */
     201             :     File        nextFree;       /* link to next free VFD, if in freelist */
     202             :     File        lruMoreRecently;    /* doubly linked recency-of-use list: more recently used neighbor */
     203             :     File        lruLessRecently;    /* ... and less recently used neighbor */
     204             :     off_t       fileSize;       /* current size of file (0 if not temporary) */
     205             :     char       *fileName;       /* name of file, or NULL for unused VFD */
     206             :     /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
     207             :     int         fileFlags;      /* open(2) flags for (re)opening the file */
     208             :     mode_t      fileMode;       /* mode to pass to open(2) */
     209             : } Vfd;
     210             : 
     211             : /*
     212             :  * Virtual File Descriptor array pointer and size.  This grows as
     213             :  * needed.  'File' values are indexes into this array.
     214             :  * Note that VfdCache[0] is not a usable VFD, just a list header.
     215             :  */
     216             : static Vfd *VfdCache;
     217             : static Size SizeVfdCache = 0;
     218             : 
     219             : /*
     220             :  * Number of file descriptors known to be in use by VFD entries.
     221             :  */
     222             : static int  nfile = 0;
     223             : 
     224             : /*
     225             :  * Flag to tell whether it's worth scanning VfdCache looking for temp files
     226             :  * to close
     227             :  */
     228             : static bool have_xact_temporary_files = false;
     229             : 
     230             : /*
     231             :  * Tracks the total size of all temporary files.  Note: when temp_file_limit
     232             :  * is being enforced, this cannot overflow since the limit cannot be more
     233             :  * than INT_MAX kilobytes.  When not enforcing, it could theoretically
     234             :  * overflow, but we don't care.
     235             :  */
     236             : static uint64 temporary_files_size = 0;
     237             : 
     238             : /* Temporary file access initialized and not yet shut down? */
     239             : #ifdef USE_ASSERT_CHECKING
     240             : static bool temporary_files_allowed = false;
     241             : #endif
     242             : 
     243             : /*
     244             :  * List of OS handles opened with AllocateFile, OpenPipeStream, AllocateDir
     245             :  * and OpenTransientFile.
     246             :  */
     247             : typedef enum
     248             : {
     249             :     AllocateDescFile,       /* presumably a FILE * from AllocateFile -- confirm */
     250             :     AllocateDescPipe,       /* presumably a FILE * from OpenPipeStream -- confirm */
     251             :     AllocateDescDir,        /* presumably a DIR * from AllocateDir -- confirm */
     252             :     AllocateDescRawFD,      /* presumably an int fd from OpenTransientFile -- confirm */
     253             : } AllocateDescKind;
     254             : 
     255             : typedef struct
     256             : {
     257             :     AllocateDescKind kind;      /* which kind of handle "desc" holds */
     258             :     SubTransactionId create_subid;  /* NOTE(review): presumably the subtransaction that opened the handle -- confirm */
     259             :     union
     260             :     {
     261             :         FILE       *file;
     262             :         DIR        *dir;
     263             :         int         fd;
     264             :     }           desc;
     265             : } AllocateDesc;
     266             : 
     267             : static int  numAllocatedDescs = 0;
     268             : static int  maxAllocatedDescs = 0;
     269             : static AllocateDesc *allocatedDescs = NULL;
     270             : 
     271             : /*
     272             :  * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
     273             :  */
     274             : static int  numExternalFDs = 0;
     275             : 
     276             : /*
     277             :  * Number of temporary files opened during the current session;
     278             :  * this is used in generation of tempfile names.
     279             :  */
     280             : static long tempFileCounter = 0;
     281             : 
     282             : /*
     283             :  * Array of OIDs of temp tablespaces.  (Some entries may be InvalidOid,
     284             :  * indicating that the current database's default tablespace should be used.)
     285             :  * When numTempTableSpaces is -1, this has not been set in the current
     286             :  * transaction.
     287             :  */
     288             : static Oid *tempTableSpaces = NULL;
     289             : static int  numTempTableSpaces = -1;
     290             : static int  nextTempTableSpace = 0;
     291             : 
     292             : 
     293             : /*--------------------
     294             :  *
     295             :  * Private Routines
     296             :  *
     297             :  * Delete          - delete a file from the Lru ring
     298             :  * LruDelete       - remove a file from the Lru ring and close its FD
     299             :  * Insert          - put a file at the front of the Lru ring
     300             :  * LruInsert       - put a file at the front of the Lru ring and open it
     301             :  * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
     302             :  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
     303             :  * AllocateVfd     - grab a free (or new) file record (from VfdCache)
     304             :  * FreeVfd         - free a file record
     305             :  *
     306             :  * The Least Recently Used ring is a doubly linked list that begins and
     307             :  * ends on element zero.  Element zero is special -- it doesn't represent
     308             :  * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
     309             :  * anchor that shows us the beginning/end of the ring.
     310             :  * Only VFD elements that are currently really open (have an FD assigned) are
     311             :  * in the Lru ring.  Elements that are "virtually" open can be recognized
     312             :  * by having a non-null fileName field.
     313             :  *
     314             :  * example:
     315             :  *
     316             :  *     /--less----\                /---------\
     317             :  *     v           \              v           \
     318             :  *   #0 --more---> LeastRecentlyUsed --more-\ \
     319             :  *    ^\                                    | |
     320             :  *     \\less--> MostRecentlyUsedFile    <---/ |
     321             :  *      \more---/                    \--less--/
     322             :  *
     323             :  *--------------------
     324             :  */
     325             : static void Delete(File file);
     326             : static void LruDelete(File file);
     327             : static void Insert(File file);
     328             : static int  LruInsert(File file);
     329             : static bool ReleaseLruFile(void);
     330             : static void ReleaseLruFiles(void);
     331             : static File AllocateVfd(void);
     332             : static void FreeVfd(File file);
     333             : 
     334             : static int  FileAccess(File file);
     335             : static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
     336             : static bool reserveAllocatedDesc(void);
     337             : static int  FreeDesc(AllocateDesc *desc);
     338             : 
     339             : static void BeforeShmemExit_Files(int code, Datum arg);
     340             : static void CleanupTempFiles(bool isCommit, bool isProcExit);
     341             : static void RemovePgTempRelationFiles(const char *tsdirname);
     342             : static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
     343             : 
     344             : static void walkdir(const char *path,
     345             :                     void (*action) (const char *fname, bool isdir, int elevel),
     346             :                     bool process_symlinks,
     347             :                     int elevel);
     348             : #ifdef PG_FLUSH_DATA_WORKS
     349             : static void pre_sync_fname(const char *fname, bool isdir, int elevel);
     350             : #endif
     351             : static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
     352             : static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
     353             : 
     354             : static int  fsync_parent_path(const char *fname, int elevel);
     355             : 
     356             : 
     357             : /* ResourceOwner callbacks to hold virtual file descriptors (defined later in this file) */
     358             : static void ResOwnerReleaseFile(Datum res);
     359             : static char *ResOwnerPrintFile(Datum res);
     360             : 
     361             : static const ResourceOwnerDesc file_resowner_desc =     /* resource kind passed when remembering/forgetting a File */
     362             : {
     363             :     .name = "File",
     364             :     .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
     365             :     .release_priority = RELEASE_PRIO_FILES,
     366             :     .ReleaseResource = ResOwnerReleaseFile,     /* release callback */
     367             :     .DebugPrint = ResOwnerPrintFile     /* debug-print callback */
     368             : };
     369             : 
     370             : /* Convenience wrappers over ResourceOwnerRemember/Forget: the File is passed as an int32 Datum together with file_resowner_desc */
     371             : static inline void
     372        7350 : ResourceOwnerRememberFile(ResourceOwner owner, File file)
     373             : {
     374        7350 :     ResourceOwnerRemember(owner, Int32GetDatum(file), &file_resowner_desc);
     375        7350 : }
     376             : static inline void
     377        7342 : ResourceOwnerForgetFile(ResourceOwner owner, File file)
     378             : {
     379        7342 :     ResourceOwnerForget(owner, Int32GetDatum(file), &file_resowner_desc);
     380        7342 : }
     381             : 
     382             : /*
     383             :  * pg_fsync --- do fsync with or without writethrough (chosen by wal_sync_method); returns the result of whichever routine ran
     384             :  */
     385             : int
     386      128874 : pg_fsync(int fd)
     387             : {
     388             : #if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
     389             :     struct stat st;
     390             : 
     391             :     /*
     392             :      * Some operating system implementations of fsync() have requirements
     393             :      * about the file access modes that were used when their file descriptor
     394             :      * argument was opened, and these requirements differ depending on whether
     395             :      * the file descriptor is for a directory.
     396             :      *
     397             :      * For any file descriptor that may eventually be handed to fsync(), we
     398             :      * should have opened it with access modes that are compatible with
     399             :      * fsync() on all supported systems, otherwise the code may not be
     400             :      * portable, even if it runs ok on the current system.
     401             :      *
     402             :      * We assert here that a descriptor for a file was opened with write
     403             :      * permissions (i.e., not O_RDONLY) and for a directory without write
     404             :      * permissions (O_RDONLY).  Notice that the assertion check is made even
     405             :      * if fsync() is disabled.
     406             :      *
     407             :      * If fstat() fails, ignore it and let the follow-up fsync() complain.
     408             :      */
     409             :     if (fstat(fd, &st) == 0)
     410             :     {
     411             :         int         desc_flags = fcntl(fd, F_GETFL);
     412             : 
     413             :         desc_flags &= O_ACCMODE;    /* keep only the access-mode bits */
     414             : 
     415             :         if (S_ISDIR(st.st_mode))
     416             :             Assert(desc_flags == O_RDONLY);
     417             :         else
     418             :             Assert(desc_flags != O_RDONLY);
     419             :     }
     420             :     errno = 0;      /* NOTE(review): presumably discards any errno left by the checks above -- confirm */
     421             : #endif
     422             : 
     423             :     /* #if is to skip the wal_sync_method test if there's no need for it */
     424             : #if defined(HAVE_FSYNC_WRITETHROUGH)
     425             :     if (wal_sync_method == WAL_SYNC_METHOD_FSYNC_WRITETHROUGH)
     426             :         return pg_fsync_writethrough(fd);
     427             :     else
     428             : #endif
     429      128874 :         return pg_fsync_no_writethrough(fd);
     430             : }
     431             : 
     432             : 
     433             : /*
     434             :  * pg_fsync_no_writethrough --- same as fsync (retried while it fails with
     435             :  *  EINTR), except does nothing and returns 0 if enableFsync is off
     436             :  */
     437             : int
     438      128874 : pg_fsync_no_writethrough(int fd)
     439             : {
     440             :     int         rc;
     441             : 
     442      128874 :     if (!enableFsync)
     443      128874 :         return 0;
     444             : 
     445           0 : retry:
     446           0 :     rc = fsync(fd);
     447             : 
     448           0 :     if (rc == -1 && errno == EINTR)
     449           0 :         goto retry;     /* interrupted call: just try again */
     450             : 
     451           0 :     return rc;          /* fsync()'s own result; errno is left as fsync() set it */
     452             : }
     453             : 
     454             : /*
     455             :  * pg_fsync_writethrough --- fcntl(F_FULLFSYNC) where F_FULLFSYNC is defined, else fails with ENOSYS; returns 0 at once if enableFsync is off
     456             :  */
     457             : int
     458           0 : pg_fsync_writethrough(int fd)
     459             : {
     460           0 :     if (enableFsync)
     461             :     {
     462             : #if defined(F_FULLFSYNC)
     463             :         return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;  /* map fcntl's result to 0 or -1 */
     464             : #else
     465           0 :         errno = ENOSYS;     /* F_FULLFSYNC not defined in this build */
     466           0 :         return -1;
     467             : #endif
     468             :     }
     469             :     else
     470           0 :         return 0;
     471             : }
     472             : 
     473             : /*
     474             :  * pg_fdatasync --- same as fdatasync (retried while it fails with EINTR), except does nothing and returns 0 if enableFsync is off
     475             :  */
     476             : int
     477           0 : pg_fdatasync(int fd)
     478             : {
     479             :     int         rc;
     480             : 
     481           0 :     if (!enableFsync)
     482           0 :         return 0;
     483             : 
     484           0 : retry:
     485           0 :     rc = fdatasync(fd);
     486             : 
     487           0 :     if (rc == -1 && errno == EINTR)
     488           0 :         goto retry;     /* interrupted call: just try again */
     489             : 
     490           0 :     return rc;          /* fdatasync()'s own result */
     491             : }
     492             : 
     493             : /*
     494             :  * pg_file_exists -- check that a file exists and is not a directory.
     495             :  *
     496             :  * This requires an absolute path to the file.  Returns true if stat() succeeds on something that is not a directory;
     497             :  * false for a directory or when stat() fails with ENOENT, ENOTDIR or EACCES.  Any other stat() failure raises ERROR.
     498             :  */
     499             : bool
     500       37070 : pg_file_exists(const char *name)
     501             : {
     502             :     struct stat st;
     503             : 
     504             :     Assert(name != NULL);
     505             : 
     506       37070 :     if (stat(name, &st) == 0)
     507       19626 :         return !S_ISDIR(st.st_mode);
     508       17444 :     else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
     509           0 :         ereport(ERROR,
     510             :                 (errcode_for_file_access(),
     511             :                  errmsg("could not access file \"%s\": %m", name)));
     512             : 
     513       17444 :     return false;       /* stat() failed with one of the tolerated errno values */
     514             : }
     515             : 
     516             : /*
     517             :  * pg_flush_data --- advise OS that the described dirty data should be flushed
     518             :  *
     519             :  * offset of 0 with nbytes 0 means that the entire file should be flushed
     520             :  */
     521             : void
     522       70606 : pg_flush_data(int fd, off_t offset, off_t nbytes)
     523             : {
     524             :     /*
     525             :      * Right now file flushing is primarily used to make later
     526             :      * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
     527             :      * if fsyncs are disabled - that's a decision we might want to make
     528             :      * configurable at some point.
     529             :      */
     530       70606 :     if (!enableFsync)
     531       70606 :         return;
     532             : 
     533             :     /*
     534             :      * We compile all alternatives that are supported on the current platform,
     535             :      * to find portability problems more easily.
     536             :      */
     537             : #if defined(HAVE_SYNC_FILE_RANGE)
     538             :     {
     539             :         int         rc;
     540             :         static bool not_implemented_by_kernel = false;
     541             : 
     542           0 :         if (not_implemented_by_kernel)
     543           0 :             return;
     544             : 
     545           0 : retry:
     546             : 
     547             :         /*
     548             :          * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
     549             :          * tells the OS that writeback for the specified blocks should be
     550             :          * started, but that we don't want to wait for completion.  Note that
     551             :          * this call might block if too much dirty data exists in the range.
     552             :          * This is the preferable method on OSs supporting it, as it works
     553             :          * reliably when available (contrast to msync()) and doesn't flush out
     554             :          * clean data (like FADV_DONTNEED).
     555             :          */
     556           0 :         rc = sync_file_range(fd, offset, nbytes,
     557             :                              SYNC_FILE_RANGE_WRITE);
     558           0 :         if (rc != 0)
     559             :         {
     560             :             int         elevel;
     561             : 
     562           0 :             if (rc == EINTR)
     563           0 :                 goto retry;
     564             : 
     565             :             /*
     566             :              * For systems that don't have an implementation of
     567             :              * sync_file_range() such as Windows WSL, generate only one
     568             :              * warning and then suppress all further attempts by this process.
     569             :              */
     570           0 :             if (errno == ENOSYS)
     571             :             {
     572           0 :                 elevel = WARNING;
     573           0 :                 not_implemented_by_kernel = true;
     574             :             }
     575             :             else
     576           0 :                 elevel = data_sync_elevel(WARNING);
     577             : 
     578           0 :             ereport(elevel,
     579             :                     (errcode_for_file_access(),
     580             :                      errmsg("could not flush dirty data: %m")));
     581             :         }
     582             : 
     583           0 :         return;
     584             :     }
     585             : #endif
     586             : #if !defined(WIN32) && defined(MS_ASYNC)
     587             :     {
     588             :         void       *p;
     589             :         static int  pagesize = 0;
     590             : 
     591             :         /*
     592             :          * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
     593             :          * writeback. On linux it only does so if MS_SYNC is specified, but
     594             :          * then it does the writeback synchronously. Luckily all common linux
     595             :          * systems have sync_file_range().  This is preferable over
     596             :          * FADV_DONTNEED because it doesn't flush out clean data.
     597             :          *
     598             :          * We map the file (mmap()), tell the kernel to sync back the contents
     599             :          * (msync()), and then remove the mapping again (munmap()).
     600             :          */
     601             : 
     602             :         /* mmap() needs actual length if we want to map whole file */
     603             :         if (offset == 0 && nbytes == 0)
     604             :         {
     605             :             nbytes = lseek(fd, 0, SEEK_END);
     606             :             if (nbytes < 0)
     607             :             {
     608             :                 ereport(WARNING,
     609             :                         (errcode_for_file_access(),
     610             :                          errmsg("could not determine dirty data size: %m")));
     611             :                 return;
     612             :             }
     613             :         }
     614             : 
     615             :         /*
     616             :          * Some platforms reject partial-page mmap() attempts.  To deal with
     617             :          * that, just truncate the request to a page boundary.  If any extra
     618             :          * bytes don't get flushed, well, it's only a hint anyway.
     619             :          */
     620             : 
     621             :         /* fetch pagesize only once */
     622             :         if (pagesize == 0)
     623             :             pagesize = sysconf(_SC_PAGESIZE);
     624             : 
     625             :         /* align length to pagesize, dropping any fractional page */
     626             :         if (pagesize > 0)
     627             :             nbytes = (nbytes / pagesize) * pagesize;
     628             : 
     629             :         /* fractional-page request is a no-op */
     630             :         if (nbytes <= 0)
     631             :             return;
     632             : 
     633             :         /*
     634             :          * mmap could well fail, particularly on 32-bit platforms where there
     635             :          * may simply not be enough address space.  If so, silently fall
     636             :          * through to the next implementation.
     637             :          */
     638             :         if (nbytes <= (off_t) SSIZE_MAX)
     639             :             p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
     640             :         else
     641             :             p = MAP_FAILED;
     642             : 
     643             :         if (p != MAP_FAILED)
     644             :         {
     645             :             int         rc;
     646             : 
     647             :             rc = msync(p, (size_t) nbytes, MS_ASYNC);
     648             :             if (rc != 0)
     649             :             {
     650             :                 ereport(data_sync_elevel(WARNING),
     651             :                         (errcode_for_file_access(),
     652             :                          errmsg("could not flush dirty data: %m")));
     653             :                 /* NB: need to fall through to munmap()! */
     654             :             }
     655             : 
     656             :             rc = munmap(p, (size_t) nbytes);
     657             :             if (rc != 0)
     658             :             {
     659             :                 /* FATAL error because mapping would remain */
     660             :                 ereport(FATAL,
     661             :                         (errcode_for_file_access(),
     662             :                          errmsg("could not munmap() while flushing data: %m")));
     663             :             }
     664             : 
     665             :             return;
     666             :         }
     667             :     }
     668             : #endif
     669             : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
     670             :     {
     671             :         int         rc;
     672             : 
     673             :         /*
     674             :          * Signal the kernel that the passed in range should not be cached
     675             :          * anymore. This has the, desired, side effect of writing out dirty
     676             :          * data, and the, undesired, side effect of likely discarding useful
     677             :          * clean cached blocks.  For the latter reason this is the least
     678             :          * preferable method.
     679             :          */
     680             : 
     681             :         rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
     682             : 
     683             :         if (rc != 0)
     684             :         {
     685             :             /* don't error out, this is just a performance optimization */
     686             :             ereport(WARNING,
     687             :                     (errcode_for_file_access(),
     688             :                      errmsg("could not flush dirty data: %m")));
     689             :         }
     690             : 
     691             :         return;
     692             :     }
     693             : #endif
     694             : }
     695             : 
     696             : /*
     697             :  * Truncate an open file to a given length.
     698             :  */
     699             : static int
     700        1086 : pg_ftruncate(int fd, off_t length)
     701             : {
     702             :     int         ret;
     703             : 
     704        1086 : retry:
     705        1086 :     ret = ftruncate(fd, length);
     706             : 
     707        1086 :     if (ret == -1 && errno == EINTR)
     708           0 :         goto retry;
     709             : 
     710        1086 :     return ret;
     711             : }
     712             : 
     713             : /*
     714             :  * Truncate a file to a given length by name.
     715             :  */
     716             : int
     717      435272 : pg_truncate(const char *path, off_t length)
     718             : {
     719             :     int         ret;
     720             : #ifdef WIN32
     721             :     int         save_errno;
     722             :     int         fd;
     723             : 
     724             :     fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
     725             :     if (fd >= 0)
     726             :     {
     727             :         ret = pg_ftruncate(fd, length);
     728             :         save_errno = errno;
     729             :         CloseTransientFile(fd);
     730             :         errno = save_errno;
     731             :     }
     732             :     else
     733             :         ret = -1;
     734             : #else
     735             : 
     736      435272 : retry:
     737      435272 :     ret = truncate(path, length);
     738             : 
     739      435272 :     if (ret == -1 && errno == EINTR)
     740           0 :         goto retry;
     741             : #endif
     742             : 
     743      435272 :     return ret;
     744             : }
     745             : 
     746             : /*
     747             :  * fsync_fname -- fsync a file or directory, handling errors properly
     748             :  *
     749             :  * Try to fsync a file or directory. When doing the latter, ignore errors that
     750             :  * indicate the OS just doesn't allow/require fsyncing directories.
     751             :  */
     752             : void
     753       40930 : fsync_fname(const char *fname, bool isdir)
     754             : {
     755       40930 :     fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
     756       40930 : }
     757             : 
     758             : /*
     759             :  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
     760             :  *
     761             :  * This routine ensures that, after returning, the effect of renaming file
     762             :  * persists in case of a crash. A crash while this routine is running will
     763             :  * leave you with either the pre-existing or the moved file in place of the
     764             :  * new file; no mixed state or truncated files are possible.
     765             :  *
     766             :  * It does so by using fsync on the old filename and the possibly existing
     767             :  * target filename before the rename, and the target file and directory after.
     768             :  *
     769             :  * Note that rename() cannot be used across arbitrary directories, as they
     770             :  * might not be on the same filesystem. Therefore this routine does not
     771             :  * support renaming across directories.
     772             :  *
     773             :  * Log errors with the caller specified severity.
     774             :  *
     775             :  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
     776             :  * valid upon return.
     777             :  */
     778             : int
     779       12604 : durable_rename(const char *oldfile, const char *newfile, int elevel)
     780             : {
     781             :     int         fd;
     782             : 
     783             :     /*
     784             :      * First fsync the old and target path (if it exists), to ensure that they
     785             :      * are properly persistent on disk. Syncing the target file is not
     786             :      * strictly necessary, but it makes it easier to reason about crashes;
     787             :      * because it's then guaranteed that either source or target file exists
     788             :      * after a crash.
     789             :      */
     790       12604 :     if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
     791           0 :         return -1;
     792             : 
     793       12604 :     fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
     794       12604 :     if (fd < 0)
     795             :     {
     796        8824 :         if (errno != ENOENT)
     797             :         {
     798           0 :             ereport(elevel,
     799             :                     (errcode_for_file_access(),
     800             :                      errmsg("could not open file \"%s\": %m", newfile)));
     801           0 :             return -1;
     802             :         }
     803             :     }
     804             :     else
     805             :     {
     806        3780 :         if (pg_fsync(fd) != 0)
     807             :         {
     808             :             int         save_errno;
     809             : 
     810             :             /* close file upon error, might not be in transaction context */
     811           0 :             save_errno = errno;
     812           0 :             CloseTransientFile(fd);
     813           0 :             errno = save_errno;
     814             : 
     815           0 :             ereport(elevel,
     816             :                     (errcode_for_file_access(),
     817             :                      errmsg("could not fsync file \"%s\": %m", newfile)));
     818           0 :             return -1;
     819             :         }
     820             : 
     821        3780 :         if (CloseTransientFile(fd) != 0)
     822             :         {
     823           0 :             ereport(elevel,
     824             :                     (errcode_for_file_access(),
     825             :                      errmsg("could not close file \"%s\": %m", newfile)));
     826           0 :             return -1;
     827             :         }
     828             :     }
     829             : 
     830             :     /* Time to do the real deal... */
     831       12604 :     if (rename(oldfile, newfile) < 0)
     832             :     {
     833           0 :         ereport(elevel,
     834             :                 (errcode_for_file_access(),
     835             :                  errmsg("could not rename file \"%s\" to \"%s\": %m",
     836             :                         oldfile, newfile)));
     837           0 :         return -1;
     838             :     }
     839             : 
     840             :     /*
     841             :      * To guarantee renaming the file is persistent, fsync the file with its
     842             :      * new name, and its containing directory.
     843             :      */
     844       12604 :     if (fsync_fname_ext(newfile, false, false, elevel) != 0)
     845           0 :         return -1;
     846             : 
     847       12604 :     if (fsync_parent_path(newfile, elevel) != 0)
     848           0 :         return -1;
     849             : 
     850       12604 :     return 0;
     851             : }
     852             : 
     853             : /*
     854             :  * durable_unlink -- remove a file in a durable manner
     855             :  *
     856             :  * This routine ensures that, after returning, the effect of removing file
     857             :  * persists in case of a crash. A crash while this routine is running will
     858             :  * leave the system in no mixed state.
     859             :  *
     860             :  * It does so by using fsync on the parent directory of the file after the
     861             :  * actual removal is done.
     862             :  *
     863             :  * Log errors with the severity specified by caller.
     864             :  *
     865             :  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
     866             :  * valid upon return.
     867             :  */
     868             : int
     869        2584 : durable_unlink(const char *fname, int elevel)
     870             : {
     871        2584 :     if (unlink(fname) < 0)
     872             :     {
     873          76 :         ereport(elevel,
     874             :                 (errcode_for_file_access(),
     875             :                  errmsg("could not remove file \"%s\": %m",
     876             :                         fname)));
     877          76 :         return -1;
     878             :     }
     879             : 
     880             :     /*
     881             :      * To guarantee that the removal of the file is persistent, fsync its
     882             :      * parent directory.
     883             :      */
     884        2508 :     if (fsync_parent_path(fname, elevel) != 0)
     885           0 :         return -1;
     886             : 
     887        2508 :     return 0;
     888             : }
     889             : 
     890             : /*
     891             :  * InitFileAccess --- initialize this module during backend startup
     892             :  *
     893             :  * This is called during either normal or standalone backend start.
     894             :  * It is *not* called in the postmaster.
     895             :  *
     896             :  * Note that this does not initialize temporary file access, that is
     897             :  * separately initialized via InitTemporaryFileAccess().
     898             :  */
     899             : void
     900       42720 : InitFileAccess(void)
     901             : {
     902             :     Assert(SizeVfdCache == 0);  /* call me only once */
     903             : 
     904             :     /* initialize cache header entry */
     905       42720 :     VfdCache = (Vfd *) malloc(sizeof(Vfd));
     906       42720 :     if (VfdCache == NULL)
     907           0 :         ereport(FATAL,
     908             :                 (errcode(ERRCODE_OUT_OF_MEMORY),
     909             :                  errmsg("out of memory")));
     910             : 
     911      341760 :     MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
     912       42720 :     VfdCache->fd = VFD_CLOSED;
     913             : 
     914       42720 :     SizeVfdCache = 1;
     915       42720 : }
     916             : 
     917             : /*
     918             :  * InitTemporaryFileAccess --- initialize temporary file access during startup
     919             :  *
     920             :  * This is called during either normal or standalone backend start.
     921             :  * It is *not* called in the postmaster.
     922             :  *
     923             :  * This is separate from InitFileAccess() because temporary file cleanup can
     924             :  * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
     925             :  * our reporting has to happen before that. Low level file access should be
     926             :  * available for longer, hence the separate initialization / shutdown of
     927             :  * temporary file handling.
     928             :  */
     929             : void
     930       42720 : InitTemporaryFileAccess(void)
     931             : {
     932             :     Assert(SizeVfdCache != 0);  /* InitFileAccess() needs to have run */
     933             :     Assert(!temporary_files_allowed);   /* call me only once */
     934             : 
     935             :     /*
     936             :      * Register before-shmem-exit hook to ensure temp files are dropped while
     937             :      * we can still report stats.
     938             :      */
     939       42720 :     before_shmem_exit(BeforeShmemExit_Files, 0);
     940             : 
     941             : #ifdef USE_ASSERT_CHECKING
     942             :     temporary_files_allowed = true;
     943             : #endif
     944       42720 : }
     945             : 
     946             : /*
     947             :  * count_usable_fds --- count how many FDs the system will let us open,
     948             :  *      and estimate how many are already open.
     949             :  *
     950             :  * We stop counting if usable_fds reaches max_to_probe.  Note: a small
     951             :  * value of max_to_probe might result in an underestimate of already_open;
     952             :  * we must fill in any "gaps" in the set of used FDs before the calculation
     953             :  * of already_open will give the right answer.  In practice, max_to_probe
     954             :  * of a couple of dozen should be enough to ensure good results.
     955             :  *
     956             :  * We assume stderr (FD 2) is available for dup'ing.  While the calling
     957             :  * script could theoretically close that, it would be a really bad idea,
     958             :  * since then one risks loss of error messages from, e.g., libc.
     959             :  */
     960             : static void
     961        2116 : count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
     962             : {
     963             :     int        *fd;
     964             :     int         size;
     965        2116 :     int         used = 0;
     966        2116 :     int         highestfd = 0;
     967             :     int         j;
     968             : 
     969             : #ifdef HAVE_GETRLIMIT
     970             :     struct rlimit rlim;
     971             :     int         getrlimit_status;
     972             : #endif
     973             : 
     974        2116 :     size = 1024;
     975        2116 :     fd = (int *) palloc(size * sizeof(int));
     976             : 
     977             : #ifdef HAVE_GETRLIMIT
     978        2116 :     getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
     979        2116 :     if (getrlimit_status != 0)
     980           0 :         ereport(WARNING, (errmsg("getrlimit failed: %m")));
     981             : #endif                          /* HAVE_GETRLIMIT */
     982             : 
     983             :     /* dup until failure or probe limit reached */
     984             :     for (;;)
     985     2113884 :     {
     986             :         int         thisfd;
     987             : 
     988             : #ifdef HAVE_GETRLIMIT
     989             : 
     990             :         /*
     991             :          * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
     992             :          * some platforms
     993             :          */
     994     2116000 :         if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
     995           0 :             break;
     996             : #endif
     997             : 
     998     2116000 :         thisfd = dup(2);
     999     2116000 :         if (thisfd < 0)
    1000             :         {
    1001             :             /* Expect EMFILE or ENFILE, else it's fishy */
    1002           0 :             if (errno != EMFILE && errno != ENFILE)
    1003           0 :                 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
    1004           0 :             break;
    1005             :         }
    1006             : 
    1007     2116000 :         if (used >= size)
    1008             :         {
    1009           0 :             size *= 2;
    1010           0 :             fd = (int *) repalloc(fd, size * sizeof(int));
    1011             :         }
    1012     2116000 :         fd[used++] = thisfd;
    1013             : 
    1014     2116000 :         if (highestfd < thisfd)
    1015     2116000 :             highestfd = thisfd;
    1016             : 
    1017     2116000 :         if (used >= max_to_probe)
    1018        2116 :             break;
    1019             :     }
    1020             : 
    1021             :     /* release the files we opened */
    1022     2118116 :     for (j = 0; j < used; j++)
    1023     2116000 :         close(fd[j]);
    1024             : 
    1025        2116 :     pfree(fd);
    1026             : 
    1027             :     /*
    1028             :      * Return results.  usable_fds is just the number of successful dups. We
    1029             :      * assume that the system limit is highestfd+1 (remember 0 is a legal FD
    1030             :      * number) and so already_open is highestfd+1 - usable_fds.
    1031             :      */
    1032        2116 :     *usable_fds = used;
    1033        2116 :     *already_open = highestfd + 1 - used;
    1034        2116 : }
    1035             : 
    1036             : /*
    1037             :  * set_max_safe_fds
    1038             :  *      Determine number of file descriptors that fd.c is allowed to use
    1039             :  */
    1040             : void
    1041        2116 : set_max_safe_fds(void)
    1042             : {
    1043             :     int         usable_fds;
    1044             :     int         already_open;
    1045             : 
    1046             :     /*----------
    1047             :      * We want to set max_safe_fds to
    1048             :      *          MIN(usable_fds, max_files_per_process)
    1049             :      * less the slop factor for files that are opened without consulting
    1050             :      * fd.c.  This ensures that we won't allow to open more than
    1051             :      * max_files_per_process, or the experimentally-determined EMFILE limit,
    1052             :      * additional files.
    1053             :      *----------
    1054             :      */
    1055        2116 :     count_usable_fds(max_files_per_process,
    1056             :                      &usable_fds, &already_open);
    1057             : 
    1058        2116 :     max_safe_fds = Min(usable_fds, max_files_per_process);
    1059             : 
    1060             :     /*
    1061             :      * Take off the FDs reserved for system() etc.
    1062             :      */
    1063        2116 :     max_safe_fds -= NUM_RESERVED_FDS;
    1064             : 
    1065             :     /*
    1066             :      * Make sure we still have enough to get by.
    1067             :      */
    1068        2116 :     if (max_safe_fds < FD_MINFREE)
    1069           0 :         ereport(FATAL,
    1070             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    1071             :                  errmsg("insufficient file descriptors available to start server process"),
    1072             :                  errdetail("System allows %d, server needs at least %d, %d files are already open.",
    1073             :                            max_safe_fds + NUM_RESERVED_FDS,
    1074             :                            FD_MINFREE + NUM_RESERVED_FDS,
    1075             :                            already_open)));
    1076             : 
    1077        2116 :     elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
    1078             :          max_safe_fds, usable_fds, already_open);
    1079        2116 : }
    1080             : 
    1081             : /*
    1082             :  * Open a file with BasicOpenFilePerm() and pass default file mode for the
    1083             :  * fileMode parameter.
    1084             :  */
    1085             : int
    1086       66474 : BasicOpenFile(const char *fileName, int fileFlags)
    1087             : {
    1088       66474 :     return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
    1089             : }
    1090             : 
    1091             : /*
    1092             :  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
    1093             :  *
    1094             :  * This is exported for use by places that really want a plain kernel FD,
    1095             :  * but need to be proof against running out of FDs.  Once an FD has been
    1096             :  * successfully returned, it is the caller's responsibility to ensure that
    1097             :  * it will not be leaked on ereport()!  Most users should *not* call this
    1098             :  * routine directly, but instead use the VFD abstraction level, which
    1099             :  * provides protection against descriptor leaks as well as management of
    1100             :  * files that need to be open for more than a short period of time.
    1101             :  *
    1102             :  * Ideally this should be the *only* direct call of open() in the backend.
    1103             :  * In practice, the postmaster calls open() directly, and there are some
    1104             :  * direct open() calls done early in backend startup.  Those are OK since
    1105             :  * this module wouldn't have any open files to close at that point anyway.
    1106             :  */
    1107             : int
    1108    18640190 : BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
    1109             : {
    1110             :     int         fd;
    1111             : 
    1112    18640190 : tryAgain:
    1113             : #ifdef PG_O_DIRECT_USE_F_NOCACHE
    1114             : 
    1115             :     /*
    1116             :      * The value we defined to stand in for O_DIRECT when simulating it with
    1117             :      * F_NOCACHE had better not collide with any of the standard flags.
    1118             :      */
    1119             :     StaticAssertStmt((PG_O_DIRECT &
    1120             :                       (O_APPEND |
    1121             :                        O_CLOEXEC |
    1122             :                        O_CREAT |
    1123             :                        O_DSYNC |
    1124             :                        O_EXCL |
    1125             :                        O_RDWR |
    1126             :                        O_RDONLY |
    1127             :                        O_SYNC |
    1128             :                        O_TRUNC |
    1129             :                        O_WRONLY)) == 0,
    1130             :                      "PG_O_DIRECT value collides with standard flag");
    1131             :     fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
    1132             : #else
    1133    18640190 :     fd = open(fileName, fileFlags, fileMode);
    1134             : #endif
    1135             : 
    1136    18640190 :     if (fd >= 0)
    1137             :     {
    1138             : #ifdef PG_O_DIRECT_USE_F_NOCACHE
    1139             :         if (fileFlags & PG_O_DIRECT)
    1140             :         {
    1141             :             if (fcntl(fd, F_NOCACHE, 1) < 0)
    1142             :             {
    1143             :                 int         save_errno = errno;
    1144             : 
    1145             :                 close(fd);
    1146             :                 errno = save_errno;
    1147             :                 return -1;
    1148             :             }
    1149             :         }
    1150             : #endif
    1151             : 
    1152    17696916 :         return fd;              /* success! */
    1153             :     }
    1154             : 
    1155      943274 :     if (errno == EMFILE || errno == ENFILE)
    1156             :     {
    1157           0 :         int         save_errno = errno;
    1158             : 
    1159           0 :         ereport(LOG,
    1160             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    1161             :                  errmsg("out of file descriptors: %m; release and retry")));
    1162           0 :         errno = 0;
    1163           0 :         if (ReleaseLruFile())
    1164           0 :             goto tryAgain;
    1165           0 :         errno = save_errno;
    1166             :     }
    1167             : 
    1168      943274 :     return -1;                  /* failure */
    1169             : }
    1170             : 
    1171             : /*
    1172             :  * AcquireExternalFD - attempt to reserve an external file descriptor
    1173             :  *
    1174             :  * This should be used by callers that need to hold a file descriptor open
    1175             :  * over more than a short interval, but cannot use any of the other facilities
    1176             :  * provided by this module.
    1177             :  *
    1178             :  * The difference between this and the underlying ReserveExternalFD function
    1179             :  * is that this will report failure (by setting errno and returning false)
    1180             :  * if "too many" external FDs are already reserved.  This should be used in
    1181             :  * any code where the total number of FDs to be reserved is not predictable
    1182             :  * and small.
    1183             :  */
    1184             : bool
    1185      311056 : AcquireExternalFD(void)
    1186             : {
    1187             :     /*
    1188             :      * We don't want more than max_safe_fds / 3 FDs to be consumed for
    1189             :      * "external" FDs.
    1190             :      */
    1191      311056 :     if (numExternalFDs < max_safe_fds / 3)
    1192             :     {
    1193      311056 :         ReserveExternalFD();
    1194      311056 :         return true;
    1195             :     }
    1196           0 :     errno = EMFILE;
    1197           0 :     return false;
    1198             : }
    1199             : 
    1200             : /*
    1201             :  * ReserveExternalFD - report external consumption of a file descriptor
    1202             :  *
    1203             :  * This should be used by callers that need to hold a file descriptor open
    1204             :  * over more than a short interval, but cannot use any of the other facilities
    1205             :  * provided by this module.  This just tracks the use of the FD and closes
    1206             :  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
    1207             :  *
    1208             :  * Call this directly only in code where failure to reserve the FD would be
    1209             :  * fatal; for example, the WAL-writing code does so, since the alternative is
    1210             :  * session failure.  Also, it's very unwise to do so in code that could
    1211             :  * consume more than one FD per process.
    1212             :  *
    1213             :  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
    1214             :  * available, it doesn't matter too much whether this is called before or
    1215             :  * after actually opening the FD; but doing so beforehand reduces the risk of
    1216             :  * an EMFILE failure if not everybody played nice.  In any case, it's solely
    1217             :  * caller's responsibility to keep the external-FD count in sync with reality.
    1218             :  */
    1219             : void
    1220      461520 : ReserveExternalFD(void)
    1221             : {
    1222             :     /*
    1223             :      * Release VFDs if needed to stay safe.  Because we do this before
    1224             :      * incrementing numExternalFDs, the final state will be as desired, i.e.,
    1225             :      * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
    1226             :      */
    1227      461520 :     ReleaseLruFiles();
    1228             : 
    1229      461520 :     numExternalFDs++;
    1230      461520 : }
    1231             : 
    1232             : /*
    1233             :  * ReleaseExternalFD - report release of an external file descriptor
    1234             :  *
    1235             :  * This is guaranteed not to change errno, so it can be used in failure paths.
    1236             :  */
    1237             : void
    1238      424144 : ReleaseExternalFD(void)
    1239             : {
    1240             :     Assert(numExternalFDs > 0);
    1241      424144 :     numExternalFDs--;
    1242      424144 : }
    1243             : 
    1244             : 
    1245             : #if defined(FDDEBUG)
    1246             : 
    1247             : static void
    1248             : _dump_lru(void)
    1249             : {
    1250             :     int         mru = VfdCache[0].lruLessRecently;
    1251             :     Vfd        *vfdP = &VfdCache[mru];
    1252             :     char        buf[2048];
    1253             : 
    1254             :     snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
    1255             :     while (mru != 0)
    1256             :     {
    1257             :         mru = vfdP->lruLessRecently;
    1258             :         vfdP = &VfdCache[mru];
    1259             :         snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
    1260             :     }
    1261             :     snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
    1262             :     elog(LOG, "%s", buf);
    1263             : }
    1264             : #endif                          /* FDDEBUG */
    1265             : 
    1266             : static void
    1267     2644358 : Delete(File file)
    1268             : {
    1269             :     Vfd        *vfdP;
    1270             : 
    1271             :     Assert(file != 0);
    1272             : 
    1273             :     DO_DB(elog(LOG, "Delete %d (%s)",
    1274             :                file, VfdCache[file].fileName));
    1275             :     DO_DB(_dump_lru());
    1276             : 
    1277     2644358 :     vfdP = &VfdCache[file];
    1278             : 
    1279     2644358 :     VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
    1280     2644358 :     VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
    1281             : 
    1282             :     DO_DB(_dump_lru());
    1283     2644358 : }
    1284             : 
    1285             : static void
    1286        8046 : LruDelete(File file)
    1287             : {
    1288             :     Vfd        *vfdP;
    1289             : 
    1290             :     Assert(file != 0);
    1291             : 
    1292             :     DO_DB(elog(LOG, "LruDelete %d (%s)",
    1293             :                file, VfdCache[file].fileName));
    1294             : 
    1295        8046 :     vfdP = &VfdCache[file];
    1296             : 
    1297        8046 :     pgaio_closing_fd(vfdP->fd);
    1298             : 
    1299             :     /*
    1300             :      * Close the file.  We aren't expecting this to fail; if it does, better
    1301             :      * to leak the FD than to mess up our internal state.
    1302             :      */
    1303        8046 :     if (close(vfdP->fd) != 0)
    1304           0 :         elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
    1305             :              "could not close file \"%s\": %m", vfdP->fileName);
    1306        8046 :     vfdP->fd = VFD_CLOSED;
    1307        8046 :     --nfile;
    1308             : 
    1309             :     /* delete the vfd record from the LRU ring */
    1310        8046 :     Delete(file);
    1311        8046 : }
    1312             : 
    1313             : static void
    1314     3738936 : Insert(File file)
    1315             : {
    1316             :     Vfd        *vfdP;
    1317             : 
    1318             :     Assert(file != 0);
    1319             : 
    1320             :     DO_DB(elog(LOG, "Insert %d (%s)",
    1321             :                file, VfdCache[file].fileName));
    1322             :     DO_DB(_dump_lru());
    1323             : 
    1324     3738936 :     vfdP = &VfdCache[file];
    1325             : 
    1326     3738936 :     vfdP->lruMoreRecently = 0;
    1327     3738936 :     vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
    1328     3738936 :     VfdCache[0].lruLessRecently = file;
    1329     3738936 :     VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
    1330             : 
    1331             :     DO_DB(_dump_lru());
    1332     3738936 : }
    1333             : 
    1334             : /* returns 0 on success, -1 on re-open failure (with errno set) */
    1335             : static int
    1336         112 : LruInsert(File file)
    1337             : {
    1338             :     Vfd        *vfdP;
    1339             : 
    1340             :     Assert(file != 0);
    1341             : 
    1342             :     DO_DB(elog(LOG, "LruInsert %d (%s)",
    1343             :                file, VfdCache[file].fileName));
    1344             : 
    1345         112 :     vfdP = &VfdCache[file];
    1346             : 
    1347         112 :     if (FileIsNotOpen(file))
    1348             :     {
    1349             :         /* Close excess kernel FDs. */
    1350         112 :         ReleaseLruFiles();
    1351             : 
    1352             :         /*
    1353             :          * The open could still fail for lack of file descriptors, eg due to
    1354             :          * overall system file table being full.  So, be prepared to release
    1355             :          * another FD if necessary...
    1356             :          */
    1357         112 :         vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
    1358             :                                      vfdP->fileMode);
    1359         112 :         if (vfdP->fd < 0)
    1360             :         {
    1361             :             DO_DB(elog(LOG, "re-open failed: %m"));
    1362           0 :             return -1;
    1363             :         }
    1364             :         else
    1365             :         {
    1366         112 :             ++nfile;
    1367             :         }
    1368             :     }
    1369             : 
    1370             :     /*
    1371             :      * put it at the head of the Lru ring
    1372             :      */
    1373             : 
    1374         112 :     Insert(file);
    1375             : 
    1376         112 :     return 0;
    1377             : }
    1378             : 
    1379             : /*
    1380             :  * Release one kernel FD by closing the least-recently-used VFD.
    1381             :  */
    1382             : static bool
    1383        7800 : ReleaseLruFile(void)
    1384             : {
    1385             :     DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
    1386             : 
    1387        7800 :     if (nfile > 0)
    1388             :     {
    1389             :         /*
    1390             :          * There are opened files and so there should be at least one used vfd
    1391             :          * in the ring.
    1392             :          */
    1393             :         Assert(VfdCache[0].lruMoreRecently != 0);
    1394        7800 :         LruDelete(VfdCache[0].lruMoreRecently);
    1395        7800 :         return true;            /* freed a file */
    1396             :     }
    1397           0 :     return false;               /* no files available to free */
    1398             : }
    1399             : 
    1400             : /*
    1401             :  * Release kernel FDs as needed to get under the max_safe_fds limit.
    1402             :  * After calling this, it's OK to try to open another file.
    1403             :  */
    1404             : static void
    1405    19293836 : ReleaseLruFiles(void)
    1406             : {
    1407    19301636 :     while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
    1408             :     {
    1409        7800 :         if (!ReleaseLruFile())
    1410           0 :             break;
    1411             :     }
    1412    19293836 : }
    1413             : 
    1414             : static File
    1415     3147412 : AllocateVfd(void)
    1416             : {
    1417             :     Index       i;
    1418             :     File        file;
    1419             : 
    1420             :     DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
    1421             : 
    1422             :     Assert(SizeVfdCache > 0);    /* InitFileAccess not called? */
    1423             : 
    1424     3147412 :     if (VfdCache[0].nextFree == 0)
    1425             :     {
    1426             :         /*
    1427             :          * The free list is empty so it is time to increase the size of the
    1428             :          * array.  We choose to double it each time this happens. However,
    1429             :          * there's not much point in starting *real* small.
    1430             :          */
    1431       54368 :         Size        newCacheSize = SizeVfdCache * 2;
    1432             :         Vfd        *newVfdCache;
    1433             : 
    1434       54368 :         if (newCacheSize < 32)
    1435       36644 :             newCacheSize = 32;
    1436             : 
    1437             :         /*
    1438             :          * Be careful not to clobber VfdCache ptr if realloc fails.
    1439             :          */
    1440       54368 :         newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
    1441       54368 :         if (newVfdCache == NULL)
    1442           0 :             ereport(ERROR,
    1443             :                     (errcode(ERRCODE_OUT_OF_MEMORY),
    1444             :                      errmsg("out of memory")));
    1445       54368 :         VfdCache = newVfdCache;
    1446             : 
    1447             :         /*
    1448             :          * Initialize the new entries and link them into the free list.
    1449             :          */
    1450     2730812 :         for (i = SizeVfdCache; i < newCacheSize; i++)
    1451             :         {
    1452    21411552 :             MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
    1453     2676444 :             VfdCache[i].nextFree = i + 1;
    1454     2676444 :             VfdCache[i].fd = VFD_CLOSED;
    1455             :         }
    1456       54368 :         VfdCache[newCacheSize - 1].nextFree = 0;
    1457       54368 :         VfdCache[0].nextFree = SizeVfdCache;
    1458             : 
    1459             :         /*
    1460             :          * Record the new size
    1461             :          */
    1462       54368 :         SizeVfdCache = newCacheSize;
    1463             :     }
    1464             : 
    1465     3147412 :     file = VfdCache[0].nextFree;
    1466             : 
    1467     3147412 :     VfdCache[0].nextFree = VfdCache[file].nextFree;
    1468             : 
    1469     3147412 :     return file;
    1470             : }
    1471             : 
    1472             : static void
    1473     2047010 : FreeVfd(File file)
    1474             : {
    1475     2047010 :     Vfd        *vfdP = &VfdCache[file];
    1476             : 
    1477             :     DO_DB(elog(LOG, "FreeVfd: %d (%s)",
    1478             :                file, vfdP->fileName ? vfdP->fileName : ""));
    1479             : 
    1480     2047010 :     if (vfdP->fileName != NULL)
    1481             :     {
    1482     1116854 :         free(vfdP->fileName);
    1483     1116854 :         vfdP->fileName = NULL;
    1484             :     }
    1485     2047010 :     vfdP->fdstate = 0x0;
    1486             : 
    1487     2047010 :     vfdP->nextFree = VfdCache[0].nextFree;
    1488     2047010 :     VfdCache[0].nextFree = file;
    1489     2047010 : }
    1490             : 
    1491             : /* returns 0 on success, -1 on re-open failure (with errno set) */
    1492             : static int
    1493     5992370 : FileAccess(File file)
    1494             : {
    1495             :     int         returnValue;
    1496             : 
    1497             :     DO_DB(elog(LOG, "FileAccess %d (%s)",
    1498             :                file, VfdCache[file].fileName));
    1499             : 
    1500             :     /*
    1501             :      * Is the file open?  If not, open it and put it at the head of the LRU
    1502             :      * ring (possibly closing the least recently used file to get an FD).
    1503             :      */
    1504             : 
    1505     5992370 :     if (FileIsNotOpen(file))
    1506             :     {
    1507         112 :         returnValue = LruInsert(file);
    1508         112 :         if (returnValue != 0)
    1509           0 :             return returnValue;
    1510             :     }
    1511     5992258 :     else if (VfdCache[0].lruLessRecently != file)
    1512             :     {
    1513             :         /*
    1514             :          * We now know that the file is open and that it is not the last one
    1515             :          * accessed, so we need to move it to the head of the Lru ring.
    1516             :          */
    1517             : 
    1518     1521568 :         Delete(file);
    1519     1521568 :         Insert(file);
    1520             :     }
    1521             : 
    1522     5992370 :     return 0;
    1523             : }
    1524             : 
    1525             : /*
    1526             :  * Called whenever a temporary file is deleted to report its size.
    1527             :  */
    1528             : static void
    1529        4424 : ReportTemporaryFileUsage(const char *path, off_t size)
    1530             : {
    1531        4424 :     pgstat_report_tempfile(size);
    1532             : 
    1533        4424 :     if (log_temp_files >= 0)
    1534             :     {
    1535        1460 :         if ((size / 1024) >= log_temp_files)
    1536         224 :             ereport(LOG,
    1537             :                     (errmsg("temporary file: path \"%s\", size %lu",
    1538             :                             path, (unsigned long) size)));
    1539             :     }
    1540        4424 : }
    1541             : 
    1542             : /*
    1543             :  * Called to register a temporary file for automatic close.
    1544             :  * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
    1545             :  * before the file was opened.
    1546             :  */
    1547             : static void
    1548        7350 : RegisterTemporaryFile(File file)
    1549             : {
    1550        7350 :     ResourceOwnerRememberFile(CurrentResourceOwner, file);
    1551        7350 :     VfdCache[file].resowner = CurrentResourceOwner;
    1552             : 
    1553             :     /* Backup mechanism for closing at end of xact. */
    1554        7350 :     VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
    1555        7350 :     have_xact_temporary_files = true;
    1556        7350 : }
    1557             : 
    1558             : /*
    1559             :  *  Called when we get a shared invalidation message on some relation.
    1560             :  */
    1561             : #ifdef NOT_USED
    1562             : void
    1563             : FileInvalidate(File file)
    1564             : {
    1565             :     Assert(FileIsValid(file));
    1566             :     if (!FileIsNotOpen(file))
    1567             :         LruDelete(file);
    1568             : }
    1569             : #endif
    1570             : 
    1571             : /*
    1572             :  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
    1573             :  * fileMode parameter.
    1574             :  */
    1575             : File
    1576     3147412 : PathNameOpenFile(const char *fileName, int fileFlags)
    1577             : {
    1578     3147412 :     return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
    1579             : }
    1580             : 
    1581             : /*
    1582             :  * open a file in an arbitrary directory
    1583             :  *
    1584             :  * NB: if the passed pathname is relative (which it usually is),
    1585             :  * it will be interpreted relative to the process' working directory
    1586             :  * (which should always be $PGDATA when this code is running).
    1587             :  */
    1588             : File
    1589     3147412 : PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
    1590             : {
    1591             :     char       *fnamecopy;
    1592             :     File        file;
    1593             :     Vfd        *vfdP;
    1594             : 
    1595             :     DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
    1596             :                fileName, fileFlags, fileMode));
    1597             : 
    1598             :     /*
    1599             :      * We need a malloc'd copy of the file name; fail cleanly if no room.
    1600             :      */
    1601     3147412 :     fnamecopy = strdup(fileName);
    1602     3147412 :     if (fnamecopy == NULL)
    1603           0 :         ereport(ERROR,
    1604             :                 (errcode(ERRCODE_OUT_OF_MEMORY),
    1605             :                  errmsg("out of memory")));
    1606             : 
    1607     3147412 :     file = AllocateVfd();
    1608     3147412 :     vfdP = &VfdCache[file];
    1609             : 
    1610             :     /* Close excess kernel FDs. */
    1611     3147412 :     ReleaseLruFiles();
    1612             : 
    1613             :     /*
    1614             :      * Descriptors managed by VFDs are implicitly marked O_CLOEXEC.  The
    1615             :      * client shouldn't be expected to know which kernel descriptors are
    1616             :      * currently open, so it wouldn't make sense for them to be inherited by
    1617             :      * executed subprograms.
    1618             :      */
    1619     3147412 :     fileFlags |= O_CLOEXEC;
    1620             : 
    1621     3147412 :     vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
    1622             : 
    1623     3147412 :     if (vfdP->fd < 0)
    1624             :     {
    1625      930156 :         int         save_errno = errno;
    1626             : 
    1627      930156 :         FreeVfd(file);
    1628      930156 :         free(fnamecopy);
    1629      930156 :         errno = save_errno;
    1630      930156 :         return -1;
    1631             :     }
    1632     2217256 :     ++nfile;
    1633             :     DO_DB(elog(LOG, "PathNameOpenFile: success %d",
    1634             :                vfdP->fd));
    1635             : 
    1636     2217256 :     vfdP->fileName = fnamecopy;
    1637             :     /* Saved flags are adjusted to be OK for re-opening file */
    1638     2217256 :     vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
    1639     2217256 :     vfdP->fileMode = fileMode;
    1640     2217256 :     vfdP->fileSize = 0;
    1641     2217256 :     vfdP->fdstate = 0x0;
    1642     2217256 :     vfdP->resowner = NULL;
    1643             : 
    1644     2217256 :     Insert(file);
    1645             : 
    1646     2217256 :     return file;
    1647             : }
    1648             : 
    1649             : /*
    1650             :  * Create directory 'directory'.  If necessary, create 'basedir', which must
    1651             :  * be the directory above it.  This is designed for creating the top-level
    1652             :  * temporary directory on demand before creating a directory underneath it.
    1653             :  * Do nothing if the directory already exists.
    1654             :  *
    1655             :  * Directories created within the top-level temporary directory should begin
    1656             :  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
    1657             :  * deleted at startup by RemovePgTempFiles().  Further subdirectories below
    1658             :  * that do not need any particular prefix.
    1659             : */
    1660             : void
    1661         352 : PathNameCreateTemporaryDir(const char *basedir, const char *directory)
    1662             : {
    1663         352 :     if (MakePGDirectory(directory) < 0)
    1664             :     {
    1665          34 :         if (errno == EEXIST)
    1666          12 :             return;
    1667             : 
    1668             :         /*
    1669             :          * Failed.  Try to create basedir first in case it's missing. Tolerate
    1670             :          * EEXIST to close a race against another process following the same
    1671             :          * algorithm.
    1672             :          */
    1673          22 :         if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
    1674           0 :             ereport(ERROR,
    1675             :                     (errcode_for_file_access(),
    1676             :                      errmsg("cannot create temporary directory \"%s\": %m",
    1677             :                             basedir)));
    1678             : 
    1679             :         /* Try again. */
    1680          22 :         if (MakePGDirectory(directory) < 0 && errno != EEXIST)
    1681           0 :             ereport(ERROR,
    1682             :                     (errcode_for_file_access(),
    1683             :                      errmsg("cannot create temporary subdirectory \"%s\": %m",
    1684             :                             directory)));
    1685             :     }
    1686             : }
    1687             : 
    1688             : /*
    1689             :  * Delete a directory and everything in it, if it exists.
    1690             :  */
    1691             : void
    1692         418 : PathNameDeleteTemporaryDir(const char *dirname)
    1693             : {
    1694             :     struct stat statbuf;
    1695             : 
    1696             :     /* Silently ignore missing directory. */
    1697         418 :     if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
    1698          80 :         return;
    1699             : 
    1700             :     /*
    1701             :      * Currently, walkdir doesn't offer a way for our passed in function to
    1702             :      * maintain state.  Perhaps it should, so that we could tell the caller
    1703             :      * whether this operation succeeded or failed.  Since this operation is
    1704             :      * used in a cleanup path, we wouldn't actually behave differently: we'll
    1705             :      * just log failures.
    1706             :      */
    1707         338 :     walkdir(dirname, unlink_if_exists_fname, false, LOG);
    1708             : }
    1709             : 
    1710             : /*
    1711             :  * Open a temporary file that will disappear when we close it.
    1712             :  *
    1713             :  * This routine takes care of generating an appropriate tempfile name.
    1714             :  * There's no need to pass in fileFlags or fileMode either, since only
    1715             :  * one setting makes any sense for a temp file.
    1716             :  *
    1717             :  * Unless interXact is true, the file is remembered by CurrentResourceOwner
    1718             :  * to ensure it's closed and deleted when it's no longer needed, typically at
    1719             :  * the end-of-transaction. In most cases, you don't want temporary files to
    1720             :  * outlive the transaction that created them, so this should be false -- but
    1721             :  * if you need "somewhat" temporary storage, this might be useful. In either
    1722             :  * case, the file is removed when the File is explicitly closed.
    1723             :  */
    1724             : File
    1725        2414 : OpenTemporaryFile(bool interXact)
    1726             : {
    1727        2414 :     File        file = 0;
    1728             : 
    1729             :     Assert(temporary_files_allowed);    /* check temp file access is up */
    1730             : 
    1731             :     /*
    1732             :      * Make sure the current resource owner has space for this File before we
    1733             :      * open it, if we'll be registering it below.
    1734             :      */
    1735        2414 :     if (!interXact)
    1736        2414 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    1737             : 
    1738             :     /*
    1739             :      * If some temp tablespace(s) have been given to us, try to use the next
    1740             :      * one.  If a given tablespace can't be found, we silently fall back to
    1741             :      * the database's default tablespace.
    1742             :      *
    1743             :      * BUT: if the temp file is slated to outlive the current transaction,
    1744             :      * force it into the database's default tablespace, so that it will not
    1745             :      * pose a threat to possible tablespace drop attempts.
    1746             :      */
    1747        2414 :     if (numTempTableSpaces > 0 && !interXact)
    1748             :     {
    1749           2 :         Oid         tblspcOid = GetNextTempTableSpace();
    1750             : 
    1751           2 :         if (OidIsValid(tblspcOid))
    1752           2 :             file = OpenTemporaryFileInTablespace(tblspcOid, false);
    1753             :     }
    1754             : 
    1755             :     /*
    1756             :      * If not, or if tablespace is bad, create in database's default
    1757             :      * tablespace.  MyDatabaseTableSpace should normally be set before we get
    1758             :      * here, but just in case it isn't, fall back to pg_default tablespace.
    1759             :      */
    1760        2414 :     if (file <= 0)
    1761        2412 :         file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
    1762             :                                              MyDatabaseTableSpace :
    1763             :                                              DEFAULTTABLESPACE_OID,
    1764             :                                              true);
    1765             : 
    1766             :     /* Mark it for deletion at close and temporary file size limit */
    1767        2414 :     VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
    1768             : 
    1769             :     /* Register it with the current resource owner */
    1770        2414 :     if (!interXact)
    1771        2414 :         RegisterTemporaryFile(file);
    1772             : 
    1773        2414 :     return file;
    1774             : }
    1775             : 
    1776             : /*
    1777             :  * Return the path of the temp directory in a given tablespace.
                        *
                        * The result is written into 'path' with snprintf() bounded by MAXPGPATH,
                        * so the caller's buffer must have room for at least MAXPGPATH bytes.
                        *
                        * InvalidOid, DEFAULTTABLESPACE_OID and GLOBALTABLESPACE_OID all yield
                        * "base/" followed by PG_TEMP_FILES_DIR.  Any other OID yields
                        * PG_TBLSPC_DIR/<oid>/TABLESPACE_VERSION_DIRECTORY/PG_TEMP_FILES_DIR.
                        * Nothing is created or checked on disk here; only the string is built.
    1778             :  */
    1779             : void
    1780       14736 : TempTablespacePath(char *path, Oid tablespace)
    1781             : {
    1782             :     /*
    1783             :      * Identify the tempfile directory for this tablespace.
    1784             :      *
    1785             :      * If someone tries to specify pg_global, use pg_default instead.
    1786             :      */
    1787       14736 :     if (tablespace == InvalidOid ||
    1788           2 :         tablespace == DEFAULTTABLESPACE_OID ||
    1789             :         tablespace == GLOBALTABLESPACE_OID)
    1790       14734 :         snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
    1791             :     else
    1792             :     {
    1793             :         /* All other tablespaces are accessed via symlinks */
    1794           2 :         snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
    1795             :                  PG_TBLSPC_DIR, tablespace, TABLESPACE_VERSION_DIRECTORY,
    1796             :                  PG_TEMP_FILES_DIR);
    1797             :     }
    1798       14736 : }
    1799             : 
    1800             : /*
    1801             :  * Open a temporary file in a specific tablespace.
    1802             :  * Subroutine for OpenTemporaryFile, which see for details.
                        *
                        * The file name is the tablespace's temp directory (as produced by
                        * TempTablespacePath) plus PG_TEMP_FILE_PREFIX, MyProcPid and the current
                        * value of tempFileCounter, which is incremented on every call.
                        *
                        * Returns the result of PathNameOpenFile().  If the first open attempt
                        * fails, the temp directory is created and the open is retried once.  If
                        * the retry also fails, an ERROR is raised when rejectError is true;
                        * otherwise the non-positive result is handed back to the caller.
    1803             :  */
    1804             : static File
    1805        2414 : OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
    1806             : {
    1807             :     char        tempdirpath[MAXPGPATH];
    1808             :     char        tempfilepath[MAXPGPATH];
    1809             :     File        file;
    1810             : 
    1811        2414 :     TempTablespacePath(tempdirpath, tblspcOid);
    1812             : 
    1813             :     /*
    1814             :      * Generate a tempfile name that should be unique within the current
    1815             :      * database instance.
    1816             :      */
    1817        2414 :     snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
    1818             :              tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
    1819             : 
    1820             :     /*
    1821             :      * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
    1822             :      * temp file that can be reused.
    1823             :      */
    1824        2414 :     file = PathNameOpenFile(tempfilepath,
    1825             :                             O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
    1826        2414 :     if (file <= 0)
    1827             :     {
    1828             :         /*
    1829             :          * We might need to create the tablespace's tempfile directory, if no
    1830             :          * one has yet done so.
    1831             :          *
    1832             :          * Don't check for an error from MakePGDirectory; it could fail if
    1833             :          * someone else just did the same thing.  If it doesn't work then
    1834             :          * we'll bomb out on the second create attempt, instead.
    1835             :          */
    1836         186 :         (void) MakePGDirectory(tempdirpath);
    1837             : 
    1838         186 :         file = PathNameOpenFile(tempfilepath,
    1839             :                                 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
    1840         186 :         if (file <= 0 && rejectError)
    1841           0 :             elog(ERROR, "could not create temporary file \"%s\": %m",
    1842             :                  tempfilepath);
    1843             :     }
    1844             : 
    1845        2414 :     return file;
    1846             : }
    1847             : 
    1848             : 
    1849             : /*
    1850             :  * Create a new file.  The directory containing it must already exist.  Files
    1851             :  * created this way are subject to temp_file_limit and are automatically
    1852             :  * closed at end of transaction, but are not automatically deleted on close
    1853             :  * because they are intended to be shared between cooperating backends.
    1854             :  *
    1855             :  * If the file is inside the top-level temporary directory, its name should
    1856             :  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
    1857             :  * and deleted at startup by RemovePgTempFiles().  Alternatively, it can be
    1858             :  * inside a directory created with PathNameCreateTemporaryDir(), in which case
    1859             :  * the prefix isn't needed.
                        *
                        * On success the file is flagged FD_TEMP_FILE_LIMIT and registered through
                        * RegisterTemporaryFile().  If the open fails, an ERROR is reported when
                        * error_on_failure is true; otherwise the non-positive result of
                        * PathNameOpenFile() is returned and nothing is flagged or registered.
    1860             :  */
    1861             : File
    1862        2362 : PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
    1863             : {
    1864             :     File        file;
    1865             : 
    1866             :     Assert(temporary_files_allowed);    /* check temp file access is up */
    1867             : 
                           /*
                            * NOTE(review): presumably this reserves room in the resource owner ahead
                            * of RegisterTemporaryFile() below -- confirm against the resowner code.
                            */
    1868        2362 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    1869             : 
    1870             :     /*
    1871             :      * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
    1872             :      * temp file that can be reused.
    1873             :      */
    1874        2362 :     file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
    1875        2362 :     if (file <= 0)
    1876             :     {
    1877         352 :         if (error_on_failure)
    1878           0 :             ereport(ERROR,
    1879             :                     (errcode_for_file_access(),
    1880             :                      errmsg("could not create temporary file \"%s\": %m",
    1881             :                             path)));
    1882             :         else
    1883         352 :             return file;
    1884             :     }
    1885             : 
    1886             :     /* Mark it for temp_file_limit accounting. */
    1887        2010 :     VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
    1888             : 
    1889             :     /* Register it for automatic close. */
    1890        2010 :     RegisterTemporaryFile(file);
    1891             : 
    1892        2010 :     return file;
    1893             : }
    1894             : 
    1895             : /*
    1896             :  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
    1897             :  * another backend.  Files opened this way don't count against the
    1898             :  * temp_file_limit of the caller, are automatically closed at the end of the
    1899             :  * transaction but are not deleted on close.
                        *
                        * 'mode' is passed to PathNameOpenFile() with PG_BINARY OR'ed in.
                        *
                        * Returns the opened File, which is also registered through
                        * RegisterTemporaryFile().  If the open fails with errno == ENOENT, the
                        * non-positive result is returned without raising an error, so callers
                        * must check the result.  Any other open failure is reported as an ERROR.
    1900             :  */
    1901             : File
    1902        6398 : PathNameOpenTemporaryFile(const char *path, int mode)
    1903             : {
    1904             :     File        file;
    1905             : 
    1906             :     Assert(temporary_files_allowed);    /* check temp file access is up */
    1907             : 
    1908        6398 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    1909             : 
    1910        6398 :     file = PathNameOpenFile(path, mode | PG_BINARY);
    1911             : 
    1912             :     /* If no such file, then we don't raise an error. */
    1913        6398 :     if (file <= 0 && errno != ENOENT)
    1914           0 :         ereport(ERROR,
    1915             :                 (errcode_for_file_access(),
    1916             :                  errmsg("could not open temporary file \"%s\": %m",
    1917             :                         path)));
    1918             : 
    1919        6398 :     if (file > 0)
    1920             :     {
    1921             :         /* Register it for automatic close. */
    1922        2926 :         RegisterTemporaryFile(file);
    1923             :     }
    1924             : 
    1925        6398 :     return file;
    1926             : }
    1927             : 
    1928             : /*
    1929             :  * Delete a file by pathname.  Return true if the file existed, false if
    1930             :  * it didn't.
                        *
                        * More precisely: false is returned when stat() fails with ENOENT, and
                        * also whenever unlink() fails; true is returned once unlink() has
                        * succeeded.  An unlink() failure other than ENOENT is reported at ERROR
                        * level if error_on_failure is true, else at LOG level.
                        *
                        * After a successful unlink, the size obtained from stat() is passed to
                        * ReportTemporaryFileUsage().  If stat() had failed for a reason other
                        * than ENOENT, that failure is logged at LOG level instead.
    1931             :  */
    1932             : bool
    1933        4724 : PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
    1934             : {
    1935             :     struct stat filestats;
    1936             :     int         stat_errno;
    1937             : 
    1938             :     /* Get the final size for pgstat reporting. */
    1939        4724 :     if (stat(path, &filestats) != 0)
    1940        2714 :         stat_errno = errno;
    1941             :     else
    1942        2010 :         stat_errno = 0;
    1943             : 
    1944             :     /*
    1945             :      * Unlike FileClose's automatic file deletion code, we tolerate
    1946             :      * non-existence to support BufFileDeleteFileSet which doesn't know how
    1947             :      * many segments it has to delete until it runs out.
    1948             :      */
    1949        4724 :     if (stat_errno == ENOENT)
    1950        2714 :         return false;
    1951             : 
    1952        2010 :     if (unlink(path) < 0)
    1953             :     {
    1954           0 :         if (errno != ENOENT)
    1955           0 :             ereport(error_on_failure ? ERROR : LOG,
    1956             :                     (errcode_for_file_access(),
    1957             :                      errmsg("could not unlink temporary file \"%s\": %m",
    1958             :                             path)));
    1959           0 :         return false;
    1960             :     }
    1961             : 
    1962        2010 :     if (stat_errno == 0)
    1963        2010 :         ReportTemporaryFileUsage(path, filestats.st_size);
    1964             :     else
    1965             :     {
    1966           0 :         errno = stat_errno;
    1967           0 :         ereport(LOG,
    1968             :                 (errcode_for_file_access(),
    1969             :                  errmsg("could not stat file \"%s\": %m", path)));
    1970             :     }
    1971             : 
    1972        2010 :     return true;
    1973             : }
    1974             : 
    1975             : /*
    1976             :  * close a file when done with it
                        *
                        * Steps, in order:
                        *   1. If the VFD currently has an open kernel descriptor, tell the AIO
                        *      layer via pgaio_closing_fd(), close() it, decrement nfile, mark the
                        *      VFD as VFD_CLOSED and take it out of the LRU ring.
                        *   2. If the VFD is flagged FD_TEMP_FILE_LIMIT, subtract its fileSize
                        *      from temporary_files_size.
                        *   3. If the VFD is flagged FD_DELETE_AT_CLOSE, stat() and unlink() the
                        *      file and report its size through ReportTemporaryFileUsage().
                        *      Failures of stat() or unlink() are only logged at LOG level.
                        *   4. Forget the file in its resource owner, if it has one.
                        *   5. Return the VFD slot to the free list with FreeVfd().
                        *
                        * 'file' must satisfy FileIsValid(); this is only checked by an Assert.
    1977             :  */
    1978             : void
    1979     1116854 : FileClose(File file)
    1980             : {
    1981             :     Vfd        *vfdP;
    1982             : 
    1983             :     Assert(FileIsValid(file));
    1984             : 
    1985             :     DO_DB(elog(LOG, "FileClose: %d (%s)",
    1986             :                file, VfdCache[file].fileName));
    1987             : 
    1988     1116854 :     vfdP = &VfdCache[file];
    1989             : 
    1990     1116854 :     if (!FileIsNotOpen(file))
    1991             :     {
    1992     1114744 :         pgaio_closing_fd(vfdP->fd);
    1993             : 
    1994             :         /* close the file */
    1995     1114744 :         if (close(vfdP->fd) != 0)
    1996             :         {
    1997             :             /*
    1998             :              * We may need to panic on failure to close non-temporary files;
    1999             :              * see LruDelete.
    2000             :              */
    2001           0 :             elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
    2002             :                  "could not close file \"%s\": %m", vfdP->fileName);
    2003             :         }
    2004             : 
    2005     1114744 :         --nfile;
    2006     1114744 :         vfdP->fd = VFD_CLOSED;
    2007             : 
    2008             :         /* remove the file from the lru ring */
    2009     1114744 :         Delete(file);
    2010             :     }
    2011             : 
    2012     1116854 :     if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
    2013             :     {
    2014             :         /* Subtract its size from current usage (do first in case of error) */
    2015        4424 :         temporary_files_size -= vfdP->fileSize;
    2016        4424 :         vfdP->fileSize = 0;
    2017             :     }
    2018             : 
    2019             :     /*
    2020             :      * Delete the file if it was temporary, and make a log entry if wanted
    2021             :      */
    2022     1116854 :     if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
    2023             :     {
    2024             :         struct stat filestats;
    2025             :         int         stat_errno;
    2026             : 
    2027             :         /*
    2028             :          * If we get an error, as could happen within the ereport/elog calls,
    2029             :          * we'll come right back here during transaction abort.  Reset the
    2030             :          * flag to ensure that we can't get into an infinite loop.  This code
    2031             :          * is arranged to ensure that the worst-case consequence is failing to
    2032             :          * emit log message(s), not failing to attempt the unlink.
    2033             :          */
    2034        2414 :         vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
    2035             : 
    2036             : 
    2037             :         /* first try the stat() */
    2038        2414 :         if (stat(vfdP->fileName, &filestats))
    2039           0 :             stat_errno = errno;
    2040             :         else
    2041        2414 :             stat_errno = 0;
    2042             : 
    2043             :         /* in any case do the unlink */
    2044        2414 :         if (unlink(vfdP->fileName))
    2045           0 :             ereport(LOG,
    2046             :                     (errcode_for_file_access(),
    2047             :                      errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
    2048             : 
    2049             :         /* and last report the stat results */
    2050        2414 :         if (stat_errno == 0)
    2051        2414 :             ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
    2052             :         else
    2053             :         {
    2054           0 :             errno = stat_errno;
    2055           0 :             ereport(LOG,
    2056             :                     (errcode_for_file_access(),
    2057             :                      errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
    2058             :         }
    2059             :     }
    2060             : 
    2061             :     /* Unregister it from the resource owner */
    2062     1116854 :     if (vfdP->resowner)
    2063        7342 :         ResourceOwnerForgetFile(vfdP->resowner, file);
    2064             : 
    2065             :     /*
    2066             :      * Return the Vfd slot to the free list
    2067             :      */
    2068     1116854 :     FreeVfd(file);
    2069     1116854 : }
    2070             : 
    2071             : /*
    2072             :  * FilePrefetch - initiate asynchronous read of a given range of the file.
    2073             :  *
    2074             :  * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
    2075             :  *
    2076             :  * posix_fadvise() is the simplest standardized interface that accomplishes
    2077             :  * this.
                        *
                        * Three build variants exist.  With USE_POSIX_FADVISE and
                        * POSIX_FADV_WILLNEED defined, posix_fadvise() is called and re-issued
                        * for as long as it returns EINTR.  Otherwise, on __darwin__,
                        * fcntl(F_RDADVISE) is used and a -1 result is converted into errno; note
                        * that 'amount' is stored into an int field there.  In all other builds
                        * the function does nothing and returns 0.
                        *
                        * The advisory call is bracketed by pgstat_report_wait_start() and
                        * pgstat_report_wait_end() using 'wait_event_info'.
                        *
                        * NOTE(review): when FileAccess() fails, its negative result is returned
                        * unchanged, which is not an errno code as described above.
    2078             :  */
    2079             : int
    2080       16914 : FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
    2081             : {
    2082             :     Assert(FileIsValid(file));
    2083             : 
    2084             :     DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
    2085             :                file, VfdCache[file].fileName,
    2086             :                (int64) offset, (int64) amount));
    2087             : 
    2088             : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
    2089             :     {
    2090             :         int         returnCode;
    2091             : 
    2092       16914 :         returnCode = FileAccess(file);
    2093       16914 :         if (returnCode < 0)
    2094           0 :             return returnCode;
    2095             : 
    2096       16914 : retry:
    2097       16914 :         pgstat_report_wait_start(wait_event_info);
    2098       16914 :         returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
    2099             :                                    POSIX_FADV_WILLNEED);
    2100       16914 :         pgstat_report_wait_end();
    2101             : 
    2102       16914 :         if (returnCode == EINTR)
    2103           0 :             goto retry;
    2104             : 
    2105       16914 :         return returnCode;
    2106             :     }
    2107             : #elif defined(__darwin__)
    2108             :     {
    2109             :         struct radvisory
    2110             :         {
    2111             :             off_t       ra_offset;  /* offset into the file */
    2112             :             int         ra_count;   /* size of the read     */
    2113             :         }           ra;
    2114             :         int         returnCode;
    2115             : 
    2116             :         returnCode = FileAccess(file);
    2117             :         if (returnCode < 0)
    2118             :             return returnCode;
    2119             : 
    2120             :         ra.ra_offset = offset;
    2121             :         ra.ra_count = amount;
    2122             :         pgstat_report_wait_start(wait_event_info);
    2123             :         returnCode = fcntl(VfdCache[file].fd, F_RDADVISE, &ra);
    2124             :         pgstat_report_wait_end();
    2125             :         if (returnCode != -1)
    2126             :             return 0;
    2127             :         else
    2128             :             return errno;
    2129             :     }
    2130             : #else
    2131             :     return 0;
    2132             : #endif
    2133             : }
    2134             : 
                       /*
                        * FileWriteback - hand a byte range of the file to pg_flush_data().
                        *
                        * The range is 'nbytes' bytes starting at 'offset'.  Nothing is done, and
                        * no error is raised, when nbytes <= 0, when the file's fileFlags include
                        * PG_O_DIRECT, or when FileAccess() fails.  No status is returned to the
                        * caller in any case.
                        *
                        * The pg_flush_data() call is bracketed by pgstat_report_wait_start() and
                        * pgstat_report_wait_end() using 'wait_event_info'.
                        */
    2135             : void
    2136           0 : FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
    2137             : {
    2138             :     int         returnCode;
    2139             : 
    2140             :     Assert(FileIsValid(file));
    2141             : 
    2142             :     DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
    2143             :                file, VfdCache[file].fileName,
    2144             :                (int64) offset, (int64) nbytes));
    2145             : 
    2146           0 :     if (nbytes <= 0)
    2147           0 :         return;
    2148             : 
    2149           0 :     if (VfdCache[file].fileFlags & PG_O_DIRECT)
    2150           0 :         return;
    2151             : 
    2152           0 :     returnCode = FileAccess(file);
    2153           0 :     if (returnCode < 0)
    2154           0 :         return;
    2155             : 
    2156           0 :     pgstat_report_wait_start(wait_event_info);
    2157           0 :     pg_flush_data(VfdCache[file].fd, offset, nbytes);
    2158           0 :     pgstat_report_wait_end();
    2159             : }
    2160             : 
                       /*
                        * FileReadV - vectored read at an explicit file offset.
                        *
                        * Passes 'iov' (with 'iovcnt' entries) and 'offset' to pg_preadv() on the
                        * descriptor behind 'file', and returns what pg_preadv() returns.  If
                        * FileAccess() fails, its negative result is returned instead and no read
                        * is attempted.
                        *
                        * A failed read is retried when errno is EINTR.  On WIN32,
                        * ERROR_NO_SYSTEM_RESOURCES is turned into EINTR after a pg_usleep(), so
                        * it is retried as well; other Windows errors are mapped by _dosmaperr().
                        *
                        * The read is bracketed by pgstat_report_wait_start() and
                        * pgstat_report_wait_end() using 'wait_event_info'.
                        */
    2161             : ssize_t
    2162      810460 : FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset,
    2163             :           uint32 wait_event_info)
    2164             : {
    2165             :     ssize_t     returnCode;
    2166             :     Vfd        *vfdP;
    2167             : 
    2168             :     Assert(FileIsValid(file));
    2169             : 
    2170             :     DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
    2171             :                file, VfdCache[file].fileName,
    2172             :                (int64) offset,
    2173             :                iovcnt));
    2174             : 
    2175      810460 :     returnCode = FileAccess(file);
    2176      810460 :     if (returnCode < 0)
    2177           0 :         return returnCode;
    2178             : 
    2179      810460 :     vfdP = &VfdCache[file];
    2180             : 
    2181      810460 : retry:
    2182      810460 :     pgstat_report_wait_start(wait_event_info);
    2183      810460 :     returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
    2184      810460 :     pgstat_report_wait_end();
    2185             : 
    2186      810460 :     if (returnCode < 0)
    2187             :     {
    2188             :         /*
    2189             :          * Windows may run out of kernel buffers and return "Insufficient
    2190             :          * system resources" error.  Wait a bit and retry to solve it.
    2191             :          *
    2192             :          * It is rumored that EINTR is also possible on some Unix filesystems,
    2193             :          * in which case immediate retry is indicated.
    2194             :          */
    2195             : #ifdef WIN32
    2196             :         DWORD       error = GetLastError();
    2197             : 
    2198             :         switch (error)
    2199             :         {
    2200             :             case ERROR_NO_SYSTEM_RESOURCES:
    2201             :                 pg_usleep(1000L);
    2202             :                 errno = EINTR;
    2203             :                 break;
    2204             :             default:
    2205             :                 _dosmaperr(error);
    2206             :                 break;
    2207             :         }
    2208             : #endif
    2209             :         /* OK to retry if interrupted */
    2210           0 :         if (errno == EINTR)
    2211           0 :             goto retry;
    2212             :     }
    2213             : 
    2214      810460 :     return returnCode;
    2215             : }
    2216             : 
                       /*
                        * FileStartReadV - start a vectored read through an AIO handle.
                        *
                        * After FileAccess() succeeds, the descriptor behind 'file' is handed to
                        * pgaio_io_start_readv() together with 'ioh', 'iovcnt' and 'offset'.
                        *
                        * Returns 0 once the read has been handed over, or the negative result of
                        * FileAccess() if that failed.  The outcome of the read itself is not
                        * reported here.
                        *
                        * Note that 'wait_event_info' is not referenced anywhere in this
                        * function's body.
                        */
    2217             : int
    2218     2407282 : FileStartReadV(PgAioHandle *ioh, File file,
    2219             :                int iovcnt, off_t offset,
    2220             :                uint32 wait_event_info)
    2221             : {
    2222             :     int         returnCode;
    2223             :     Vfd        *vfdP;
    2224             : 
    2225             :     Assert(FileIsValid(file));
    2226             : 
    2227             :     DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
    2228             :                file, VfdCache[file].fileName,
    2229             :                (int64) offset,
    2230             :                iovcnt));
    2231             : 
    2232     2407282 :     returnCode = FileAccess(file);
    2233     2407282 :     if (returnCode < 0)
    2234           0 :         return returnCode;
    2235             : 
    2236     2407282 :     vfdP = &VfdCache[file];
    2237             : 
    2238     2407282 :     pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
    2239             : 
    2240     2407282 :     return 0;
    2241             : }
    2242             : 
                       /*
                        * FileWriteV - vectored write at an explicit file offset.
                        *
                        * Passes 'iov' (with 'iovcnt' entries) and 'offset' to pg_pwritev() on the
                        * descriptor behind 'file', and returns what pg_pwritev() returns.  If
                        * FileAccess() fails, its negative result is returned instead and no
                        * write is attempted.  A failed write is retried when errno is EINTR.
                        *
                        * For files flagged FD_TEMP_FILE_LIMIT: when temp_file_limit >= 0, an
                        * ERROR is raised before writing if the write would push
                        * temporary_files_size past temp_file_limit * 1024; after a successful
                        * write, fileSize and temporary_files_size are advanced by the amount
                        * by which the file grew.
                        *
                        * Beware that errno is set to ENOSPC after every successful write; the
                        * comment at that assignment explains why.
                        */
    2243             : ssize_t
    2244     1476066 : FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset,
    2245             :            uint32 wait_event_info)
    2246             : {
    2247             :     ssize_t     returnCode;
    2248             :     Vfd        *vfdP;
    2249             : 
    2250             :     Assert(FileIsValid(file));
    2251             : 
    2252             :     DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
    2253             :                file, VfdCache[file].fileName,
    2254             :                (int64) offset,
    2255             :                iovcnt));
    2256             : 
    2257     1476066 :     returnCode = FileAccess(file);
    2258     1476066 :     if (returnCode < 0)
    2259           0 :         return returnCode;
    2260             : 
    2261     1476066 :     vfdP = &VfdCache[file];
    2262             : 
    2263             :     /*
    2264             :      * If enforcing temp_file_limit and it's a temp file, check to see if the
    2265             :      * write would overrun temp_file_limit, and throw error if so.  Note: it's
    2266             :      * really a modularity violation to throw error here; we should set errno
    2267             :      * and return -1.  However, there's no way to report a suitable error
    2268             :      * message if we do that.  All current callers would just throw error
    2269             :      * immediately anyway, so this is safe at present.
    2270             :      */
    2271     1476066 :     if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
    2272             :     {
    2273           0 :         off_t       past_write = offset;
    2274             : 
    2275           0 :         for (int i = 0; i < iovcnt; ++i)
    2276           0 :             past_write += iov[i].iov_len;
    2277             : 
    2278           0 :         if (past_write > vfdP->fileSize)
    2279             :         {
    2280           0 :             uint64      newTotal = temporary_files_size;
    2281             : 
    2282           0 :             newTotal += past_write - vfdP->fileSize;
    2283           0 :             if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
    2284           0 :                 ereport(ERROR,
    2285             :                         (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
    2286             :                          errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
    2287             :                                 temp_file_limit)));
    2288             :         }
    2289             :     }
    2290             : 
    2291     1476066 : retry:
    2292     1476066 :     pgstat_report_wait_start(wait_event_info);
    2293     1476066 :     returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
    2294     1476066 :     pgstat_report_wait_end();
    2295             : 
    2296     1476066 :     if (returnCode >= 0)
    2297             :     {
    2298             :         /*
    2299             :          * Some callers expect short writes to set errno, and traditionally we
    2300             :          * have assumed that they imply disk space shortage.  We don't want to
    2301             :          * waste CPU cycles adding up the total size here, so we'll just set
    2302             :          * it for all successful writes in case such a caller determines that
    2303             :          * the write was short and ereports "%m".
    2304             :          */
    2305     1476066 :         errno = ENOSPC;
    2306             : 
    2307             :         /*
    2308             :          * Maintain fileSize and temporary_files_size if it's a temp file.
    2309             :          */
    2310     1476066 :         if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
    2311             :         {
    2312      107166 :             off_t       past_write = offset + returnCode;
    2313             : 
    2314      107166 :             if (past_write > vfdP->fileSize)
    2315             :             {
    2316       73826 :                 temporary_files_size += past_write - vfdP->fileSize;
    2317       73826 :                 vfdP->fileSize = past_write;
    2318             :             }
    2319             :         }
    2320             :     }
    2321             :     else
    2322             :     {
    2323             :         /*
    2324             :          * See comments in FileReadV()
    2325             :          */
    2326             : #ifdef WIN32
    2327             :         DWORD       error = GetLastError();
    2328             : 
    2329             :         switch (error)
    2330             :         {
    2331             :             case ERROR_NO_SYSTEM_RESOURCES:
    2332             :                 pg_usleep(1000L);
    2333             :                 errno = EINTR;
    2334             :                 break;
    2335             :             default:
    2336             :                 _dosmaperr(error);
    2337             :                 break;
    2338             :         }
    2339             : #endif
    2340             :         /* OK to retry if interrupted */
    2341           0 :         if (errno == EINTR)
    2342           0 :             goto retry;
    2343             :     }
    2344             : 
    2345     1476066 :     return returnCode;
    2346             : }
    2347             : 
                       /*
                        * FileSync - call pg_fsync() on the descriptor behind 'file'.
                        *
                        * Returns the result of pg_fsync(), or the negative result of
                        * FileAccess() if the file could not be made accessible; no sync is
                        * attempted in that case.
                        *
                        * The pg_fsync() call is bracketed by pgstat_report_wait_start() and
                        * pgstat_report_wait_end() using 'wait_event_info'.
                        */
    2348             : int
    2349        1728 : FileSync(File file, uint32 wait_event_info)
    2350             : {
    2351             :     int         returnCode;
    2352             : 
    2353             :     Assert(FileIsValid(file));
    2354             : 
    2355             :     DO_DB(elog(LOG, "FileSync: %d (%s)",
    2356             :                file, VfdCache[file].fileName));
    2357             : 
    2358        1728 :     returnCode = FileAccess(file);
    2359        1728 :     if (returnCode < 0)
    2360           0 :         return returnCode;
    2361             : 
    2362        1728 :     pgstat_report_wait_start(wait_event_info);
    2363        1728 :     returnCode = pg_fsync(VfdCache[file].fd);
    2364        1728 :     pgstat_report_wait_end();
    2365             : 
    2366        1728 :     return returnCode;
    2367             : }
    2368             : 
    2369             : /*
    2370             :  * Zero a region of the file.
    2371             :  *
    2372             :  * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
    2373             :  * appropriate error.
                        *
                        * The region is 'amount' bytes starting at 'offset', handed to
                        * pg_pwrite_zeros().  If FileAccess() fails, its negative result is
                        * returned as is.  A result from pg_pwrite_zeros() that differs from
                        * 'amount' is treated as failure.
                        *
                        * NOTE(review): errno is not reset to 0 in this function before the
                        * write, so the "errno == 0" test below depends on what earlier calls
                        * left there -- confirm that pg_pwrite_zeros() covers this.
    2374             :  */
    2375             : int
    2376      426718 : FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
    2377             : {
    2378             :     int         returnCode;
    2379             :     ssize_t     written;
    2380             : 
    2381             :     Assert(FileIsValid(file));
    2382             : 
    2383             :     DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
    2384             :                file, VfdCache[file].fileName,
    2385             :                (int64) offset, (int64) amount));
    2386             : 
    2387      426718 :     returnCode = FileAccess(file);
    2388      426718 :     if (returnCode < 0)
    2389           0 :         return returnCode;
    2390             : 
    2391      426718 :     pgstat_report_wait_start(wait_event_info);
    2392      426718 :     written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
    2393      426718 :     pgstat_report_wait_end();
    2394             : 
    2395      426718 :     if (written < 0)
    2396           0 :         return -1;
    2397      426718 :     else if (written != amount)
    2398             :     {
    2399             :         /* if errno is unset, assume problem is no disk space */
    2400           0 :         if (errno == 0)
    2401           0 :             errno = ENOSPC;
    2402           0 :         return -1;
    2403             :     }
    2404             : 
    2405      426718 :     return 0;
    2406             : }
    2407             : 
    2408             : /*
    2409             :  * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
    2410             :  * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
    2411             :  * use FileZero() instead.
    2412             :  *
    2413             :  * Note that at least glibc implements posix_fallocate() in userspace if not
    2414             :  * implemented by the filesystem. That's not the case for all environments
    2415             :  * though.
    2416             :  *
    2417             :  * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
    2418             :  * appropriate error.
                        *
                        * posix_fallocate() reports failure through its return value, so that
                        * value is copied into errno before returning -1.  The call is re-issued
                        * for as long as it returns EINTR.  If FileAccess() fails, -1 is returned
                        * without attempting the allocation.
                        *
                        * In builds without HAVE_POSIX_FALLOCATE the function body reduces to the
                        * FileZero() call at the end.
    2419             :  */
    2420             : int
    2421        1330 : FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
    2422             : {
    2423             : #ifdef HAVE_POSIX_FALLOCATE
    2424             :     int         returnCode;
    2425             : 
    2426             :     Assert(FileIsValid(file));
    2427             : 
    2428             :     DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
    2429             :                file, VfdCache[file].fileName,
    2430             :                (int64) offset, (int64) amount));
    2431             : 
    2432        1330 :     returnCode = FileAccess(file);
    2433        1330 :     if (returnCode < 0)
    2434           0 :         return -1;
    2435             : 
    2436        1330 : retry:
    2437        1330 :     pgstat_report_wait_start(wait_event_info);
    2438        1330 :     returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
    2439        1330 :     pgstat_report_wait_end();
    2440             : 
    2441        1330 :     if (returnCode == 0)
    2442        1330 :         return 0;
    2443           0 :     else if (returnCode == EINTR)
    2444           0 :         goto retry;
    2445             : 
    2446             :     /* for compatibility with %m printing etc */
    2447           0 :     errno = returnCode;
    2448             : 
    2449             :     /*
    2450             :      * Return in cases of a "real" failure, if fallocate is not supported,
    2451             :      * fall through to the FileZero() backed implementation.
    2452             :      */
    2453           0 :     if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
    2454           0 :         return -1;
    2455             : #endif
    2456             : 
    2457           0 :     return FileZero(file, offset, amount, wait_event_info);
    2458             : }
    2459             : 
    2460             : off_t                   /* result of lseek(..., SEEK_END), or (off_t) -1 if FileAccess() fails */
    2461     5097272 : FileSize(File file)
    2462             : {
    2463             :     Assert(FileIsValid(file));
    2464             : 
    2465             :     DO_DB(elog(LOG, "FileSize %d (%s)",
    2466             :                file, VfdCache[file].fileName));
    2467             : 
    2468     5097272 :     if (FileIsNotOpen(file))    /* FileAccess() is called only for a VFD that is not open */
    2469             :     {
    2470          34 :         if (FileAccess(file) < 0)
    2471           0 :             return (off_t) -1;
    2472             :     }
    2473             : 
    2474     5097272 :     return lseek(VfdCache[file].fd, 0, SEEK_END);   /* note: this moves the kernel file position to EOF */
    2475             : }
    2476             : 
    2477             : int                     /* FileAccess()'s negative result, else pg_ftruncate()'s result */
    2478        1086 : FileTruncate(File file, off_t offset, uint32 wait_event_info)
    2479             : {
    2480             :     int         returnCode;
    2481             : 
    2482             :     Assert(FileIsValid(file));
    2483             : 
    2484             :     DO_DB(elog(LOG, "FileTruncate %d (%s)",
    2485             :                file, VfdCache[file].fileName));
    2486             : 
    2487        1086 :     returnCode = FileAccess(file);
    2488        1086 :     if (returnCode < 0)
    2489           0 :         return returnCode;
    2490             : 
    2491        1086 :     pgstat_report_wait_start(wait_event_info);
    2492        1086 :     returnCode = pg_ftruncate(VfdCache[file].fd, offset);
    2493        1086 :     pgstat_report_wait_end();
    2494             : 
    2495        1086 :     if (returnCode == 0 && VfdCache[file].fileSize > offset)    /* only when the tracked size actually shrinks */
    2496             :     {
    2497             :         /* adjust our state for truncation of a temp file */
    2498             :         Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
    2499           0 :         temporary_files_size -= VfdCache[file].fileSize - offset;   /* subtract the bytes cut off */
    2500           0 :         VfdCache[file].fileSize = offset;
    2501             :     }
    2502             : 
    2503        1086 :     return returnCode;
    2504             : }
    2505             : 
    2506             : /*
    2507             :  * Return the pathname associated with an open file.
    2508             :  *
    2509             :  * The returned string is the VFD's own fileName field, not a copy; it is
    2510             :  * valid until the file is closed.
    2511             :  */
    2512             : char *
    2513          44 : FilePathName(File file)
    2514             : {
    2515             :     Assert(FileIsValid(file));
    2516             : 
    2517          44 :     return VfdCache[file].fileName;
    2518             : }
    2519             : 
    2520             : /*
    2521             :  * Return the raw file descriptor of an opened file, or FileAccess()'s
    2522             :  * negative result if the file cannot be accessed.
    2523             :  *
    2524             :  * The returned file descriptor will be valid until the file is closed, but
    2525             :  * there are a lot of things that can make that happen.  So the caller should
    2526             :  * not do much of anything else before it finishes using the descriptor.
    2527             :  */
    2528             : int
    2529      850752 : FileGetRawDesc(File file)
    2530             : {
    2531             :     int         returnCode;
    2532             : 
    2533      850752 :     returnCode = FileAccess(file);
    2534      850752 :     if (returnCode < 0)
    2535           0 :         return returnCode;
    2536             : 
    2537             :     Assert(FileIsValid(file));  /* NOTE(review): runs only after FileAccess() has already used "file" */
    2538      850752 :     return VfdCache[file].fd;
    2539             : }
    2540             : 
    2541             : /*
    2542             :  * FileGetRawFlags - returns the VFD's saved fileFlags (the open(2) flags)
    2543             :  */
    2544             : int
    2545           0 : FileGetRawFlags(File file)
    2546             : {
    2547             :     Assert(FileIsValid(file));
    2548           0 :     return VfdCache[file].fileFlags;    /* plain field read; the file is not (re)opened here */
    2549             : }
    2550             : 
    2551             : /*
    2552             :  * FileGetRawMode - returns the VFD's saved fileMode (the open(2) mode bitmask)
    2553             :  */
    2554             : mode_t
    2555           0 : FileGetRawMode(File file)
    2556             : {
    2557             :     Assert(FileIsValid(file));
    2558           0 :     return VfdCache[file].fileMode; /* plain field read; the file is not (re)opened here */
    2559             : }
    2560             : 
    2561             : /*
    2562             :  * Make room for another allocatedDescs[] array entry if needed and possible.
    2563             :  * Returns true if a slot is available; ereport(ERROR)s if the first malloc fails.
    2564             :  */
    2565             : static bool
    2566    15684792 : reserveAllocatedDesc(void)
    2567             : {
    2568             :     AllocateDesc *newDescs;
    2569             :     int         newMax;
    2570             : 
    2571             :     /* Quick out if array already has a free slot. */
    2572    15684792 :     if (numAllocatedDescs < maxAllocatedDescs)
    2573    15682640 :         return true;
    2574             : 
    2575             :     /*
    2576             :      * If the array hasn't yet been created in the current process, initialize
    2577             :      * it with FD_MINFREE / 3 elements.  In many scenarios this is as many as
    2578             :      * we will ever need, anyway.  We don't want to look at max_safe_fds
    2579             :      * immediately because set_max_safe_fds() may not have run yet.
    2580             :      */
    2581        2152 :     if (allocatedDescs == NULL)
    2582             :     {
    2583        2152 :         newMax = FD_MINFREE / 3;
    2584        2152 :         newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
    2585             :         /* Out of memory already?  Give up with ereport(ERROR). */
    2586        2152 :         if (newDescs == NULL)
    2587           0 :             ereport(ERROR,
    2588             :                     (errcode(ERRCODE_OUT_OF_MEMORY),
    2589             :                      errmsg("out of memory")));
    2590        2152 :         allocatedDescs = newDescs;
    2591        2152 :         maxAllocatedDescs = newMax;
    2592        2152 :         return true;
    2593             :     }
    2594             : 
    2595             :     /*
    2596             :      * Consider enlarging the array beyond the initial allocation used above.
    2597             :      * By the time this happens, max_safe_fds should be known accurately.
    2598             :      *
    2599             :      * We mustn't let allocated descriptors hog all the available FDs, and in
    2600             :      * practice we'd better leave a reasonable number of FDs for VFD use.  So
    2601             :      * set the maximum to max_safe_fds / 3.  (This should certainly be at
    2602             :      * least as large as the initial size, FD_MINFREE / 3, so we aren't
    2603             :      * tightening the restriction here.)  Recall that "external" FDs are
    2604             :      * allowed to consume another third of max_safe_fds.
    2605             :      */
    2606           0 :     newMax = max_safe_fds / 3;
    2607           0 :     if (newMax > maxAllocatedDescs)
    2608             :     {
    2609           0 :         newDescs = (AllocateDesc *) realloc(allocatedDescs,
    2610             :                                             newMax * sizeof(AllocateDesc));
    2611             :         /* Out of memory here is non-fatal: the old array is kept as it was. */
    2612           0 :         if (newDescs == NULL)
    2613           0 :             return false;
    2614           0 :         allocatedDescs = newDescs;
    2615           0 :         maxAllocatedDescs = newMax;
    2616           0 :         return true;
    2617             :     }
    2618             : 
    2619             :     /* Already at the max_safe_fds / 3 cap; can't enlarge allocatedDescs[]. */
    2620           0 :     return false;
    2621             : }
    2622             : 
    2623             : /*
    2624             :  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
    2625             :  * rather than plain fopen().  This lets fd.c deal with freeing FDs if
    2626             :  * necessary to open the file.  When done, call FreeFile rather than fclose.
    2627             :  *
    2628             :  * Note that files that will be open for any significant length of time
    2629             :  * should NOT be handled this way, since they cannot share kernel file
    2630             :  * descriptors with other files; there is grave risk of running out of FDs
    2631             :  * if anyone locks down too many FDs.  Most callers of this routine are
    2632             :  * simply reading a config file that they will read and close immediately.
    2633             :  *
    2634             :  * fd.c will automatically close all files opened with AllocateFile at
    2635             :  * transaction commit or abort; this prevents FD leakage if a routine
    2636             :  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
    2637             :  *
    2638             :  * Ideally this should be the *only* direct call of fopen() in the backend.
    2639             :  */
    2640             : FILE *
    2641      167984 : AllocateFile(const char *name, const char *mode)
    2642             : {
    2643             :     FILE       *file;
    2644             : 
    2645             :     DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
    2646             :                numAllocatedDescs, name));
    2647             : 
    2648             :     /* Can we allocate another non-virtual FD?  If not, ereport(ERROR). */
    2649      167984 :     if (!reserveAllocatedDesc())
    2650           0 :         ereport(ERROR,
    2651             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2652             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
    2653             :                         maxAllocatedDescs, name)));
    2654             : 
    2655             :     /* Close excess kernel FDs. */
    2656      167984 :     ReleaseLruFiles();
    2657             : 
    2658      167984 : TryAgain:
    2659      167984 :     if ((file = fopen(name, mode)) != NULL)
    2660             :     {
    2661      154290 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];    /* the slot reserved above */
    2662             : 
    2663      154290 :         desc->kind = AllocateDescFile;
    2664      154290 :         desc->desc.file = file;
    2665      154290 :         desc->create_subid = GetCurrentSubTransactionId();
    2666      154290 :         numAllocatedDescs++;    /* the slot is consumed only on success */
    2667      154290 :         return desc->desc.file;
    2668             :     }
    2669             : 
    2670       13694 :     if (errno == EMFILE || errno == ENFILE)
    2671             :     {
    2672           0 :         int         save_errno = errno;
    2673             : 
    2674           0 :         ereport(LOG,
    2675             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2676             :                  errmsg("out of file descriptors: %m; release and retry")));
    2677           0 :         errno = 0;
    2678           0 :         if (ReleaseLruFile())
    2679           0 :             goto TryAgain;
    2680           0 :         errno = save_errno;     /* ReleaseLruFile() returned false: restore fopen()'s errno */
    2681             :     }
    2682             : 
    2683       13694 :     return NULL;                /* failure; errno describes the fopen() error */
    2684             : }
    2685             : 
    2686             : /*
    2687             :  * Open a file with OpenTransientFilePerm(), passing pg_file_create_mode as
    2688             :  * the fileMode parameter.
    2689             :  */
    2690             : int
    2691    15425964 : OpenTransientFile(const char *fileName, int fileFlags)
    2692             : {
    2693    15425964 :     return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
    2694             : }
    2695             : 
    2696             : /*
    2697             :  * Like AllocateFile, but returns a raw fd from BasicOpenFilePerm(), or -1
    2698             :  */
    2699             : int
    2700    15425976 : OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
    2701             : {
    2702             :     int         fd;
    2703             : 
    2704             :     DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
    2705             :                numAllocatedDescs, fileName));
    2706             : 
    2707             :     /* Can we allocate another non-virtual FD?  If not, ereport(ERROR). */
    2708    15425976 :     if (!reserveAllocatedDesc())
    2709           0 :         ereport(ERROR,
    2710             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2711             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
    2712             :                         maxAllocatedDescs, fileName)));
    2713             : 
    2714             :     /* Close excess kernel FDs. */
    2715    15425976 :     ReleaseLruFiles();
    2716             : 
    2717    15425976 :     fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
    2718             : 
    2719    15425976 :     if (fd >= 0)
    2720             :     {
    2721    15416202 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];    /* the slot reserved above */
    2722             : 
    2723    15416202 :         desc->kind = AllocateDescRawFD;
    2724    15416202 :         desc->desc.fd = fd;
    2725    15416202 :         desc->create_subid = GetCurrentSubTransactionId();
    2726    15416202 :         numAllocatedDescs++;    /* the slot is consumed only on success */
    2727             : 
    2728    15416202 :         return fd;
    2729             :     }
    2730             : 
    2731        9774 :     return -1;                  /* failure; no retry loop here, unlike AllocateFile */
    2732             : }
    2733             : 
    2734             : /*
    2735             :  * Routines that want to initiate a pipe stream should use OpenPipeStream
    2736             :  * rather than plain popen().  This lets fd.c deal with freeing FDs if
    2737             :  * necessary.  When done, call ClosePipeStream rather than pclose.
    2738             :  *
    2739             :  * This function also ensures that the popen'd program is run with default
    2740             :  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
    2741             :  * uses.  This ensures desirable response to, eg, closing a read pipe early.
    2742             :  */
    2743             : FILE *
    2744         116 : OpenPipeStream(const char *command, const char *mode)
    2745             : {
    2746             :     FILE       *file;
    2747             :     int         save_errno;
    2748             : 
    2749             :     DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
    2750             :                numAllocatedDescs, command));
    2751             : 
    2752             :     /* Can we allocate another non-virtual FD?  If not, ereport(ERROR). */
    2753         116 :     if (!reserveAllocatedDesc())
    2754           0 :         ereport(ERROR,
    2755             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2756             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
    2757             :                         maxAllocatedDescs, command)));
    2758             : 
    2759             :     /* Close excess kernel FDs. */
    2760         116 :     ReleaseLruFiles();
    2761             : 
    2762         116 : TryAgain:
    2763         116 :     fflush(NULL);               /* flush all stdio output streams before popen() */
    2764         116 :     pqsignal(SIGPIPE, SIG_DFL); /* default SIGPIPE handling while the command is started */
    2765         116 :     errno = 0;
    2766         116 :     file = popen(command, mode);
    2767         116 :     save_errno = errno;         /* keep popen()'s errno across the pqsignal() call below */
    2768         116 :     pqsignal(SIGPIPE, SIG_IGN); /* back to ignoring SIGPIPE */
    2769         116 :     errno = save_errno;
    2770         116 :     if (file != NULL)
    2771             :     {
    2772         116 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];    /* the slot reserved above */
    2773             : 
    2774         116 :         desc->kind = AllocateDescPipe;
    2775         116 :         desc->desc.file = file;
    2776         116 :         desc->create_subid = GetCurrentSubTransactionId();
    2777         116 :         numAllocatedDescs++;    /* the slot is consumed only on success */
    2778         116 :         return desc->desc.file;
    2779             :     }
    2780             : 
    2781           0 :     if (errno == EMFILE || errno == ENFILE)
    2782             :     {
    2783           0 :         ereport(LOG,
    2784             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2785             :                  errmsg("out of file descriptors: %m; release and retry")));
    2786           0 :         if (ReleaseLruFile())
    2787           0 :             goto TryAgain;
    2788           0 :         errno = save_errno;     /* ReleaseLruFile() returned false: restore popen()'s errno */
    2789             :     }
    2790             : 
    2791           0 :     return NULL;                /* failure; errno describes the popen() error */
    2792             : }
    2793             : 
    2794             : /*
    2795             :  * Close an AllocateDesc of any type and drop it; returns the close result.
    2796             :  *
    2797             :  * The argument *must* point into the allocatedDescs[] array.
    2798             :  */
    2799             : static int
    2800    15659596 : FreeDesc(AllocateDesc *desc)
    2801             : {
    2802             :     int         result;
    2803             : 
    2804             :     /* Close the underlying object with the call matching its kind */
    2805    15659596 :     switch (desc->kind)
    2806             :     {
    2807      154290 :         case AllocateDescFile:
    2808      154290 :             result = fclose(desc->desc.file);
    2809      154290 :             break;
    2810         116 :         case AllocateDescPipe:
    2811         116 :             result = pclose(desc->desc.file);
    2812         116 :             break;
    2813       88988 :         case AllocateDescDir:
    2814       88988 :             result = closedir(desc->desc.dir);
    2815       88988 :             break;
    2816    15416202 :         case AllocateDescRawFD:
    2817    15416202 :             pgaio_closing_fd(desc->desc.fd);    /* called before the fd is actually closed */
    2818    15416202 :             result = close(desc->desc.fd);
    2819    15416202 :             break;
    2820           0 :         default:
    2821           0 :             elog(ERROR, "AllocateDesc kind not recognized");
    2822             :             result = 0;         /* keep compiler quiet */
    2823             :             break;
    2824             :     }
    2825             : 
    2826             :     /* Compact storage: move the last entry into the slot being freed */
    2827    15659596 :     numAllocatedDescs--;
    2828    15659596 :     *desc = allocatedDescs[numAllocatedDescs];  /* a self-copy when desc was the last entry */
    2829             : 
    2830    15659596 :     return result;
    2831             : }
    2832             : 
    2833             : /*
    2834             :  * Close a file returned by AllocateFile.
    2835             :  *
    2836             :  * The fclose() result is returned unchecked --- it is up to the caller
    2837             :  * to handle close errors.
    2838             :  */
    2839             : int
    2840      154258 : FreeFile(FILE *file)
    2841             : {
    2842             :     int         i;
    2843             : 
    2844             :     DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
    2845             : 
    2846             :     /* Remove file from list of allocated files, if it's present */
    2847      154260 :     for (i = numAllocatedDescs; --i >= 0;)  /* scans from the newest entry down to index 0 */
    2848             :     {
    2849      154260 :         AllocateDesc *desc = &allocatedDescs[i];
    2850             : 
    2851      154260 :         if (desc->kind == AllocateDescFile && desc->desc.file == file)
    2852      154258 :             return FreeDesc(desc);
    2853             :     }
    2854             : 
    2855             :     /* Only get here if someone passes us a file not in allocatedDescs */
    2856           0 :     elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
    2857             : 
    2858           0 :     return fclose(file);        /* close it anyway */
    2859             : }
    2860             : 
    2861             : /*
    2862             :  * Close a file returned by OpenTransientFile.
    2863             :  *
    2864             :  * The close() result is returned unchecked --- it is up to the caller
    2865             :  * to handle close errors.
    2866             :  */
    2867             : int
    2868    15416200 : CloseTransientFile(int fd)
    2869             : {
    2870             :     int         i;
    2871             : 
    2872             :     DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
    2873             : 
    2874             :     /* Remove fd from list of allocated files, if it's present */
    2875    15416218 :     for (i = numAllocatedDescs; --i >= 0;)  /* scans from the newest entry down to index 0 */
    2876             :     {
    2877    15416218 :         AllocateDesc *desc = &allocatedDescs[i];
    2878             : 
    2879    15416218 :         if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
    2880    15416200 :             return FreeDesc(desc);
    2881             :     }
    2882             : 
    2883             :     /* Only get here if someone passes us a file not in allocatedDescs */
    2884           0 :     elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
    2885             : 
    2886           0 :     pgaio_closing_fd(fd);       /* same pre-close call FreeDesc() makes for raw fds */
    2887             : 
    2888           0 :     return close(fd);           /* close it anyway */
    2889             : }
    2890             : 
    2891             : /*
    2892             :  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
    2893             :  * rather than plain opendir().  This lets fd.c deal with freeing FDs if
    2894             :  * necessary to open the directory, and with closing it after an elog.
    2895             :  * When done, call FreeDir rather than closedir.
    2896             :  *
    2897             :  * Returns NULL, with errno set, on failure.  Note that failure detection
    2898             :  * is commonly left to the following call of ReadDir or ReadDirExtended;
    2899             :  * see the comments for ReadDir.
    2900             :  *
    2901             :  * Ideally this should be the *only* direct call of opendir() in the backend.
    2902             :  */
    2903             : DIR *
    2904       90716 : AllocateDir(const char *dirname)
    2905             : {
    2906             :     DIR        *dir;
    2907             : 
    2908             :     DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
    2909             :                numAllocatedDescs, dirname));
    2910             : 
    2911             :     /* Can we allocate another non-virtual FD?  If not, ereport(ERROR). */
    2912       90716 :     if (!reserveAllocatedDesc())
    2913           0 :         ereport(ERROR,
    2914             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2915             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
    2916             :                         maxAllocatedDescs, dirname)));
    2917             : 
    2918             :     /* Close excess kernel FDs. */
    2919       90716 :     ReleaseLruFiles();
    2920             : 
    2921       90716 : TryAgain:
    2922       90716 :     if ((dir = opendir(dirname)) != NULL)
    2923             :     {
    2924       88988 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];    /* the slot reserved above */
    2925             : 
    2926       88988 :         desc->kind = AllocateDescDir;
    2927       88988 :         desc->desc.dir = dir;
    2928       88988 :         desc->create_subid = GetCurrentSubTransactionId();
    2929       88988 :         numAllocatedDescs++;    /* the slot is consumed only on success */
    2930       88988 :         return desc->desc.dir;
    2931             :     }
    2932             : 
    2933        1728 :     if (errno == EMFILE || errno == ENFILE)
    2934             :     {
    2935           0 :         int         save_errno = errno;
    2936             : 
    2937           0 :         ereport(LOG,
    2938             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2939             :                  errmsg("out of file descriptors: %m; release and retry")));
    2940           0 :         errno = 0;
    2941           0 :         if (ReleaseLruFile())
    2942           0 :             goto TryAgain;
    2943           0 :         errno = save_errno;     /* ReleaseLruFile() returned false: restore opendir()'s errno */
    2944             :     }
    2945             : 
    2946        1728 :     return NULL;                /* failure; errno describes the opendir() error */
    2947             : }
    2948             : 
    2949             : /*
    2950             :  * Read a directory opened with AllocateDir, ereport'ing any error at ERROR.
    2951             :  *
    2952             :  * This is easier to use than raw readdir() since it takes care of some
    2953             :  * otherwise rather tedious and error-prone manipulation of errno.  Also,
    2954             :  * if you are happy with a generic error message for AllocateDir failure,
    2955             :  * you can just do
    2956             :  *
    2957             :  *      dir = AllocateDir(path);
    2958             :  *      while ((dirent = ReadDir(dir, path)) != NULL)
    2959             :  *          process dirent;
    2960             :  *      FreeDir(dir);
    2961             :  *
    2962             :  * since a NULL dir parameter is taken as indicating AllocateDir failed.
    2963             :  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
    2964             :  * use this shortcut.)
    2965             :  *
    2966             :  * The pathname passed to AllocateDir must be passed to this routine too,
    2967             :  * but it is only used for error reporting.
    2968             :  */
    2969             : struct dirent *
    2970     5066802 : ReadDir(DIR *dir, const char *dirname)
    2971             : {
    2972     5066802 :     return ReadDirExtended(dir, dirname, ERROR);    /* thin wrapper with a fixed elevel */
    2973             : }
    2974             : 
    2975             : /*
    2976             :  * Alternate version of ReadDir that allows caller to specify the elevel
    2977             :  * for any error report (whether it's reporting an initial failure of
    2978             :  * AllocateDir or a subsequent directory read failure).
    2979             :  *
    2980             :  * If elevel < ERROR, returns NULL after any error.  With the normal coding
    2981             :  * pattern, this will result in falling out of the loop immediately as
    2982             :  * though the directory contained no (more) entries.
    2983             :  */
    2984             : struct dirent *
    2985     8214856 : ReadDirExtended(DIR *dir, const char *dirname, int elevel)
    2986             : {
    2987             :     struct dirent *dent;
    2988             : 
    2989             :     /* Give a generic message for AllocateDir failure, if caller didn't */
    2990     8214856 :     if (dir == NULL)
    2991             :     {
    2992           6 :         ereport(elevel,
    2993             :                 (errcode_for_file_access(),
    2994             :                  errmsg("could not open directory \"%s\": %m",
    2995             :                         dirname)));
    2996           0 :         return NULL;            /* reached only when elevel < ERROR */
    2997             :     }
    2998             : 
    2999     8214850 :     errno = 0;                  /* so a read error can be told apart from end of directory */
    3000     8214850 :     if ((dent = readdir(dir)) != NULL)
    3001     8148192 :         return dent;
    3002             : 
    3003       66658 :     if (errno)                  /* readdir() returned NULL and set errno: a real error */
    3004           0 :         ereport(elevel,
    3005             :                 (errcode_for_file_access(),
    3006             :                  errmsg("could not read directory \"%s\": %m",
    3007             :                         dirname)));
    3008       66658 :     return NULL;                /* end of directory, or an error reported below ERROR */
    3009             : }
    3010             : 
    3011             : /*
    3012             :  * Close a directory opened with AllocateDir.
    3013             :  *
    3014             :  * Returns closedir's return value (with errno set if it's not 0).
    3015             :  * Note we do not check the return value --- it is up to the caller
    3016             :  * to handle close errors if wanted.
    3017             :  *
    3018             :  * Does nothing and returns 0 if dir == NULL; we assume that directory open
    3019             :  * failure was already reported if desired.
    3020             :  */
    3021             : int
    3022       88742 : FreeDir(DIR *dir)
    3023             : {
    3024             :     int         i;
    3025             : 
    3026             :     /* Nothing to do if AllocateDir failed */
    3027       88742 :     if (dir == NULL)
    3028           0 :         return 0;
    3029             : 
    3030             :     DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
    3031             : 
    3032             :     /* Remove dir from list of allocated dirs, if it's present */
    3033       88742 :     for (i = numAllocatedDescs; --i >= 0;)  /* scans from the newest entry down to index 0 */
    3034             :     {
    3035       88742 :         AllocateDesc *desc = &allocatedDescs[i];
    3036             : 
    3037       88742 :         if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
    3038       88742 :             return FreeDesc(desc);
    3039             :     }
    3040             : 
    3041             :     /* Only get here if someone passes us a dir not in allocatedDescs */
    3042           0 :     elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
    3043             : 
    3044           0 :     return closedir(dir);       /* close it anyway */
    3045             : }
    3046             : 
    3047             : 
    3048             : /*
    3049             :  * Close a pipe stream returned by OpenPipeStream.
                     :  *
                     :  * The stream is looked up in allocatedDescs (last entry first) as an
                     :  * AllocateDescPipe entry and released through FreeDesc(), whose result is
                     :  * returned.  If no entry matches, a WARNING is emitted and the result of
                     :  * calling pclose() directly on the stream is returned.
                     :  *
                     :  * Unlike FreeDir, there is no early exit for a NULL argument here.
    3050             :  */
    3051             : int
    3052         116 : ClosePipeStream(FILE *file)
    3053             : {
    3054             :     int         i;
    3055             : 
    3056             :     DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
    3057             : 
    3058             :     /*
                     :      * Remove file from list of allocated files, if it's present.  Only
                     :      * entries of kind AllocateDescPipe are considered a match.
                     :      */
    3059         116 :     for (i = numAllocatedDescs; --i >= 0;)
    3060             :     {
    3061         116 :         AllocateDesc *desc = &allocatedDescs[i];
    3062             : 
    3063         116 :         if (desc->kind == AllocateDescPipe && desc->desc.file == file)
    3064         116 :             return FreeDesc(desc);
    3065             :     }
    3066             : 
    3067             :     /* Only get here if someone passes us a file not in allocatedDescs */
    3068           0 :     elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
    3069             : 
    3070           0 :     return pclose(file);
    3071             : }
    3072             : 
    3073             : /*
    3074             :  * closeAllVfds
    3075             :  *
    3076             :  * Force all VFDs into the physically-closed state, so that the fewest
    3077             :  * possible number of kernel file descriptors are in use.  There is no
    3078             :  * change in the logical state of the VFDs.
                     :  *
                     :  * Every VFD index from 1 up to SizeVfdCache - 1 for which FileIsNotOpen()
                     :  * is false is passed to LruDelete().  Index 0 is never passed to
                     :  * LruDelete(); it is only checked by the assertion.  Nothing at all is
                     :  * done when SizeVfdCache is 0.
    3079             :  */
    3080             : void
    3081          52 : closeAllVfds(void)
    3082             : {
    3083             :     Index       i;
    3084             : 
    3085          52 :     if (SizeVfdCache > 0)
    3086             :     {
    3087             :         Assert(FileIsNotOpen(0));   /* Make sure ring not corrupted */
    3088        1664 :         for (i = 1; i < SizeVfdCache; i++)
    3089             :         {
    3090        1612 :             if (!FileIsNotOpen(i))
    3091         246 :                 LruDelete(i);
    3092             :         }
    3093             :     }
    3094          52 : }
    3095             : 
    3096             : 
    3097             : /*
    3098             :  * SetTempTablespaces
    3099             :  *
    3100             :  * Define a list (actually an array) of OIDs of tablespaces to use for
    3101             :  * temporary files.  This list will be used until end of transaction,
    3102             :  * unless this function is called again before then.  It is caller's
    3103             :  * responsibility that the passed-in array has adequate lifespan (typically
    3104             :  * it'd be allocated in TopTransactionContext).  Only the pointer and the
                     :  * count are remembered here; the array contents are not copied.
    3105             :  *
    3106             :  * Some entries of the array may be InvalidOid, indicating that the current
    3107             :  * database's default tablespace should be used.
                     :  *
                     :  * numSpaces must not be negative (checked by assertion only).
    3108             :  */
    3109             : void
    3110        6112 : SetTempTablespaces(Oid *tableSpaces, int numSpaces)
    3111             : {
    3112             :     Assert(numSpaces >= 0);
    3113        6112 :     tempTableSpaces = tableSpaces;
    3114        6112 :     numTempTableSpaces = numSpaces;
    3115             : 
    3116             :     /*
    3117             :      * Select a random starting point in the list.  This is to minimize
    3118             :      * conflicts between backends that are most likely sharing the same list
    3119             :      * of temp tablespaces.  Note that if we create multiple temp files in the
    3120             :      * same transaction, we'll advance circularly through the list --- this
    3121             :      * ensures that large temporary sort files are nicely spread across all
    3122             :      * available tablespaces.
                     :      *
                     :      * With zero or one entries there is nothing to randomize, so the
                     :      * starting point is simply index 0.
    3123             :      */
    3124        6112 :     if (numSpaces > 1)
    3125           0 :         nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state,
    3126           0 :                                                   0, numSpaces - 1);
    3127             :     else
    3128        6112 :         nextTempTableSpace = 0;
    3129        6112 : }
    3130             : 
    3131             : /*
    3132             :  * TempTablespacesAreSet
    3133             :  *
    3134             :  * Returns true if SetTempTablespaces has been called in current transaction.
    3135             :  * (This is just so that tablespaces.c doesn't need its own per-transaction
    3136             :  * state.)
                     :  *
                     :  * "Not set" is represented by a negative numTempTableSpaces: AtEOXact_Files
                     :  * resets the count to -1, while SetTempTablespaces stores a count that it
                     :  * asserts to be >= 0.  An empty list (count 0) therefore counts as set.
    3137             :  */
    3138             : bool
    3139        8024 : TempTablespacesAreSet(void)
    3140             : {
    3141        8024 :     return (numTempTableSpaces >= 0);
    3142             : }
    3143             : 
    3144             : /*
    3145             :  * GetTempTablespaces
    3146             :  *
    3147             :  * Populate an array with the OIDs of the tablespaces that should be used for
    3148             :  * temporary files.  (Some entries may be InvalidOid, indicating that the
    3149             :  * current database's default tablespace should be used.)  At most numSpaces
    3150             :  * entries will be filled.
    3151             :  * Returns the number of OIDs that were copied into the output array.
                     :  *
                     :  * Copying always starts at element 0 of the stored list; the rotation
                     :  * position kept in nextTempTableSpace is not consulted.  The list must
                     :  * have been set beforehand (checked by assertion only).
    3152             :  */
    3153             : int
    3154         370 : GetTempTablespaces(Oid *tableSpaces, int numSpaces)
    3155             : {
    3156             :     int         i;
    3157             : 
    3158             :     Assert(TempTablespacesAreSet());
    3159         370 :     for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
    3160           0 :         tableSpaces[i] = tempTableSpaces[i];
    3161             : 
    3162         370 :     return i;
    3163             : }
    3164             : 
    3165             : /*
    3166             :  * GetNextTempTableSpace
    3167             :  *
    3168             :  * Select the next temp tablespace to use.  A result of InvalidOid means
    3169             :  * to use the current database's default tablespace.
                     :  *
                     :  * The position counter is advanced before it is used, so the entry
                     :  * returned is the one following the current nextTempTableSpace position,
                     :  * wrapping to index 0 after the last entry.  InvalidOid is also returned
                     :  * when the list is empty or has not been set (numTempTableSpaces <= 0).
    3170             :  */
    3171             : Oid
    3172        4212 : GetNextTempTableSpace(void)
    3173             : {
    3174        4212 :     if (numTempTableSpaces > 0)
    3175             :     {
    3176             :         /* Advance nextTempTableSpace counter with wraparound */
    3177           2 :         if (++nextTempTableSpace >= numTempTableSpaces)
    3178           2 :             nextTempTableSpace = 0;
    3179           2 :         return tempTableSpaces[nextTempTableSpace];
    3180             :     }
    3181        4210 :     return InvalidOid;
    3182             : }
    3183             : 
    3184             : 
    3185             : /*
    3186             :  * AtEOSubXact_Files
    3187             :  *
    3188             :  * Take care of subtransaction commit/abort.  At abort, we close temp files
    3189             :  * that the subtransaction may have opened.  At commit, we reassign the
    3190             :  * files that were opened to the parent subtransaction.
                     :  *
                     :  * Only entries of the allocatedDescs array are examined here, and only
                     :  * those whose create_subid equals mySubid are touched: on commit their
                     :  * create_subid becomes parentSubid, on abort they are released with
                     :  * FreeDesc().
    3191             :  */
    3192             : void
    3193       20084 : AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
    3194             :                   SubTransactionId parentSubid)
    3195             : {
    3196             :     Index       i;
    3197             : 
    3198       20084 :     for (i = 0; i < numAllocatedDescs; i++)
    3199             :     {
    3200           0 :         if (allocatedDescs[i].create_subid == mySubid)
    3201             :         {
    3202           0 :             if (isCommit)
    3203           0 :                 allocatedDescs[i].create_subid = parentSubid;
    3204             :             else
    3205             :             {
    3206             :                 /*
                     :                  * have to recheck the item after FreeDesc (ugly).  The
                     :                  * post-decrement cancels the loop's i++, so the same slot
                     :                  * is examined again on the next iteration.  Presumably
                     :                  * FreeDesc moves another entry into the freed slot ---
                     :                  * confirm against FreeDesc's definition.  When i is 0 the
                     :                  * decrement relies on Index being an unsigned type that
                     :                  * wraps around and back (its typedef is not visible here).
                     :                  */
    3207           0 :                 FreeDesc(&allocatedDescs[i--]);
    3208             :             }
    3209             :         }
    3210             :     }
    3211       20084 : }
    3212             : 
    3213             : /*
    3214             :  * AtEOXact_Files
    3215             :  *
    3216             :  * This routine is called during transaction commit or abort.  All still-open
    3217             :  * per-transaction temporary file VFDs are closed, which also causes the
    3218             :  * underlying files to be deleted (although they should've been closed already
    3219             :  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
    3220             :  * closed. We also forget any transaction-local temp tablespace list.
    3221             :  *
    3222             :  * The isCommit flag is used only to decide whether to emit warnings about
    3223             :  * unclosed files.
                     :  *
                     :  * The actual closing is delegated to CleanupTempFiles(isCommit, false).
                     :  * Forgetting the tablespace list means setting tempTableSpaces to NULL and
                     :  * numTempTableSpaces to -1, after which TempTablespacesAreSet() returns
                     :  * false until SetTempTablespaces is called again.
    3224             :  */
    3225             : void
    3226     1122122 : AtEOXact_Files(bool isCommit)
    3227             : {
    3228     1122122 :     CleanupTempFiles(isCommit, false);
    3229     1122122 :     tempTableSpaces = NULL;
    3230     1122122 :     numTempTableSpaces = -1;
    3231     1122122 : }
    3232             : 
    3233             : /*
    3234             :  * BeforeShmemExit_Files
    3235             :  *
    3236             :  * before_shmem_exit hook to clean up temp files during backend shutdown.
    3237             :  * Here, we want to clean up *all* temp files including interXact ones.
                     :  *
                     :  * Neither argument (code, arg) is used.  The work is done by
                     :  * CleanupTempFiles(false, true): isCommit is false, isProcExit is true.
    3238             :  */
    3239             : static void
    3240       42720 : BeforeShmemExit_Files(int code, Datum arg)
    3241             : {
    3242       42720 :     CleanupTempFiles(false, true);
    3243             : 
    3244             :     /*
                     :      * prevent further temp files from being created (the flag is only
                     :      * updated in builds with USE_ASSERT_CHECKING defined)
                     :      */
    3245             : #ifdef USE_ASSERT_CHECKING
    3246             :     temporary_files_allowed = false;
    3247             : #endif
    3248       42720 : }
    3249             : 
    3250             : /*
    3251             :  * Close temporary files and delete their underlying files.
    3252             :  *
    3253             :  * isCommit: if true, this is normal transaction commit, and we don't
    3254             :  * expect any remaining files; warn if there are some.
    3255             :  *
    3256             :  * isProcExit: if true, this is being called as the backend process is
    3257             :  * exiting. If that's the case, we should remove all temporary files; if
    3258             :  * that's not the case, we are being called for transaction commit/abort
    3259             :  * and should only remove transaction-local temp files.  In either case,
    3260             :  * also clean up "allocated" stdio files, dirs and fds.
                     :  *
                     :  * The VfdCache scan is skipped entirely unless isProcExit is true or
                     :  * have_xact_temporary_files is set; that flag is cleared once the scan
                     :  * has run.  Index 0 of VfdCache is never examined by the scan.
                     :  *
                     :  * Note that the per-file "not closed at end-of-transaction" WARNING in
                     :  * the VfdCache scan is issued regardless of isCommit; only the summary
                     :  * WARNING about remaining allocatedDescs entries depends on isCommit.
    3261             :  */
    3262             : static void
    3263     1164842 : CleanupTempFiles(bool isCommit, bool isProcExit)
    3264             : {
    3265             :     Index       i;
    3266             : 
    3267             :     /*
    3268             :      * Careful here: at proc_exit we need extra cleanup, not just
    3269             :      * xact_temporary files.
    3270             :      */
    3271     1164842 :     if (isProcExit || have_xact_temporary_files)
    3272             :     {
    3273             :         Assert(FileIsNotOpen(0));   /* Make sure ring not corrupted */
    3274     2792764 :         for (i = 1; i < SizeVfdCache; i++)
    3275             :         {
    3276     2748522 :             unsigned short fdstate = VfdCache[i].fdstate;
    3277             : 
    3278     2748522 :             if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
    3279           8 :                 VfdCache[i].fileName != NULL)
    3280             :             {
    3281             :                 /*
    3282             :                  * If we're in the process of exiting a backend process, close
    3283             :                  * all temporary files. Otherwise, only close temporary files
    3284             :                  * local to the current transaction. They should be closed by
    3285             :                  * the ResourceOwner mechanism already, so this is just a
    3286             :                  * debugging cross-check.
                     :                  *
                     :                  * A file that has FD_DELETE_AT_CLOSE but not
                     :                  * FD_CLOSE_AT_EOXACT is left alone unless isProcExit.
    3287             :                  */
    3288           8 :                 if (isProcExit)
    3289           8 :                     FileClose(i);
    3290           0 :                 else if (fdstate & FD_CLOSE_AT_EOXACT)
    3291             :                 {
    3292           0 :                     elog(WARNING,
    3293             :                          "temporary file %s not closed at end-of-transaction",
    3294             :                          VfdCache[i].fileName);
    3295           0 :                     FileClose(i);
    3296             :                 }
    3297             :             }
    3298             :         }
    3299             : 
    3300       44242 :         have_xact_temporary_files = false;
    3301             :     }
    3302             : 
    3303             :     /* Complain if any allocated files remain open at commit. */
    3304     1164842 :     if (isCommit && numAllocatedDescs > 0)
    3305           0 :         elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
    3306             :              numAllocatedDescs);
    3307             : 
    3308             :     /*
                     :      * Clean up "allocated" stdio files, dirs and fds.  Entry 0 is freed
                     :      * repeatedly until the count reaches zero; this depends on FreeDesc
                     :      * reducing numAllocatedDescs on each call (NOTE: confirm against
                     :      * FreeDesc's definition).
                     :      */
    3309     1165122 :     while (numAllocatedDescs > 0)
    3310         280 :         FreeDesc(&allocatedDescs[0]);
    3311     1164842 : }
    3312             : 
    3313             : 
    3314             : /*
    3315             :  * Remove temporary and temporary relation files left over from a prior
    3316             :  * postmaster session
    3317             :  *
    3318             :  * This should be called during postmaster startup.  It will forcibly
    3319             :  * remove any leftover files created by OpenTemporaryFile and any leftover
    3320             :  * temporary relation files created by mdcreate.
    3321             :  *
    3322             :  * During post-backend-crash restart cycle, this routine is called when
    3323             :  * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
    3324             :  * queries are using temp files could result in useless storage usage that can
    3325             :  * only be reclaimed by a service restart. The argument against enabling it is
    3326             :  * that someone might want to examine the temporary files for debugging
    3327             :  * purposes. This does however mean that OpenTemporaryFile had better allow for
    3328             :  * collision with an existing temp file name.
    3329             :  *
    3330             :  * NOTE: this function and its subroutines generally report syscall failures
    3331             :  * with ereport(LOG) and keep going.  Removing temp files is not so critical
    3332             :  * that we should fail to start the database when we can't do it.
                     :  *
                     :  * All paths built here are relative ("base/...", PG_TBLSPC_DIR "/..."), so
                     :  * the results depend on the process's current working directory.
    3333             :  */
    3334             : void
    3335        1706 : RemovePgTempFiles(void)
    3336             : {
    3337             :     char        temp_path[MAXPGPATH + sizeof(PG_TBLSPC_DIR) + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
    3338             :     DIR        *spc_dir;
    3339             :     struct dirent *spc_de;
    3340             : 
    3341             :     /*
    3342             :      * First process temp files in pg_default ($PGDATA/base)
    3343             :      */
    3344        1706 :     snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
    3345        1706 :     RemovePgTempFilesInDir(temp_path, true, false);
    3346        1706 :     RemovePgTempRelationFiles("base");
    3347             : 
    3348             :     /*
    3349             :      * Cycle through temp directories for all non-default tablespaces.
                     :      *
                     :      * Every entry of PG_TBLSPC_DIR other than "." and ".." is treated as a
                     :      * tablespace.  For each one, first its PG_TEMP_FILES_DIR subdirectory
                     :      * (below TABLESPACE_VERSION_DIRECTORY) is cleaned, then the version
                     :      * directory itself is scanned for temporary relation files.
                     :      *
                     :      * The AllocateDir result is not checked here: FreeDir accepts NULL,
                     :      * and ReadDirExtended is presumably responsible for reporting an open
                     :      * failure at LOG level and ending the loop (confirm against its
                     :      * definition).
    3350             :      */
    3351        1706 :     spc_dir = AllocateDir(PG_TBLSPC_DIR);
    3352             : 
    3353        5238 :     while ((spc_de = ReadDirExtended(spc_dir, PG_TBLSPC_DIR, LOG)) != NULL)
    3354             :     {
    3355        3532 :         if (strcmp(spc_de->d_name, ".") == 0 ||
    3356        1826 :             strcmp(spc_de->d_name, "..") == 0)
    3357        3412 :             continue;
    3358             : 
    3359         120 :         snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
    3360         120 :                  PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY,
    3361             :                  PG_TEMP_FILES_DIR);
    3362         120 :         RemovePgTempFilesInDir(temp_path, true, false);
    3363             : 
    3364         120 :         snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
    3365         120 :                  PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
    3366         120 :         RemovePgTempRelationFiles(temp_path);
    3367             :     }
    3368             : 
    3369        1706 :     FreeDir(spc_dir);
    3370             : 
    3371             :     /*
    3372             :      * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
    3373             :      * DataDir as well.  However, that is *not* cleaned here because doing so
    3374             :      * would create a race condition.  It's done separately, earlier in
    3375             :      * postmaster startup.
    3376             :      */
    3377        1706 : }
    3378             : 
    3379             : /*
    3380             :  * Process one pgsql_tmp directory for RemovePgTempFiles.
    3381             :  *
    3382             :  * If missing_ok is true, it's all right for the named directory to not exist.
    3383             :  * Any other problem results in a LOG message.  (missing_ok should be true at
    3384             :  * the top level, since pgsql_tmp directories are not created until needed.)
    3385             :  *
    3386             :  * At the top level, this should be called with unlink_all = false, so that
    3387             :  * only files matching the temporary name prefix will be unlinked.  When
    3388             :  * recursing it will be called with unlink_all = true to unlink everything
    3389             :  * under a top-level temporary directory.
    3390             :  *
    3391             :  * (These two flags could be replaced by one, but it seems clearer to keep
    3392             :  * them separate.)
                     :  *
                     :  * Per-entry handling, for every entry other than "." and "..":
                     :  *  - If unlink_all is false and the name does not start with
                     :  *    PG_TEMP_FILE_PREFIX, the entry is reported at LOG level as unexpected
                     :  *    and left in place.
                     :  *  - Otherwise its type is obtained with get_dirent_type().  On
                     :  *    PGFILETYPE_ERROR the entry is skipped; a directory is emptied by a
                     :  *    recursive call (missing_ok = false, unlink_all = true) and then
                     :  *    removed with rmdir(); anything else is removed with unlink().
                     :  *  - rmdir()/unlink() failures are reported at LOG level and the scan
                     :  *    continues.
                     :  *
                     :  * The function returns early, without reading anything, only when the
                     :  * directory could not be opened, errno is ENOENT and missing_ok is true.
    3393             :  */
    3394             : void
    3395        1828 : RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
    3396             : {
    3397             :     DIR        *temp_dir;
    3398             :     struct dirent *temp_de;
    3399             :     char        rm_path[MAXPGPATH * 2];
    3400             : 
    3401        1828 :     temp_dir = AllocateDir(tmpdirname);
    3402             : 
    3403        1828 :     if (temp_dir == NULL && errno == ENOENT && missing_ok)
    3404        1696 :         return;
    3405             : 
    3406         402 :     while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
    3407             :     {
    3408         270 :         if (strcmp(temp_de->d_name, ".") == 0 ||
    3409         138 :             strcmp(temp_de->d_name, "..") == 0)
    3410         264 :             continue;
    3411             : 
    3412           6 :         snprintf(rm_path, sizeof(rm_path), "%s/%s",
    3413           6 :                  tmpdirname, temp_de->d_name);
    3414             : 
    3415           6 :         if (unlink_all ||
    3416           6 :             strncmp(temp_de->d_name,
    3417             :                     PG_TEMP_FILE_PREFIX,
    3418             :                     strlen(PG_TEMP_FILE_PREFIX)) == 0)
    3419           6 :         {
    3420           6 :             PGFileType  type = get_dirent_type(rm_path, temp_de, false, LOG);
    3421             : 
    3422           6 :             if (type == PGFILETYPE_ERROR)
    3423           0 :                 continue;
    3424           6 :             else if (type == PGFILETYPE_DIR)
    3425             :             {
    3426             :                 /* recursively remove contents, then directory itself */
    3427           2 :                 RemovePgTempFilesInDir(rm_path, false, true);
    3428             : 
    3429           2 :                 if (rmdir(rm_path) < 0)
    3430           0 :                     ereport(LOG,
    3431             :                             (errcode_for_file_access(),
    3432             :                              errmsg("could not remove directory \"%s\": %m",
    3433             :                                     rm_path)));
    3434             :             }
    3435             :             else
    3436             :             {
    3437           4 :                 if (unlink(rm_path) < 0)
    3438           0 :                     ereport(LOG,
    3439             :                             (errcode_for_file_access(),
    3440             :                              errmsg("could not remove file \"%s\": %m",
    3441             :                                     rm_path)));
    3442             :             }
    3443             :         }
    3444             :         else
    3445           0 :             ereport(LOG,
    3446             :                     (errmsg("unexpected file found in temporary-files directory: \"%s\"",
    3447             :                             rm_path)));
    3448             :     }
    3449             : 
    3450         132 :     FreeDir(temp_dir);
    3451             : }
    3452             : 
    3453             : /*
                     :  * Process one tablespace directory, look for per-DB subdirectories
                     :  *
                     :  * tsdirname is the directory to scan.  Each entry whose name consists
                     :  * solely of decimal digits is taken to be a per-database directory and is
                     :  * handed, as "<tsdirname>/<name>", to RemovePgTempRelationFilesInDbspace.
                     :  * All other entries are ignored.
                     :  */
    3454             : static void
    3455        1826 : RemovePgTempRelationFiles(const char *tsdirname)
    3456             : {
    3457             :     DIR        *ts_dir;
    3458             :     struct dirent *de;
    3459             :     char        dbspace_path[MAXPGPATH * 2];
    3460             : 
    3461        1826 :     ts_dir = AllocateDir(tsdirname);
    3462             : 
    3463       11392 :     while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
    3464             :     {
    3465             :         /*
    3466             :          * We're only interested in the per-database directories, which have
    3467             :          * numeric names.  Note that this code will also (properly) ignore "."
    3468             :          * and "..".
    3469             :          */
    3470        9566 :         if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
    3471        3782 :             continue;
    3472             : 
    3473        5784 :         snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
    3474        5784 :                  tsdirname, de->d_name);
    3475        5784 :         RemovePgTempRelationFilesInDbspace(dbspace_path);
    3476             :     }
    3477             : 
    3478        1826 :     FreeDir(ts_dir);
    3479        1826 : }
    3480             : 
    3481             : /*
                     :  * Process one per-dbspace directory for RemovePgTempRelationFiles
                     :  *
                     :  * Every entry of dbspacedirname whose name is accepted by
                     :  * looks_like_temp_rel_name() is removed with unlink().  A failed unlink()
                     :  * is reported at LOG level and the scan continues with the next entry.
                     :  * Entries with other names are left untouched.
                     :  */
    3482             : static void
    3483        5784 : RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
    3484             : {
    3485             :     DIR        *dbspace_dir;
    3486             :     struct dirent *de;
    3487             :     char        rm_path[MAXPGPATH * 2];
    3488             : 
    3489        5784 :     dbspace_dir = AllocateDir(dbspacedirname);
    3490             : 
    3491     1760314 :     while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
    3492             :     {
    3493     1754530 :         if (!looks_like_temp_rel_name(de->d_name))
    3494     1754512 :             continue;
    3495             : 
    3496          18 :         snprintf(rm_path, sizeof(rm_path), "%s/%s",
    3497          18 :                  dbspacedirname, de->d_name);
    3498             : 
    3499          18 :         if (unlink(rm_path) < 0)
    3500           0 :             ereport(LOG,
    3501             :                     (errcode_for_file_access(),
    3502             :                      errmsg("could not remove file \"%s\": %m",
    3503             :                             rm_path)));
    3504             :     }
    3505             : 
    3506        5784 :     FreeDir(dbspace_dir);
    3507        5784 : }
    3508             : 
    3509             : /*
                     :  * Does "name" look like the file name of a temporary relation?
                     :  *
                     :  * Accepted forms: t<digits>_<digits>, or t<digits>_<digits>_<forkname>,
                     :  * each optionally followed by .<digits> (a segment suffix).  Both digit
                     :  * strings and the segment digits must be non-empty, and nothing may
                     :  * follow the last recognized component.
                     :  *
                     :  * The fork name is recognized by forkname_chars(), whose positive return
                     :  * value is used as the number of characters to skip; a result <= 0
                     :  * rejects the name.
                     :  *
                     :  * Returns true if the whole string matches, false otherwise.
                     :  */
    3510             : bool
    3511     2345990 : looks_like_temp_rel_name(const char *name)
    3512             : {
    3513             :     int         pos;
    3514             :     int         savepos;
    3515             : 
    3516             :     /* Must start with "t". */
    3517     2345990 :     if (name[0] != 't')
    3518     2345900 :         return false;
    3519             : 
    3520             :     /* Followed by a non-empty string of digits and then an underscore. */
    3521         412 :     for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
    3522             :         ;
    3523          90 :     if (pos == 1 || name[pos] != '_')
    3524           0 :         return false;
    3525             : 
    3526             :     /* Followed by another nonempty string of digits. */
    3527         462 :     for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
    3528             :         ;
    3529          90 :     if (savepos == pos)
    3530           0 :         return false;
    3531             : 
    3532             :     /* We might have _forkname or .segment or both. */
    3533          90 :     if (name[pos] == '_')
    3534             :     {
    3535          40 :         int         forkchar = forkname_chars(&name[pos + 1], NULL);
    3536             : 
    3537          40 :         if (forkchar <= 0)
    3538           0 :             return false;
    3539          40 :         pos += forkchar + 1;    /* skip the underscore plus the fork name */
    3540             :     }
    3541          90 :     if (name[pos] == '.')
    3542             :     {
    3543             :         int         segchar;
    3544             : 
    3545          80 :         for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
    3546             :             ;
    3547          40 :         if (segchar <= 1)       /* a bare "." with no digits is rejected */
    3548           0 :             return false;
    3549          40 :         pos += segchar;
    3550             :     }
    3551             : 
    3552             :     /* Now we should be at the end. */
    3553          90 :     if (name[pos] != '\0')
    3554           0 :         return false;
    3555          90 :     return true;
    3556             : }
    3557             : /*
                     :  * do_syncfs
                     :  *
                     :  * Report startup progress for "path", open it read-only with
                     :  * OpenTransientFile(), and call syncfs() on the resulting descriptor.
                     :  *
                     :  * Failure to open the path, or a failing syncfs() call, is reported at
                     :  * LOG level only; nothing is returned to the caller.  If the open fails
                     :  * the function returns without calling syncfs().  The return value of
                     :  * CloseTransientFile() is not checked.
                     :  *
                     :  * Only compiled when HAVE_SYNCFS is defined.
                     :  */
    3558             : #ifdef HAVE_SYNCFS
    3559             : static void
    3560           0 : do_syncfs(const char *path)
    3561             : {
    3562             :     int         fd;
    3563             : 
    3564           0 :     ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
    3565             :                              path);
    3566             : 
    3567           0 :     fd = OpenTransientFile(path, O_RDONLY);
    3568           0 :     if (fd < 0)
    3569             :     {
    3570           0 :         ereport(LOG,
    3571             :                 (errcode_for_file_access(),
    3572             :                  errmsg("could not open file \"%s\": %m", path)));
    3573           0 :         return;
    3574             :     }
    3575           0 :     if (syncfs(fd) < 0)
    3576           0 :         ereport(LOG,
    3577             :                 (errcode_for_file_access(),
    3578             :                  errmsg("could not synchronize file system for file \"%s\": %m", path)));
    3579           0 :     CloseTransientFile(fd);
    3580             : }
    3581             : #endif                          /* HAVE_SYNCFS */
    3582             : 
    3583             : /*
    3584             :  * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
    3585             :  * all potential filesystems, depending on recovery_init_sync_method setting.
    3586             :  *
    3587             :  * We fsync regular files and directories wherever they are, but we
    3588             :  * follow symlinks only for pg_wal and immediately under pg_tblspc.
    3589             :  * Other symlinks are presumed to point at files we're not responsible
    3590             :  * for fsyncing, and might not have privileges to write at all.
    3591             :  *
    3592             :  * Errors are logged but not considered fatal; that's because this is used
    3593             :  * only during database startup, to deal with the possibility that there are
    3594             :  * issued-but-unsynced writes pending against the data directory.  We want to
    3595             :  * ensure that such writes reach disk before anything that's done in the new
    3596             :  * run.  However, aborting on error would result in failure to start for
    3597             :  * harmless cases such as read-only files in the data directory, and that's
    3598             :  * not good either.
    3599             :  *
    3600             :  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
    3601             :  * rewriting all changes again during recovery.
    3602             :  *
    3603             :  * Note we assume we're chdir'd into PGDATA to begin with.
    3604             :  */
    3605             : void
    3606         348 : SyncDataDirectory(void)
    3607             : {
    3608             :     bool        xlog_is_symlink;
    3609             : 
    3610             :     /* We can skip this whole thing if fsync is disabled. */
    3611         348 :     if (!enableFsync)
    3612         348 :         return;
    3613             : 
    3614             :     /*
    3615             :      * If pg_wal is a symlink, we'll need to recurse into it separately,
    3616             :      * because the first walkdir below will ignore it.
    3617             :      */
    3618           0 :     xlog_is_symlink = false;
    3619             : 
    3620             :     {
    3621             :         struct stat st;
    3622             : 
    3623           0 :         if (lstat("pg_wal", &st) < 0)
    3624           0 :             ereport(LOG,
    3625             :                     (errcode_for_file_access(),
    3626             :                      errmsg("could not stat file \"%s\": %m",
    3627             :                             "pg_wal")));
    3628           0 :         else if (S_ISLNK(st.st_mode))
    3629           0 :             xlog_is_symlink = true;
    3630             :     }
    3631             : 
    3632             : #ifdef HAVE_SYNCFS
    3633           0 :     if (recovery_init_sync_method == DATA_DIR_SYNC_METHOD_SYNCFS)
    3634             :     {
    3635             :         DIR        *dir;
    3636             :         struct dirent *de;
    3637             : 
    3638             :         /*
    3639             :          * On Linux, we don't have to open every single file one by one.  We
    3640             :          * can use syncfs() to sync whole filesystems.  We only expect
    3641             :          * filesystem boundaries to exist where we tolerate symlinks, namely
    3642             :          * pg_wal and the tablespaces, so we call syncfs() for each of those
    3643             :          * directories.
    3644             :          */
    3645             : 
    3646             :         /* Prepare to report progress syncing the data directory via syncfs. */
    3647           0 :         begin_startup_progress_phase();
    3648             : 
    3649             :         /* Sync the top level pgdata directory. */
    3650           0 :         do_syncfs(".");
    3651             :         /* If any tablespaces are configured, sync each of those. */
    3652           0 :         dir = AllocateDir(PG_TBLSPC_DIR);
    3653           0 :         while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
    3654             :         {
    3655             :             char        path[MAXPGPATH];
    3656             : 
    3657           0 :             if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
    3658           0 :                 continue;
    3659             : 
    3660           0 :             snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
    3661           0 :             do_syncfs(path);
    3662             :         }
    3663           0 :         FreeDir(dir);
    3664             :         /* If pg_wal is a symlink, process that too. */
    3665           0 :         if (xlog_is_symlink)
    3666           0 :             do_syncfs("pg_wal");
    3667           0 :         return;                 /* skip the per-file passes below */
    3668             :     }
    3669             : #endif                          /* HAVE_SYNCFS */
    3670             : 
    3671             : #ifdef PG_FLUSH_DATA_WORKS
    3672             :     /* Prepare to report progress of the pre-fsync phase. */
    3673           0 :     begin_startup_progress_phase();
    3674             : 
    3675             :     /*
    3676             :      * If possible, hint to the kernel that we're soon going to fsync the data
    3677             :      * directory and its contents.  Errors in this step are even less
    3678             :      * interesting than normal, so log them only at DEBUG1.
    3679             :      */
    3680           0 :     walkdir(".", pre_sync_fname, false, DEBUG1);
    3681           0 :     if (xlog_is_symlink)
    3682           0 :         walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
    3683           0 :     walkdir(PG_TBLSPC_DIR, pre_sync_fname, true, DEBUG1);
    3684             : #endif
    3685             : 
    3686             :     /* Prepare to report progress syncing the data directory via fsync. */
    3687           0 :     begin_startup_progress_phase();
    3688             : 
    3689             :     /*
    3690             :      * Now we do the fsync()s in the same order.
    3691             :      *
    3692             :      * The main call ignores symlinks, so in addition to specially processing
    3693             :      * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
    3694             :      * process_symlinks = true.  Note that if there are any plain directories
    3695             :      * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
    3696             :      * so we don't worry about optimizing it.
    3697             :      */
    3698           0 :     walkdir(".", datadir_fsync_fname, false, LOG);
    3699           0 :     if (xlog_is_symlink)
    3700           0 :         walkdir("pg_wal", datadir_fsync_fname, false, LOG);
    3701           0 :     walkdir(PG_TBLSPC_DIR, datadir_fsync_fname, true, LOG);
    3702             : }
    3703             : 
    3704             : /*
    3705             :  * walkdir: recursively walk a directory, applying the action to each
    3706             :  * regular file and directory (the named directory itself comes last).
    3707             :  *
    3708             :  * If process_symlinks is true, the action and recursion are also applied
    3709             :  * to regular files and directories that are pointed to by symlinks in the
    3710             :  * given directory; otherwise symlinks are ignored.  Symlinks are always
    3711             :  * ignored in subdirectories, ie we intentionally don't pass down the
    3712             :  * process_symlinks flag to recursive calls.
    3713             :  *
    3714             :  * Errors are reported at level elevel, which might be ERROR or less.
    3715             :  *
    3716             :  * See also walkdir in file_utils.c, which is a frontend version of this
    3717             :  * logic.
    3718             :  */
    3719             : static void
    3720         338 : walkdir(const char *path,
    3721             :         void (*action) (const char *fname, bool isdir, int elevel),
    3722             :         bool process_symlinks,
    3723             :         int elevel)
    3724             : {
    3725             :     DIR        *dir;
    3726             :     struct dirent *de;
    3727             : 
    3728         338 :     dir = AllocateDir(path);
    3729             : 
    3730        2946 :     while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
    3731             :     {
    3732             :         char        subpath[MAXPGPATH * 2];
    3733             : 
    3734        2608 :         CHECK_FOR_INTERRUPTS();
    3735             : 
    3736        2608 :         if (strcmp(de->d_name, ".") == 0 ||
    3737        2270 :             strcmp(de->d_name, "..") == 0)
    3738         676 :             continue;
    3739             : 
    3740        1932 :         snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
    3741             : 
    3742        1932 :         switch (get_dirent_type(subpath, de, process_symlinks, elevel))
    3743             :         {
    3744        1932 :             case PGFILETYPE_REG:
    3745        1932 :                 (*action) (subpath, false, elevel);
    3746        1932 :                 break;
    3747           0 :             case PGFILETYPE_DIR:
    3748           0 :                 walkdir(subpath, action, false, elevel);
    3749           0 :                 break;
    3750           0 :             default:
    3751             : 
    3752             :                 /*
    3753             :                  * Errors are already reported directly by get_dirent_type(),
    3754             :                  * and any remaining symlinks and unknown file types are
    3755             :                  * ignored.
    3756             :                  */
    3757           0 :                 break;
    3758             :         }
    3759             :     }
    3760             : 
    3761         338 :     FreeDir(dir);               /* we ignore any error here */
    3762             : 
    3763             :     /*
    3764             :      * It's important to fsync the destination directory itself as individual
    3765             :      * file fsyncs don't guarantee that the directory entry for the file is
    3766             :      * synced.  However, skip this if AllocateDir failed; the action function
    3767             :      * might not be robust against that.
    3768             :      */
    3769         338 :     if (dir)
    3770         338 :         (*action) (path, true, elevel);
    3771         338 : }
    3772             : 
    3773             : 
    3774             : /*
    3775             :  * Hint to the OS that it should get ready to fsync() this file.
    3776             :  *
    3777             :  * Directories are skipped.  Ignores errors trying to open unreadable files
    3778             :  * (EACCES), and logs other errors at a caller-specified level.
    3779             :  */
    3780             : #ifdef PG_FLUSH_DATA_WORKS
    3781             : 
    3782             : static void
    3783           0 : pre_sync_fname(const char *fname, bool isdir, int elevel)
    3784             : {
    3785             :     int         fd;
    3786             : 
    3787             :     /* Don't try to flush directories, it'll likely just fail */
    3788           0 :     if (isdir)
    3789           0 :         return;
    3790             : 
    3791           0 :     ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
    3792             :                              fname);
    3793             : 
    3794           0 :     fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
    3795             : 
    3796           0 :     if (fd < 0)
    3797             :     {
    3798           0 :         if (errno == EACCES)
    3799           0 :             return;
    3800           0 :         ereport(elevel,
    3801             :                 (errcode_for_file_access(),
    3802             :                  errmsg("could not open file \"%s\": %m", fname)));
    3803           0 :         return;
    3804             :     }
    3805             : 
    3806             :     /*
    3807             :      * pg_flush_data() ignores errors, which is ok because this is only a
    3808             :      * hint.
    3809             :      */
    3810           0 :     pg_flush_data(fd, 0, 0);
    3811             : 
    3812           0 :     if (CloseTransientFile(fd) != 0)
    3813           0 :         ereport(elevel,
    3814             :                 (errcode_for_file_access(),
    3815             :                  errmsg("could not close file \"%s\": %m", fname)));
    3816             : }
    3817             : 
    3818             : #endif                          /* PG_FLUSH_DATA_WORKS */
    3819             : /* walkdir action: report startup progress, then fsync the given path. */
    3820             : static void
    3821           0 : datadir_fsync_fname(const char *fname, bool isdir, int elevel)
    3822             : {
    3823           0 :     ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
    3824             :                              fname);
    3825             : 
    3826             :     /*
    3827             :      * We want to silently ignore errors about unreadable files.  Pass that
    3828             :      * desire on to fsync_fname_ext().
    3829             :      */
    3830           0 :     fsync_fname_ext(fname, isdir, true, elevel);
    3831           0 : }
    3832             : /* walkdir-style action: rmdir directories (ENOENT is fine), delete files. */
    3833             : static void
    3834        2270 : unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
    3835             : {
    3836        2270 :     if (isdir)
    3837             :     {
    3838         338 :         if (rmdir(fname) != 0 && errno != ENOENT)
    3839           0 :             ereport(elevel,
    3840             :                     (errcode_for_file_access(),
    3841             :                      errmsg("could not remove directory \"%s\": %m", fname)));
    3842             :     }
    3843             :     else
    3844             :     {
    3845             :         /* Use PathNameDeleteTemporaryFile to report filesize */
    3846        1932 :         PathNameDeleteTemporaryFile(fname, false);
    3847             :     }
    3848        2270 : }
    3849             : 
    3850             : /*
    3851             :  * fsync_fname_ext -- Try to fsync a file or directory
    3852             :  *
    3853             :  * If ignore_perm is true, ignore errors upon trying to open unreadable
    3854             :  * files. Logs other errors at a caller-specified level.
    3855             :  *
    3856             :  * Returns 0 on success or when the error was deliberately ignored, else -1.
    3857             :  */
    3858             : int
    3859       81250 : fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
    3860             : {
    3861             :     int         fd;
    3862             :     int         flags;
    3863             :     int         returncode;
    3864             : 
    3865             :     /*
    3866             :      * Some OSes require directories to be opened read-only whereas other
    3867             :      * systems don't allow us to fsync files opened read-only; so we need both
    3868             :      * cases here.  Using O_RDWR will cause us to fail to fsync files that are
    3869             :      * not writable by our userid, but we assume that's OK.
    3870             :      */
    3871       81250 :     flags = PG_BINARY;
    3872       81250 :     if (!isdir)
    3873       30432 :         flags |= O_RDWR;
    3874             :     else
    3875       50818 :         flags |= O_RDONLY;
    3876             : 
    3877       81250 :     fd = OpenTransientFile(fname, flags);
    3878             : 
    3879             :     /*
    3880             :      * Some OSes don't allow us to open directories at all (Windows returns
    3881             :      * EACCES), so just ignore the error in that case.  If desired, also
    3882             :      * silently ignore errors about unreadable files.  Log others.
    3883             :      */
    3884       81250 :     if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
    3885           0 :         return 0;
    3886       81250 :     else if (fd < 0 && ignore_perm && errno == EACCES)
    3887           0 :         return 0;
    3888       81250 :     else if (fd < 0)
    3889             :     {
    3890           0 :         ereport(elevel,
    3891             :                 (errcode_for_file_access(),
    3892             :                  errmsg("could not open file \"%s\": %m", fname)));
    3893           0 :         return -1;
    3894             :     }
    3895             : 
    3896       81250 :     returncode = pg_fsync(fd);
    3897             : 
    3898             :     /*
    3899             :      * Some OSes don't allow us to fsync directories at all, so we can ignore
    3900             :      * those errors. Anything else needs to be logged.
    3901             :      */
    3902       81250 :     if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
    3903             :     {
    3904             :         int         save_errno;
    3905             : 
    3906             :         /* close file upon error, might not be in transaction context */
    3907           0 :         save_errno = errno;
    3908           0 :         (void) CloseTransientFile(fd);
    3909           0 :         errno = save_errno;
    3910             : 
    3911           0 :         ereport(elevel,
    3912             :                 (errcode_for_file_access(),
    3913             :                  errmsg("could not fsync file \"%s\": %m", fname)));
    3914           0 :         return -1;
    3915             :     }
    3916             : 
    3917       81250 :     if (CloseTransientFile(fd) != 0)
    3918             :     {
    3919           0 :         ereport(elevel,
    3920             :                 (errcode_for_file_access(),
    3921             :                  errmsg("could not close file \"%s\": %m", fname)));
    3922           0 :         return -1;
    3923             :     }
    3924             : 
    3925       81250 :     return 0;
    3926             : }
    3927             : 
    3928             : /*
    3929             :  * fsync_parent_path -- fsync the parent path of a file or directory
    3930             :  *
    3931             :  * This is aimed at making file operations persistent on disk in case of
    3932             :  * an OS crash or power failure.  Returns 0 on success, -1 on failure.
    3933             :  */
    3934             : static int
    3935       15112 : fsync_parent_path(const char *fname, int elevel)
    3936             : {
    3937             :     char        parentpath[MAXPGPATH];
    3938             : 
    3939       15112 :     strlcpy(parentpath, fname, MAXPGPATH);
    3940       15112 :     get_parent_directory(parentpath);
    3941             : 
    3942             :     /*
    3943             :      * get_parent_directory() returns an empty string if the input argument is
    3944             :      * just a file name (see comments in path.c), so handle that as being the
    3945             :      * current directory.
    3946             :      */
    3947       15112 :     if (strlen(parentpath) == 0)
    3948         394 :         strlcpy(parentpath, ".", MAXPGPATH);
    3949             : 
    3950       15112 :     if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
    3951           0 :         return -1;
    3952             : 
    3953       15112 :     return 0;
    3954             : }
    3955             : 
    3956             : /*
    3957             :  * Create a PostgreSQL data sub-directory
    3958             :  *
    3959             :  * The data directory itself, and most of its sub-directories, are created at
    3960             :  * initdb time, but we do have some occasions when we create directories in
    3961             :  * the backend (CREATE TABLESPACE, for example).  In those cases, we want to
    3962             :  * make sure that those directories are created consistently.  Today, that means
    3963             :  * making sure that the created directory has the correct permissions, which is
    3964             :  * what pg_dir_create_mode tracks for us.  Returns mkdir()'s result.
    3965             :  *
    3966             :  * Note that we also set the umask() based on what we understand the correct
    3967             :  * permissions to be (see file_perm.c).
    3968             :  *
    3969             :  * For permissions other than the default, mkdir() can be used directly, but
    3970             :  * be sure to consider carefully such cases -- a sub-directory with incorrect
    3971             :  * permissions in a PostgreSQL data directory could cause backups and other
    3972             :  * processes to fail.
    3973             :  */
    3974             : int
    3975        2888 : MakePGDirectory(const char *directoryName)
    3976             : {
    3977        2888 :     return mkdir(directoryName, pg_dir_create_mode);
    3978             : }
    3979             : 
    3980             : /*
    3981             :  * Return the passed-in error level if data_sync_retry is on, else PANIC.
    3982             :  *
    3983             :  * Failure to fsync any data file is cause for immediate panic, unless
    3984             :  * data_sync_retry is enabled.  Data may have been written to the operating
    3985             :  * system and removed from our buffer pool already, and if we are running on
    3986             :  * an operating system that forgets dirty data on write-back failure, there
    3987             :  * may be only one copy of the data remaining: in the WAL.  A later attempt to
    3988             :  * fsync again might falsely report success.  Therefore we must not allow any
    3989             :  * further checkpoints to be attempted.  data_sync_retry can in theory be
    3990             :  * enabled on systems known not to drop dirty buffered data on write-back
    3991             :  * failure (with the likely outcome that checkpoints will continue to fail
    3992             :  * until the underlying problem is fixed).
    3993             :  *
    3994             :  * Any code that reports a failure from fsync() or related functions should
    3995             :  * filter the error level with this function.
    3996             :  */
    3997             : int
    3998       40930 : data_sync_elevel(int elevel)
    3999             : {
    4000       40930 :     return data_sync_retry ? elevel : PANIC;
    4001             : }
    4002             : /* Validate a debug_io_direct value; on success store its flags in *extra. */
    4003             : bool
    4004        2204 : check_debug_io_direct(char **newval, void **extra, GucSource source)
    4005             : {
    4006        2204 :     bool        result = true;
    4007             :     int         flags;
    4008             : 
    4009             : #if PG_O_DIRECT == 0
    4010             :     if (strcmp(*newval, "") != 0)
    4011             :     {
    4012             :         GUC_check_errdetail("\"%s\" is not supported on this platform.",
    4013             :                             "debug_io_direct");
    4014             :         result = false;
    4015             :     }
    4016             :     flags = 0;
    4017             : #else                           /* PG_O_DIRECT != 0 */
    4018             :     List       *elemlist;
    4019             :     ListCell   *l;
    4020             :     char       *rawstring;
    4021             : 
    4022             :     /* Need a modifiable copy of string */
    4023        2204 :     rawstring = pstrdup(*newval);
    4024             : 
    4025        2204 :     if (!SplitGUCList(rawstring, ',', &elemlist))
    4026             :     {
    4027           0 :         GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
    4028             :                             "debug_io_direct");
    4029           0 :         pfree(rawstring);
    4030           0 :         list_free(elemlist);
    4031           0 :         return false;
    4032             :     }
    4033             : 
    4034        2204 :     flags = 0;
    4035        2216 :     foreach(l, elemlist)
    4036             :     {
    4037          12 :         char       *item = (char *) lfirst(l);
    4038             : 
    4039          12 :         if (pg_strcasecmp(item, "data") == 0)
    4040           4 :             flags |= IO_DIRECT_DATA;
    4041           8 :         else if (pg_strcasecmp(item, "wal") == 0)
    4042           4 :             flags |= IO_DIRECT_WAL;
    4043           4 :         else if (pg_strcasecmp(item, "wal_init") == 0)
    4044           4 :             flags |= IO_DIRECT_WAL_INIT;
    4045             :         else
    4046             :         {
    4047           0 :             GUC_check_errdetail("Invalid option \"%s\".", item);
    4048           0 :             result = false;
    4049           0 :             break;
    4050             :         }
    4051             :     }
    4052             : 
    4053             :     /*
    4054             :      * It's possible to configure block sizes smaller than our assumed I/O
    4055             :      * alignment size, which could result in invalid I/O requests.
    4056             :      */
    4057             : #if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
    4058             :     if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
    4059             :     {
    4060             :         GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
    4061             :                             "debug_io_direct", "XLOG_BLCKSZ");
    4062             :         result = false;
    4063             :     }
    4064             : #endif
    4065             : #if BLCKSZ < PG_IO_ALIGN_SIZE
    4066             :     if (result && (flags & IO_DIRECT_DATA))
    4067             :     {
    4068             :         GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
    4069             :                             "debug_io_direct", "BLCKSZ");
    4070             :         result = false;
    4071             :     }
    4072             : #endif
    4073             : 
    4074        2204 :     pfree(rawstring);
    4075        2204 :     list_free(elemlist);
    4076             : #endif                          /* PG_O_DIRECT == 0 */
    4077             : 
    4078        2204 :     if (!result)
    4079           0 :         return result;
    4080             : 
    4081             :     /* Save the flags in *extra, for use by assign_debug_io_direct */
    4082        2204 :     *extra = guc_malloc(LOG, sizeof(int));
    4083        2204 :     if (!*extra)
    4084           0 :         return false;
    4085        2204 :     *((int *) *extra) = flags;
    4086             : 
    4087        2204 :     return result;
    4088             : }
    4089             : /* Copy the flags saved by check_debug_io_direct into io_direct_flags. */
    4090             : void
    4091        2204 : assign_debug_io_direct(const char *newval, void *extra)
    4092             : {
    4093        2204 :     int        *flags = (int *) extra;
    4094             : 
    4095        2204 :     io_direct_flags = *flags;
    4096        2204 : }
    4097             : 
    4098             : /* ResourceOwner callbacks */
    4099             : /* Release callback: clear the Vfd's resowner, then FileClose() the file. */
    4100             : static void
    4101           8 : ResOwnerReleaseFile(Datum res)
    4102             : {
    4103           8 :     File        file = (File) DatumGetInt32(res);
    4104             :     Vfd        *vfdP;
    4105             : 
    4106             :     Assert(FileIsValid(file));
    4107             : 
    4108           8 :     vfdP = &VfdCache[file];
    4109           8 :     vfdP->resowner = NULL;
    4110             : 
    4111           8 :     FileClose(file);
    4112           8 : }
    4113             : /* Print callback: return a psprintf() label naming the File number. */
    4114             : static char *
    4115           0 : ResOwnerPrintFile(Datum res)
    4116             : {
    4117           0 :     return psprintf("File %d", DatumGetInt32(res));
    4118             : }

Generated by: LCOV version 1.16