LCOV - code coverage report
Current view: top level - src/backend/storage/file - fd.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 71.0 % 994 706
Test Date: 2026-03-23 14:16:05 Functions: 90.9 % 99 90
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * fd.c
       4              :  *    Virtual file descriptor code.
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  * Portions Copyright (c) 1994, Regents of the University of California
       8              :  *
       9              :  * IDENTIFICATION
      10              :  *    src/backend/storage/file/fd.c
      11              :  *
      12              :  * NOTES:
      13              :  *
      14              :  * This code manages a cache of 'virtual' file descriptors (VFDs).
      15              :  * The server opens many file descriptors for a variety of reasons,
      16              :  * including base tables, scratch files (e.g., sort and hash spool
      17              :  * files), and random calls to C library routines like system(3); it
      18              :  * is quite easy to exceed system limits on the number of open files a
      19              :  * single process can have.  (This is around 1024 on many modern
      20              :  * operating systems, but may be lower on others.)
      21              :  *
      22              :  * VFDs are managed as an LRU pool, with actual OS file descriptors
      23              :  * being opened and closed as needed.  Obviously, if a file is
      24              :  * opened using these interfaces, all subsequent operations must also
      25              :  * be through these interfaces (the File type is not a real file
      26              :  * descriptor).
      27              :  *
      28              :  * For this scheme to work, most (if not all) routines throughout the
      29              :  * server should use these interfaces instead of calling the C library
      30              :  * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
      31              :  * may find ourselves short of real file descriptors anyway.
      32              :  *
      33              :  * INTERFACE ROUTINES
      34              :  *
      35              :  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
      36              :  * A File opened with OpenTemporaryFile is automatically deleted when the
      37              :  * File is closed, either explicitly or implicitly at end of transaction or
      38              :  * process exit. PathNameOpenFile is intended for files that are held open
      39              :  * for a long time, like relation files. It is the caller's responsibility
      40              :  * to close them; there is no automatic mechanism in fd.c for that.
      41              :  *
      42              :  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
      43              :  * temporary files that have names so that they can be shared between
      44              :  * backends.  Such files are automatically closed and count against the
      45              :  * temporary file limit of the backend that creates them, but unlike anonymous
      46              :  * files they are not automatically deleted.  See sharedfileset.c for a shared
      47              :  * ownership mechanism that provides automatic cleanup for shared files when
      48              :  * the last of a group of backends detaches.
      49              :  *
      50              :  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
      51              :  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
      52              :  * They behave like the corresponding native functions, except that the handle
      53              :  * is registered with the current subtransaction, and will be automatically
      54              :  * closed at abort. These are intended mainly for short operations like
      55              :  * reading a configuration file; there is a limit on the number of files that
      56              :  * can be opened using these functions at any one time.
      57              :  *
      58              :  * Finally, BasicOpenFile is just a thin wrapper around open() that can
      59              :  * release file descriptors in use by the virtual file descriptors if
      60              :  * necessary. There is no automatic cleanup of file descriptors returned by
      61              :  * BasicOpenFile; it is solely the caller's responsibility to close the file
      62              :  * descriptor by calling close(2).
      63              :  *
      64              :  * If a non-virtual file descriptor needs to be held open for any length of
      65              :  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
      66              :  * (and eventually ReleaseExternalFD), so that we can take it into account
      67              :  * while deciding how many VFDs can be open.  This applies to FDs obtained
      68              :  * with BasicOpenFile as well as those obtained without use of any fd.c API.
      69              :  *
      70              :  *-------------------------------------------------------------------------
      71              :  */
      72              : 
      73              : #include "postgres.h"
      74              : 
      75              : #include <dirent.h>
      76              : #include <sys/file.h>
      77              : #include <sys/param.h>
      78              : #include <sys/resource.h>     /* for getrlimit */
      79              : #include <sys/stat.h>
      80              : #include <sys/types.h>
      81              : #ifndef WIN32
      82              : #include <sys/mman.h>
      83              : #endif
      84              : #include <limits.h>
      85              : #include <unistd.h>
      86              : #include <fcntl.h>
      87              : 
      88              : #include "access/xact.h"
      89              : #include "access/xlog.h"
      90              : #include "catalog/pg_tablespace.h"
      91              : #include "common/file_perm.h"
      92              : #include "common/file_utils.h"
      93              : #include "common/pg_prng.h"
      94              : #include "miscadmin.h"
      95              : #include "pgstat.h"
      96              : #include "postmaster/startup.h"
      97              : #include "storage/aio.h"
      98              : #include "storage/fd.h"
      99              : #include "storage/ipc.h"
     100              : #include "utils/guc.h"
     101              : #include "utils/guc_hooks.h"
     102              : #include "utils/resowner.h"
     103              : #include "utils/varlena.h"
     104              : #include "utils/wait_event.h"
     105              : 
     106              : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
     107              : #if defined(HAVE_SYNC_FILE_RANGE)
     108              : #define PG_FLUSH_DATA_WORKS 1
     109              : #elif !defined(WIN32) && defined(MS_ASYNC)
     110              : #define PG_FLUSH_DATA_WORKS 1
     111              : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
     112              : #define PG_FLUSH_DATA_WORKS 1
     113              : #endif
     114              : 
     115              : /*
     116              :  * We must leave some file descriptors free for system(), the dynamic loader,
     117              :  * and other code that tries to open files without consulting fd.c.  This
     118              :  * is the number left free.  (While we try fairly hard to prevent EMFILE
     119              :  * errors, there's never any guarantee that we won't get ENFILE due to
     120              :  * other processes chewing up FDs.  So it's a bad idea to try to open files
     121              :  * without consulting fd.c.  Nonetheless we cannot control all code.)
     122              :  *
     123              :  * Because this is just a fixed setting, we are effectively assuming that
     124              :  * no such code will leave FDs open over the long term; otherwise the slop
     125              :  * is likely to be insufficient.  Note in particular that we expect that
     126              :  * loading a shared library does not result in any permanent increase in
     127              :  * the number of open files.  (This appears to be true on most if not
     128              :  * all platforms as of Feb 2004.)
     129              :  */
     130              : #define NUM_RESERVED_FDS        10
     131              : 
     132              : /*
     133              :  * If we have fewer than this many usable FDs after allowing for the reserved
     134              :  * ones, choke.  (This value is chosen to work with "ulimit -n 64", but not
     135              :  * much less than that.  Note that this value ensures numExternalFDs can be
     136              :  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
     137              :  * will not pass unless that can grow to at least 14.)
     138              :  */
     139              : #define FD_MINFREE              48
     140              : 
     141              : /*
     142              :  * A number of platforms allow individual processes to open many more files
     143              :  * than they can really support when *many* processes do the same thing.
     144              :  * This GUC parameter lets the DBA limit max_safe_fds to something less than
     145              :  * what the postmaster's initial probe suggests will work.
     146              :  */
     147              : int         max_files_per_process = 1000;
     148              : 
     149              : /*
     150              :  * Maximum number of file descriptors to open for operations that fd.c knows
     151              :  * about (VFDs, AllocateFile etc, or "external" FDs).  This is initialized
     152              :  * to a conservative value, and remains that way indefinitely in bootstrap or
     153              :  * standalone-backend cases.  In normal postmaster operation, the postmaster
     154              :  * calls set_max_safe_fds() late in initialization to update the value, and
     155              :  * that value is then inherited by forked subprocesses.
     156              :  *
     157              :  * Note: the value of max_files_per_process is taken into account while
     158              :  * setting this variable, and so need not be tested separately.
     159              :  */
     160              : int         max_safe_fds = FD_MINFREE;  /* default if not changed */
     161              : 
     162              : /* Whether it is safe to continue running after fsync() fails. */
     163              : bool        data_sync_retry = false;
     164              : 
     165              : /* How SyncDataDirectory() should do its job. */
     166              : int         recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
     167              : 
     168              : /* How data files should be bulk-extended with zeros. */
     169              : int         file_extend_method = DEFAULT_FILE_EXTEND_METHOD;
     170              : 
     171              : /* Which kinds of files should be opened with PG_O_DIRECT. */
     172              : int         io_direct_flags;
     173              : 
     174              : /* Debugging: DO_DB(A) runs A, preserving errno, only in FDDEBUG builds */
     175              : 
     176              : #ifdef FDDEBUG
     177              : #define DO_DB(A) \
     178              :     do { \
     179              :         int         _do_db_save_errno = errno; \
     180              :         A; \
     181              :         errno = _do_db_save_errno; \
     182              :     } while (0)
     183              : #else                           /* !FDDEBUG: DO_DB(A) discards A entirely */
     184              : #define DO_DB(A) \
     185              :     ((void) 0)
     186              : #endif                          /* FDDEBUG */
     187              : 
     188              : #define VFD_CLOSED (-1)         /* Vfd.fd value meaning "no FD currently assigned" */
     189              : 
     190              : #define FileIsValid(file) \
     191              :     ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)  /* in range, not slot 0, fileName set */
     192              : 
     193              : #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)   /* does no range check on 'file' */
     194              : 
     195              : /* Bit flags that may be set in the fdstate field of struct vfd below: */
     196              : #define FD_DELETE_AT_CLOSE  (1 << 0)  /* T = delete when closed */
     197              : #define FD_CLOSE_AT_EOXACT  (1 << 1)  /* T = close at eoXact */
     198              : #define FD_TEMP_FILE_LIMIT  (1 << 2)  /* T = respect temp_file_limit */
     199              : 
     200              : typedef struct vfd              /* element type of the VfdCache array */
     201              : {
     202              :     int         fd;             /* current FD, or VFD_CLOSED if none */
     203              :     unsigned short fdstate;     /* bitflags for VFD's state (FD_* bits above) */
     204              :     ResourceOwner resowner;     /* owner, for automatic cleanup */
     205              :     File        nextFree;       /* link to next free VFD, if in freelist */
     206              :     File        lruMoreRecently;    /* doubly linked recency-of-use list */
     207              :     File        lruLessRecently;    /* other direction of the same list */
     208              :     pgoff_t     fileSize;       /* current size of file (0 if not temporary) */
     209              :     char       *fileName;       /* name of file, or NULL for unused VFD */
     210              :     /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
     211              :     int         fileFlags;      /* open(2) flags for (re)opening the file */
     212              :     mode_t      fileMode;       /* mode to pass to open(2) */
     213              : } Vfd;
     214              : 
     215              : /*
     216              :  * Virtual File Descriptor array pointer and size.  This grows as
     217              :  * needed.  'File' values are indexes into this array.
     218              :  * Note that VfdCache[0] is not a usable VFD, just a list header.
     219              :  */
     220              : static Vfd *VfdCache;
     221              : static Size SizeVfdCache = 0;
     222              : 
     223              : /*
     224              :  * Number of file descriptors known to be in use by VFD entries.
     225              :  */
     226              : static int  nfile = 0;
     227              : 
     228              : /*
     229              :  * Flag to tell whether it's worth scanning VfdCache looking for temp files
     230              :  * to close
     231              :  */
     232              : static bool have_xact_temporary_files = false;
     233              : 
     234              : /*
     235              :  * Tracks the total size of all temporary files.  Note: when temp_file_limit
     236              :  * is being enforced, this cannot overflow since the limit cannot be more
     237              :  * than INT_MAX kilobytes.  When not enforcing, it could theoretically
     238              :  * overflow, but we don't care.
     239              :  */
     240              : static uint64 temporary_files_size = 0;
     241              : 
     242              : /* Temporary file access initialized and not yet shut down? */
     243              : #ifdef USE_ASSERT_CHECKING
     244              : static bool temporary_files_allowed = false;
     245              : #endif
     246              : 
     247              : /*
     248              :  * List of OS handles opened with AllocateFile, OpenPipeStream, AllocateDir
     249              :  * and OpenTransientFile (the kinds below presumably map to these in order).
     250              :  */
     251              : typedef enum
     252              : {
     253              :     AllocateDescFile,
     254              :     AllocateDescPipe,
     255              :     AllocateDescDir,
     256              :     AllocateDescRawFD,
     257              : } AllocateDescKind;
     258              : 
     259              : typedef struct                  /* element type of the allocatedDescs array */
     260              : {
     261              :     AllocateDescKind kind;      /* presumably selects the valid desc member */
     262              :     SubTransactionId create_subid;  /* NOTE(review): looks like the creating subxact -- confirm */
     263              :     union
     264              :     {
     265              :         FILE       *file;       /* stdio stream */
     266              :         DIR        *dir;        /* directory stream */
     267              :         int         fd;         /* plain integer file descriptor */
     268              :     }           desc;
     269              : } AllocateDesc;
     270              : 
     271              : static int  numAllocatedDescs = 0;
     272              : static int  maxAllocatedDescs = 0;
     273              : static AllocateDesc *allocatedDescs = NULL;
     274              : 
     275              : /*
     276              :  * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
     277              :  */
     278              : static int  numExternalFDs = 0;
     279              : 
     280              : /*
     281              :  * Number of temporary files opened during the current session;
     282              :  * this is used in generation of tempfile names.
     283              :  */
     284              : static long tempFileCounter = 0;
     285              : 
     286              : /*
     287              :  * Array of OIDs of temp tablespaces.  (Some entries may be InvalidOid,
     288              :  * indicating that the current database's default tablespace should be used.)
     289              :  * When numTempTableSpaces is -1, this has not been set in the current
     290              :  * transaction.
     291              :  */
     292              : static Oid *tempTableSpaces = NULL;
     293              : static int  numTempTableSpaces = -1;
     294              : static int  nextTempTableSpace = 0;
     295              : 
     296              : 
     297              : /*--------------------
     298              :  *
     299              :  * Private Routines
     300              :  *
     301              :  * Delete          - delete a file from the Lru ring
     302              :  * LruDelete       - remove a file from the Lru ring and close its FD
     303              :  * Insert          - put a file at the front of the Lru ring
     304              :  * LruInsert       - put a file at the front of the Lru ring and open it
     305              :  * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
     306              :  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
     307              :  * AllocateVfd     - grab a free (or new) file record (from VfdCache)
     308              :  * FreeVfd         - free a file record
     309              :  *
     310              :  * The Least Recently Used ring is a doubly linked list that begins and
     311              :  * ends on element zero.  Element zero is special -- it doesn't represent
     312              :  * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
     313              :  * anchor that shows us the beginning/end of the ring.
     314              :  * Only VFD elements that are currently really open (have an FD assigned) are
     315              :  * in the Lru ring.  Elements that are "virtually" open can be recognized
     316              :  * by having a non-null fileName field.
     317              :  *
     318              :  * example:
     319              :  *
     320              :  *     /--less----\                /---------\
     321              :  *     v           \              v           \
     322              :  *   #0 --more---> LeastRecentlyUsed --more-\ \
     323              :  *    ^\                                    | |
     324              :  *     \\less--> MostRecentlyUsedFile    <---/ |
     325              :  *      \more---/                    \--less--/
     326              :  *
     327              :  *--------------------
     328              :  */
     329              : static void Delete(File file);
     330              : static void LruDelete(File file);
     331              : static void Insert(File file);
     332              : static int  LruInsert(File file);
     333              : static bool ReleaseLruFile(void);
     334              : static void ReleaseLruFiles(void);
     335              : static File AllocateVfd(void);
     336              : static void FreeVfd(File file);
     337              : 
     338              : static int  FileAccess(File file);
     339              : static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
     340              : static bool reserveAllocatedDesc(void);
     341              : static int  FreeDesc(AllocateDesc *desc);
     342              : 
     343              : static void BeforeShmemExit_Files(int code, Datum arg);
     344              : static void CleanupTempFiles(bool isCommit, bool isProcExit);
     345              : static void RemovePgTempRelationFiles(const char *tsdirname);
     346              : static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
     347              : 
     348              : static void walkdir(const char *path,
     349              :                     void (*action) (const char *fname, bool isdir, int elevel),
     350              :                     bool process_symlinks,
     351              :                     int elevel);
     352              : #ifdef PG_FLUSH_DATA_WORKS
     353              : static void pre_sync_fname(const char *fname, bool isdir, int elevel);
     354              : #endif
     355              : static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
     356              : static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
     357              : 
     358              : static int  fsync_parent_path(const char *fname, int elevel);
     359              : 
     360              : 
     361              : /* ResourceOwner callbacks to hold virtual file descriptors */
     362              : static void ResOwnerReleaseFile(Datum res);
     363              : static char *ResOwnerPrintFile(Datum res);
     364              : 
     365              : static const ResourceOwnerDesc file_resowner_desc = /* used by the Remember/Forget wrappers below */
     366              : {
     367              :     .name = "File",
     368              :     .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
     369              :     .release_priority = RELEASE_PRIO_FILES,
     370              :     .ReleaseResource = ResOwnerReleaseFile, /* declared above; defined outside this excerpt */
     371              :     .DebugPrint = ResOwnerPrintFile /* declared above; defined outside this excerpt */
     372              : };
     373              : 
     374              : /* Convenience wrappers over ResourceOwnerRemember/Forget; a File travels as an int32 Datum */
     375              : static inline void
     376         6135 : ResourceOwnerRememberFile(ResourceOwner owner, File file)
     377              : {                               /* hands 'file' to ResourceOwnerRemember under file_resowner_desc */
     378         6135 :     ResourceOwnerRemember(owner, Int32GetDatum(file), &file_resowner_desc);
     379         6135 : }
     380              : static inline void              /* counterpart of ResourceOwnerRememberFile */
     381         6130 : ResourceOwnerForgetFile(ResourceOwner owner, File file)
     382              : {                               /* hands 'file' to ResourceOwnerForget under file_resowner_desc */
     383         6130 :     ResourceOwnerForget(owner, Int32GetDatum(file), &file_resowner_desc);
     384         6130 : }
     385              : 
     386              : /*
     387              :  * pg_fsync --- do fsync with or without writethrough; returns the callee's result
     388              :  */
     389              : int
     390        70037 : pg_fsync(int fd)
     391              : {
     392              : #if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
     393              :     struct stat st;
     394              : 
     395              :     /*
     396              :      * Some operating system implementations of fsync() have requirements
     397              :      * about the file access modes that were used when their file descriptor
     398              :      * argument was opened, and these requirements differ depending on whether
     399              :      * the file descriptor is for a directory.
     400              :      *
     401              :      * For any file descriptor that may eventually be handed to fsync(), we
     402              :      * should have opened it with access modes that are compatible with
     403              :      * fsync() on all supported systems, otherwise the code may not be
     404              :      * portable, even if it runs ok on the current system.
     405              :      *
     406              :      * We assert here that a descriptor for a file was opened with write
     407              :      * permissions (i.e., not O_RDONLY) and for a directory without write
     408              :      * permissions (O_RDONLY).  Notice that the assertion check is made even
     409              :      * if fsync() is disabled.
     410              :      *
     411              :      * If fstat() fails, ignore it and let the follow-up fsync() complain.
     412              :      */
     413              :     if (fstat(fd, &st) == 0)
     414              :     {
     415              :         int         desc_flags = fcntl(fd, F_GETFL);    /* NOTE(review): a -1 failure result is not checked */
     416              : 
     417              :         desc_flags &= O_ACCMODE;    /* keep only the access-mode bits */
     418              : 
     419              :         if (S_ISDIR(st.st_mode))
     420              :             Assert(desc_flags == O_RDONLY);
     421              :         else
     422              :             Assert(desc_flags != O_RDONLY);
     423              :     }
     424              :     errno = 0;                  /* clear errno after the assertion-only fstat()/fcntl() calls */
     425              : #endif
     426              : 
     427              :     /* #if is to skip the wal_sync_method test if there's no need for it */
     428              : #if defined(HAVE_FSYNC_WRITETHROUGH)
     429              :     if (wal_sync_method == WAL_SYNC_METHOD_FSYNC_WRITETHROUGH)
     430              :         return pg_fsync_writethrough(fd);
     431              :     else
     432              : #endif
     433        70037 :         return pg_fsync_no_writethrough(fd);
     434              : }
     435              : 
     436              : 
     437              : /*
     438              :  * pg_fsync_no_writethrough --- same as fsync except does nothing if
     439              :  *  enableFsync is off (returns 0 in that case); fsync() is retried on EINTR
     440              :  */
     441              : int
     442        70037 : pg_fsync_no_writethrough(int fd)
     443              : {
     444              :     int         rc;
     445              : 
     446        70037 :     if (!enableFsync)
     447        70037 :         return 0;               /* syncing disabled: report success without calling fsync() */
     448              : 
     449            0 : retry:
     450            0 :     rc = fsync(fd);
     451              : 
     452            0 :     if (rc == -1 && errno == EINTR)
     453            0 :         goto retry;             /* call was interrupted: issue it again */
     454              : 
     455            0 :     return rc;                  /* fsync()'s own result; errno is left as fsync() set it */
     456              : }
     457              : 
     458              : /*
     459              :  * pg_fsync_writethrough --- sync via fcntl(F_FULLFSYNC) where that is defined
     460              :  */
     461              : int
     462            0 : pg_fsync_writethrough(int fd)
     463              : {
     464            0 :     if (enableFsync)
     465              :     {
     466              : #if defined(F_FULLFSYNC)
     467              :         return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;  /* normalize result to -1 or 0 */
     468              : #else
     469            0 :         errno = ENOSYS;         /* F_FULLFSYNC is not defined in this build */
     470            0 :         return -1;
     471              : #endif
     472              :     }
     473              :     else
     474            0 :         return 0;               /* enableFsync is off: do nothing, report success */
     475              : }
     476              : 
     477              : /*
     478              :  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
     479              :  */
     480              : int
     481            0 : pg_fdatasync(int fd)
     482              : {
     483              :     int         rc;
     484              : 
     485            0 :     if (!enableFsync)
     486            0 :         return 0;               /* syncing disabled: report success without calling fdatasync() */
     487              : 
     488            0 : retry:
     489            0 :     rc = fdatasync(fd);
     490              : 
     491            0 :     if (rc == -1 && errno == EINTR)
     492            0 :         goto retry;             /* call was interrupted: issue it again */
     493              : 
     494            0 :     return rc;                  /* fdatasync()'s own result */
     495              : }
     496              : 
     497              : /*
     498              :  * pg_file_exists -- check that a file exists.  This requires an absolute path
     499              :  * to the file.  Returns true if stat() succeeds and "name" is not a directory;
     500              :  * false if it is a directory or stat() fails with ENOENT, ENOTDIR or EACCES.
     501              :  * Any other stat() failure is reported with ereport(ERROR).
     502              :  */
     503              : bool
     504        22401 : pg_file_exists(const char *name)
     505              : {
     506              :     struct stat st;
     507              : 
     508              :     Assert(name != NULL);
     509              : 
     510        22401 :     if (stat(name, &st) == 0)
     511        11785 :         return !S_ISDIR(st.st_mode);
     512        10616 :     else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
     513            0 :         ereport(ERROR,
     514              :                 (errcode_for_file_access(),
     515              :                  errmsg("could not access file \"%s\": %m", name)));
     516              : 
     517        10616 :     return false;               /* stat() failed with one of the tolerated errnos */
     518              : }
     519              : 
     520              : /*
     521              :  * pg_flush_data --- advise OS that the described dirty data should be flushed
     522              :  *
     523              :  * offset of 0 with nbytes 0 means that the entire file should be flushed
     524              :  */
     525              : void
     526        41598 : pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes)
     527              : {
     528              :     /*
     529              :      * Right now file flushing is primarily used to make later
     530              :      * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
     531              :      * if fsyncs are disabled - that's a decision we might want to make
     532              :      * configurable at some point.
     533              :      */
     534        41598 :     if (!enableFsync)
     535        41598 :         return;
     536              : 
     537              :     /*
     538              :      * We compile all alternatives that are supported on the current platform,
     539              :      * to find portability problems more easily.
     540              :      */
     541              : #if defined(HAVE_SYNC_FILE_RANGE)
     542              :     {
     543              :         int         rc;
     544              :         static bool not_implemented_by_kernel = false;
     545              : 
     546            0 :         if (not_implemented_by_kernel)
     547            0 :             return;
     548              : 
     549            0 : retry:
     550              : 
     551              :         /*
     552              :          * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
     553              :          * tells the OS that writeback for the specified blocks should be
     554              :          * started, but that we don't want to wait for completion.  Note that
     555              :          * this call might block if too much dirty data exists in the range.
     556              :          * This is the preferable method on OSs supporting it, as it works
     557              :          * reliably when available (contrast to msync()) and doesn't flush out
     558              :          * clean data (like FADV_DONTNEED).
     559              :          */
     560            0 :         rc = sync_file_range(fd, offset, nbytes,
     561              :                              SYNC_FILE_RANGE_WRITE);
     562            0 :         if (rc != 0)
     563              :         {
     564              :             int         elevel;
     565              : 
     566            0 :             if (rc == EINTR)
     567            0 :                 goto retry;
     568              : 
     569              :             /*
     570              :              * For systems that don't have an implementation of
     571              :              * sync_file_range() such as Windows WSL, generate only one
     572              :              * warning and then suppress all further attempts by this process.
     573              :              */
     574            0 :             if (errno == ENOSYS)
     575              :             {
     576            0 :                 elevel = WARNING;
     577            0 :                 not_implemented_by_kernel = true;
     578              :             }
     579              :             else
     580            0 :                 elevel = data_sync_elevel(WARNING);
     581              : 
     582            0 :             ereport(elevel,
     583              :                     (errcode_for_file_access(),
     584              :                      errmsg("could not flush dirty data: %m")));
     585              :         }
     586              : 
     587            0 :         return;
     588              :     }
     589              : #endif
     590              : #if !defined(WIN32) && defined(MS_ASYNC)
     591              :     {
     592              :         void       *p;
     593              :         static int  pagesize = 0;
     594              : 
     595              :         /*
     596              :          * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
     597              :          * writeback. On linux it only does so if MS_SYNC is specified, but
     598              :          * then it does the writeback synchronously. Luckily all common linux
     599              :          * systems have sync_file_range().  This is preferable over
     600              :          * FADV_DONTNEED because it doesn't flush out clean data.
     601              :          *
     602              :          * We map the file (mmap()), tell the kernel to sync back the contents
     603              :          * (msync()), and then remove the mapping again (munmap()).
     604              :          */
     605              : 
     606              :         /* mmap() needs actual length if we want to map whole file */
     607              :         if (offset == 0 && nbytes == 0)
     608              :         {
     609              :             nbytes = lseek(fd, 0, SEEK_END);
     610              :             if (nbytes < 0)
     611              :             {
     612              :                 ereport(WARNING,
     613              :                         (errcode_for_file_access(),
     614              :                          errmsg("could not determine dirty data size: %m")));
     615              :                 return;
     616              :             }
     617              :         }
     618              : 
     619              :         /*
     620              :          * Some platforms reject partial-page mmap() attempts.  To deal with
     621              :          * that, just truncate the request to a page boundary.  If any extra
     622              :          * bytes don't get flushed, well, it's only a hint anyway.
     623              :          */
     624              : 
     625              :         /* fetch pagesize only once */
     626              :         if (pagesize == 0)
     627              :             pagesize = sysconf(_SC_PAGESIZE);
     628              : 
     629              :         /* align length to pagesize, dropping any fractional page */
     630              :         if (pagesize > 0)
     631              :             nbytes = (nbytes / pagesize) * pagesize;
     632              : 
     633              :         /* fractional-page request is a no-op */
     634              :         if (nbytes <= 0)
     635              :             return;
     636              : 
     637              :         /*
     638              :          * mmap could well fail, particularly on 32-bit platforms where there
     639              :          * may simply not be enough address space.  If so, silently fall
     640              :          * through to the next implementation.
     641              :          */
     642              :         if (nbytes <= (pgoff_t) SSIZE_MAX)
     643              :             p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
     644              :         else
     645              :             p = MAP_FAILED;
     646              : 
     647              :         if (p != MAP_FAILED)
     648              :         {
     649              :             int         rc;
     650              : 
     651              :             rc = msync(p, (size_t) nbytes, MS_ASYNC);
     652              :             if (rc != 0)
     653              :             {
     654              :                 ereport(data_sync_elevel(WARNING),
     655              :                         (errcode_for_file_access(),
     656              :                          errmsg("could not flush dirty data: %m")));
     657              :                 /* NB: need to fall through to munmap()! */
     658              :             }
     659              : 
     660              :             rc = munmap(p, (size_t) nbytes);
     661              :             if (rc != 0)
     662              :             {
     663              :                 /* FATAL error because mapping would remain */
     664              :                 ereport(FATAL,
     665              :                         (errcode_for_file_access(),
     666              :                          errmsg("could not munmap() while flushing data: %m")));
     667              :             }
     668              : 
     669              :             return;
     670              :         }
     671              :     }
     672              : #endif
     673              : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
     674              :     {
     675              :         int         rc;
     676              : 
     677              :         /*
     678              :          * Signal the kernel that the passed in range should not be cached
     679              :          * anymore. This has the, desired, side effect of writing out dirty
     680              :          * data, and the, undesired, side effect of likely discarding useful
     681              :          * clean cached blocks.  For the latter reason this is the least
     682              :          * preferable method.
     683              :          */
     684              : 
     685              :         rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
     686              : 
     687              :         if (rc != 0)
     688              :         {
     689              :             /* don't error out, this is just a performance optimization */
     690              :             ereport(WARNING,
     691              :                     (errcode_for_file_access(),
     692              :                      errmsg("could not flush dirty data: %m")));
     693              :         }
     694              : 
     695              :         return;
     696              :     }
     697              : #endif
     698              : }
     699              : 
     700              : /*
     701              :  * Truncate an open file to a given length.
     702              :  */
     703              : static int
     704          672 : pg_ftruncate(int fd, pgoff_t length)
     705              : {
     706              :     int         ret;
     707              : 
     708          672 : retry:
     709          672 :     ret = ftruncate(fd, length);
     710              : 
     711          672 :     if (ret == -1 && errno == EINTR)
     712            0 :         goto retry;
     713              : 
     714          672 :     return ret;
     715              : }
     716              : 
     717              : /*
     718              :  * Truncate a file to a given length by name.
     719              :  */
     720              : int
     721       271654 : pg_truncate(const char *path, pgoff_t length)
     722              : {
     723              :     int         ret;
     724              : #ifdef WIN32
     725              :     int         save_errno;
     726              :     int         fd;
     727              : 
     728              :     fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
     729              :     if (fd >= 0)
     730              :     {
     731              :         ret = pg_ftruncate(fd, length);
     732              :         save_errno = errno;
     733              :         CloseTransientFile(fd);
     734              :         errno = save_errno;
     735              :     }
     736              :     else
     737              :         ret = -1;
     738              : #else
     739              : 
     740       271654 : retry:
     741       271654 :     ret = truncate(path, length);
     742              : 
     743       271654 :     if (ret == -1 && errno == EINTR)
     744            0 :         goto retry;
     745              : #endif
     746              : 
     747       271654 :     return ret;
     748              : }
     749              : 
     750              : /*
     751              :  * fsync_fname -- fsync a file or directory, handling errors properly
     752              :  *
     753              :  * Try to fsync a file or directory. When doing the latter, ignore errors that
     754              :  * indicate the OS just doesn't allow/require fsyncing directories.
     755              :  */
     756              : void
     757        22892 : fsync_fname(const char *fname, bool isdir)
     758              : {
     759        22892 :     fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
     760        22892 : }
     761              : 
     762              : /*
     763              :  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
     764              :  *
     765              :  * This routine ensures that, after returning, the effect of renaming file
     766              :  * persists in case of a crash. A crash while this routine is running will
     767              :  * leave you with either the pre-existing or the moved file in place of the
     768              :  * new file; no mixed state or truncated files are possible.
     769              :  *
     770              :  * It does so by using fsync on the old filename and the possibly existing
     771              :  * target filename before the rename, and the target file and directory after.
     772              :  *
     773              :  * Note that rename() cannot be used across arbitrary directories, as they
     774              :  * might not be on the same filesystem. Therefore this routine does not
     775              :  * support renaming across directories.
     776              :  *
     777              :  * Log errors with the caller specified severity.
     778              :  *
     779              :  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
     780              :  * valid upon return.
     781              :  */
     782              : int
     783         6734 : durable_rename(const char *oldfile, const char *newfile, int elevel)
     784              : {
     785              :     int         fd;
     786              : 
     787              :     /*
     788              :      * First fsync the old and target path (if it exists), to ensure that they
     789              :      * are properly persistent on disk. Syncing the target file is not
     790              :      * strictly necessary, but it makes it easier to reason about crashes;
     791              :      * because it's then guaranteed that either source or target file exists
     792              :      * after a crash.
     793              :      */
     794         6734 :     if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
     795            0 :         return -1;
     796              : 
     797         6734 :     fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
     798         6734 :     if (fd < 0)
     799              :     {
     800         4661 :         if (errno != ENOENT)
     801              :         {
     802            0 :             ereport(elevel,
     803              :                     (errcode_for_file_access(),
     804              :                      errmsg("could not open file \"%s\": %m", newfile)));
     805            0 :             return -1;
     806              :         }
     807              :     }
     808              :     else
     809              :     {
     810         2073 :         if (pg_fsync(fd) != 0)
     811              :         {
     812              :             int         save_errno;
     813              : 
     814              :             /* close file upon error, might not be in transaction context */
     815            0 :             save_errno = errno;
     816            0 :             CloseTransientFile(fd);
     817            0 :             errno = save_errno;
     818              : 
     819            0 :             ereport(elevel,
     820              :                     (errcode_for_file_access(),
     821              :                      errmsg("could not fsync file \"%s\": %m", newfile)));
     822            0 :             return -1;
     823              :         }
     824              : 
     825         2073 :         if (CloseTransientFile(fd) != 0)
     826              :         {
     827            0 :             ereport(elevel,
     828              :                     (errcode_for_file_access(),
     829              :                      errmsg("could not close file \"%s\": %m", newfile)));
     830            0 :             return -1;
     831              :         }
     832              :     }
     833              : 
     834              :     /* Time to do the real deal... */
     835         6734 :     if (rename(oldfile, newfile) < 0)
     836              :     {
     837            0 :         ereport(elevel,
     838              :                 (errcode_for_file_access(),
     839              :                  errmsg("could not rename file \"%s\" to \"%s\": %m",
     840              :                         oldfile, newfile)));
     841            0 :         return -1;
     842              :     }
     843              : 
     844              :     /*
     845              :      * To guarantee renaming the file is persistent, fsync the file with its
     846              :      * new name, and its containing directory.
     847              :      */
     848         6734 :     if (fsync_fname_ext(newfile, false, false, elevel) != 0)
     849            0 :         return -1;
     850              : 
     851         6734 :     if (fsync_parent_path(newfile, elevel) != 0)
     852            0 :         return -1;
     853              : 
     854         6734 :     return 0;
     855              : }
     856              : 
     857              : /*
     858              :  * durable_unlink -- remove a file in a durable manner
     859              :  *
     860              :  * This routine ensures that, after returning, the effect of removing file
     861              :  * persists in case of a crash. A crash while this routine is running will
     862              :  * leave the system in no mixed state.
     863              :  *
     864              :  * It does so by using fsync on the parent directory of the file after the
     865              :  * actual removal is done.
     866              :  *
     867              :  * Log errors with the severity specified by caller.
     868              :  *
     869              :  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
     870              :  * valid upon return.
     871              :  */
     872              : int
     873         1345 : durable_unlink(const char *fname, int elevel)
     874              : {
     875         1345 :     if (unlink(fname) < 0)
     876              :     {
     877           43 :         ereport(elevel,
     878              :                 (errcode_for_file_access(),
     879              :                  errmsg("could not remove file \"%s\": %m",
     880              :                         fname)));
     881           43 :         return -1;
     882              :     }
     883              : 
     884              :     /*
     885              :      * To guarantee that the removal of the file is persistent, fsync its
     886              :      * parent directory.
     887              :      */
     888         1302 :     if (fsync_parent_path(fname, elevel) != 0)
     889            0 :         return -1;
     890              : 
     891         1302 :     return 0;
     892              : }
     893              : 
     894              : /*
     895              :  * InitFileAccess --- initialize this module during backend startup
     896              :  *
     897              :  * This is called during either normal or standalone backend start.
     898              :  * It is *not* called in the postmaster.
     899              :  *
     900              :  * Note that this does not initialize temporary file access, that is
     901              :  * separately initialized via InitTemporaryFileAccess().
     902              :  */
     903              : void
     904        24251 : InitFileAccess(void)
     905              : {
     906              :     Assert(SizeVfdCache == 0);  /* call me only once */
     907              : 
     908              :     /* initialize cache header entry */
     909        24251 :     VfdCache = (Vfd *) malloc(sizeof(Vfd));
     910        24251 :     if (VfdCache == NULL)
     911            0 :         ereport(FATAL,
     912              :                 (errcode(ERRCODE_OUT_OF_MEMORY),
     913              :                  errmsg("out of memory")));
     914              : 
     915       194008 :     MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
     916        24251 :     VfdCache->fd = VFD_CLOSED;
     917              : 
     918        24251 :     SizeVfdCache = 1;
     919        24251 : }
     920              : 
     921              : /*
     922              :  * InitTemporaryFileAccess --- initialize temporary file access during startup
     923              :  *
     924              :  * This is called during either normal or standalone backend start.
     925              :  * It is *not* called in the postmaster.
     926              :  *
     927              :  * This is separate from InitFileAccess() because temporary file cleanup can
     928              :  * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
     929              :  * our reporting has to happen before that. Low level file access should be
     930              :  * available for longer, hence the separate initialization / shutdown of
     931              :  * temporary file handling.
     932              :  */
     933              : void
     934        24251 : InitTemporaryFileAccess(void)
     935              : {
     936              :     Assert(SizeVfdCache != 0);  /* InitFileAccess() needs to have run */
     937              :     Assert(!temporary_files_allowed);   /* call me only once */
     938              : 
     939              :     /*
     940              :      * Register before-shmem-exit hook to ensure temp files are dropped while
     941              :      * we can still report stats.
     942              :      */
     943        24251 :     before_shmem_exit(BeforeShmemExit_Files, 0);
     944              : 
     945              : #ifdef USE_ASSERT_CHECKING
     946              :     temporary_files_allowed = true;
     947              : #endif
     948        24251 : }
     949              : 
     950              : /*
     951              :  * count_usable_fds --- count how many FDs the system will let us open,
     952              :  *      and estimate how many are already open.
     953              :  *
     954              :  * We stop counting if usable_fds reaches max_to_probe.  Note: a small
     955              :  * value of max_to_probe might result in an underestimate of already_open;
     956              :  * we must fill in any "gaps" in the set of used FDs before the calculation
     957              :  * of already_open will give the right answer.  In practice, max_to_probe
     958              :  * of a couple of dozen should be enough to ensure good results.
     959              :  *
     960              :  * We assume stderr (FD 2) is available for dup'ing.  While the calling
     961              :  * script could theoretically close that, it would be a really bad idea,
     962              :  * since then one risks loss of error messages from, e.g., libc.
     963              :  */
     964              : static void
     965         1175 : count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
     966              : {
     967              :     int        *fd;
     968              :     int         size;
     969         1175 :     int         used = 0;
     970         1175 :     int         highestfd = 0;
     971              :     int         j;
     972              : 
     973              : #ifdef HAVE_GETRLIMIT
     974              :     struct rlimit rlim;
     975              :     int         getrlimit_status;
     976              : #endif
     977              : 
     978         1175 :     size = 1024;
     979         1175 :     fd = (int *) palloc(size * sizeof(int));
     980              : 
     981              : #ifdef HAVE_GETRLIMIT
     982         1175 :     getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
     983         1175 :     if (getrlimit_status != 0)
     984            0 :         ereport(WARNING, (errmsg("getrlimit failed: %m")));
     985              : #endif                          /* HAVE_GETRLIMIT */
     986              : 
     987              :     /* dup until failure or probe limit reached */
     988              :     for (;;)
     989      1173825 :     {
     990              :         int         thisfd;
     991              : 
     992              : #ifdef HAVE_GETRLIMIT
     993              : 
     994              :         /*
     995              :          * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
     996              :          * some platforms
     997              :          */
     998      1175000 :         if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
     999            0 :             break;
    1000              : #endif
    1001              : 
    1002      1175000 :         thisfd = dup(2);
    1003      1175000 :         if (thisfd < 0)
    1004              :         {
    1005              :             /* Expect EMFILE or ENFILE, else it's fishy */
    1006            0 :             if (errno != EMFILE && errno != ENFILE)
    1007            0 :                 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
    1008            0 :             break;
    1009              :         }
    1010              : 
    1011      1175000 :         if (used >= size)
    1012              :         {
    1013            0 :             size *= 2;
    1014            0 :             fd = (int *) repalloc(fd, size * sizeof(int));
    1015              :         }
    1016      1175000 :         fd[used++] = thisfd;
    1017              : 
    1018      1175000 :         if (highestfd < thisfd)
    1019      1175000 :             highestfd = thisfd;
    1020              : 
    1021      1175000 :         if (used >= max_to_probe)
    1022         1175 :             break;
    1023              :     }
    1024              : 
    1025              :     /* release the files we opened */
    1026      1176175 :     for (j = 0; j < used; j++)
    1027      1175000 :         close(fd[j]);
    1028              : 
    1029         1175 :     pfree(fd);
    1030              : 
    1031              :     /*
    1032              :      * Return results.  usable_fds is just the number of successful dups. We
    1033              :      * assume that the system limit is highestfd+1 (remember 0 is a legal FD
    1034              :      * number) and so already_open is highestfd+1 - usable_fds.
    1035              :      */
    1036         1175 :     *usable_fds = used;
    1037         1175 :     *already_open = highestfd + 1 - used;
    1038         1175 : }
    1039              : 
    1040              : /*
    1041              :  * set_max_safe_fds
    1042              :  *      Determine number of file descriptors that fd.c is allowed to use
    1043              :  */
    1044              : void
    1045         1175 : set_max_safe_fds(void)
    1046              : {
    1047              :     int         usable_fds;
    1048              :     int         already_open;
    1049              : 
    1050              :     /*----------
    1051              :      * We want to set max_safe_fds to
    1052              :      *          MIN(usable_fds, max_files_per_process)
    1053              :      * less the slop factor for files that are opened without consulting
    1054              :      * fd.c.  This ensures that we won't allow to open more than
    1055              :      * max_files_per_process, or the experimentally-determined EMFILE limit,
    1056              :      * additional files.
    1057              :      *----------
    1058              :      */
    1059         1175 :     count_usable_fds(max_files_per_process,
    1060              :                      &usable_fds, &already_open);
    1061              : 
    1062         1175 :     max_safe_fds = Min(usable_fds, max_files_per_process);
    1063              : 
    1064              :     /*
    1065              :      * Take off the FDs reserved for system() etc.
    1066              :      */
    1067         1175 :     max_safe_fds -= NUM_RESERVED_FDS;
    1068              : 
    1069              :     /*
    1070              :      * Make sure we still have enough to get by.
    1071              :      */
    1072         1175 :     if (max_safe_fds < FD_MINFREE)
    1073            0 :         ereport(FATAL,
    1074              :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    1075              :                  errmsg("insufficient file descriptors available to start server process"),
    1076              :                  errdetail("System allows %d, server needs at least %d, %d files are already open.",
    1077              :                            max_safe_fds + NUM_RESERVED_FDS,
    1078              :                            FD_MINFREE + NUM_RESERVED_FDS,
    1079              :                            already_open)));
    1080              : 
    1081         1175 :     elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
    1082              :          max_safe_fds, usable_fds, already_open);
    1083         1175 : }
    1084              : 
    1085              : /*
    1086              :  * Open a file with BasicOpenFilePerm() and pass default file mode for the
    1087              :  * fileMode parameter.
    1088              :  */
    1089              : int
    1090        44326 : BasicOpenFile(const char *fileName, int fileFlags)
    1091              : {
    1092        44326 :     return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
    1093              : }
    1094              : 
    1095              : /*
    1096              :  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
    1097              :  *
    1098              :  * This is exported for use by places that really want a plain kernel FD,
    1099              :  * but need to be proof against running out of FDs.  Once an FD has been
    1100              :  * successfully returned, it is the caller's responsibility to ensure that
    1101              :  * it will not be leaked on ereport()!  Most users should *not* call this
    1102              :  * routine directly, but instead use the VFD abstraction level, which
    1103              :  * provides protection against descriptor leaks as well as management of
    1104              :  * files that need to be open for more than a short period of time.
    1105              :  *
    1106              :  * Ideally this should be the *only* direct call of open() in the backend.
    1107              :  * In practice, the postmaster calls open() directly, and there are some
    1108              :  * direct open() calls done early in backend startup.  Those are OK since
    1109              :  * this module wouldn't have any open files to close at that point anyway.
    1110              :  */
    1111              : int
    1112      9612037 : BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
    1113              : {
    1114              :     int         fd;
    1115              : 
    1116      9612037 : tryAgain:
    1117              : #ifdef PG_O_DIRECT_USE_F_NOCACHE
    1118              :     fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
    1119              : #else
    1120      9612037 :     fd = open(fileName, fileFlags, fileMode);
    1121              : #endif
    1122              : 
    1123      9612037 :     if (fd >= 0)
    1124              :     {
    1125              : #ifdef PG_O_DIRECT_USE_F_NOCACHE
    1126              :         if (fileFlags & PG_O_DIRECT)
    1127              :         {
    1128              :             if (fcntl(fd, F_NOCACHE, 1) < 0)
    1129              :             {
    1130              :                 int         save_errno = errno;
    1131              : 
    1132              :                 close(fd);
    1133              :                 errno = save_errno;
    1134              :                 return -1;
    1135              :             }
    1136              :         }
    1137              : #endif
    1138              : 
    1139      9073681 :         return fd;              /* success! */
    1140              :     }
    1141              : 
    1142       538356 :     if (errno == EMFILE || errno == ENFILE)
    1143              :     {
    1144            0 :         int         save_errno = errno;
    1145              : 
    1146            0 :         ereport(LOG,
    1147              :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    1148              :                  errmsg("out of file descriptors: %m; release and retry")));
    1149            0 :         errno = 0;
    1150            0 :         if (ReleaseLruFile())
    1151            0 :             goto tryAgain;
    1152            0 :         errno = save_errno;
    1153              :     }
    1154              : 
    1155       538356 :     return -1;                  /* failure */
    1156              : }
    1157              : 
    1158              : /*
    1159              :  * AcquireExternalFD - attempt to reserve an external file descriptor
    1160              :  *
    1161              :  * This should be used by callers that need to hold a file descriptor open
    1162              :  * over more than a short interval, but cannot use any of the other facilities
    1163              :  * provided by this module.
    1164              :  *
    1165              :  * The difference between this and the underlying ReserveExternalFD function
    1166              :  * is that this will report failure (by setting errno and returning false)
    1167              :  * if "too many" external FDs are already reserved.  This should be used in
    1168              :  * any code where the total number of FDs to be reserved is not predictable
    1169              :  * and small.
    1170              :  */
    1171              : bool
    1172       144638 : AcquireExternalFD(void)
    1173              : {
    1174              :     /*
    1175              :      * We don't want more than max_safe_fds / 3 FDs to be consumed for
    1176              :      * "external" FDs.
    1177              :      */
    1178       144638 :     if (numExternalFDs < max_safe_fds / 3)
    1179              :     {
    1180       144638 :         ReserveExternalFD();
    1181       144638 :         return true;
    1182              :     }
    1183            0 :     errno = EMFILE;
    1184            0 :     return false;
    1185              : }
    1186              : 
    1187              : /*
    1188              :  * ReserveExternalFD - report external consumption of a file descriptor
    1189              :  *
    1190              :  * This should be used by callers that need to hold a file descriptor open
    1191              :  * over more than a short interval, but cannot use any of the other facilities
    1192              :  * provided by this module.  This just tracks the use of the FD and closes
    1193              :  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
    1194              :  *
    1195              :  * Call this directly only in code where failure to reserve the FD would be
    1196              :  * fatal; for example, the WAL-writing code does so, since the alternative is
    1197              :  * session failure.  Also, it's very unwise to do so in code that could
    1198              :  * consume more than one FD per process.
    1199              :  *
    1200              :  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
    1201              :  * available, it doesn't matter too much whether this is called before or
    1202              :  * after actually opening the FD; but doing so beforehand reduces the risk of
    1203              :  * an EMFILE failure if not everybody played nice.  In any case, it's solely
    1204              :  * caller's responsibility to keep the external-FD count in sync with reality.
    1205              :  */
    1206              : void
    1207       230685 : ReserveExternalFD(void)
    1208              : {
    1209              :     /*
    1210              :      * Release VFDs if needed to stay safe.  Because we do this before
    1211              :      * incrementing numExternalFDs, the final state will be as desired, i.e.,
    1212              :      * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
    1213              :      */
    1214       230685 :     ReleaseLruFiles();
    1215              : 
    1216       230685 :     numExternalFDs++;
    1217       230685 : }
    1218              : 
    1219              : /*
    1220              :  * ReleaseExternalFD - report release of an external file descriptor
    1221              :  *
    1222              :  * This is guaranteed not to change errno, so it can be used in failure paths.
    1223              :  */
    1224              : void
    1225       209785 : ReleaseExternalFD(void)
    1226              : {
    1227              :     Assert(numExternalFDs > 0);
    1228       209785 :     numExternalFDs--;
    1229       209785 : }
    1230              : 
    1231              : 
    1232              : #if defined(FDDEBUG)
    1233              : 
    1234              : static void
    1235              : _dump_lru(void)
    1236              : {
    1237              :     int         mru = VfdCache[0].lruLessRecently;
    1238              :     Vfd        *vfdP = &VfdCache[mru];
    1239              :     char        buf[2048];
    1240              : 
    1241              :     snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
    1242              :     while (mru != 0)
    1243              :     {
    1244              :         mru = vfdP->lruLessRecently;
    1245              :         vfdP = &VfdCache[mru];
    1246              :         snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
    1247              :     }
    1248              :     snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
    1249              :     elog(LOG, "%s", buf);
    1250              : }
    1251              : #endif                          /* FDDEBUG */
    1252              : 
    1253              : static void
    1254      1556614 : Delete(File file)
    1255              : {
    1256              :     Vfd        *vfdP;
    1257              : 
    1258              :     Assert(file != 0);
    1259              : 
    1260              :     DO_DB(elog(LOG, "Delete %d (%s)",
    1261              :                file, VfdCache[file].fileName));
    1262              :     DO_DB(_dump_lru());
    1263              : 
    1264      1556614 :     vfdP = &VfdCache[file];
    1265              : 
    1266      1556614 :     VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
    1267      1556614 :     VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
    1268              : 
    1269              :     DO_DB(_dump_lru());
    1270      1556614 : }
    1271              : 
    1272              : static void
    1273         4650 : LruDelete(File file)
    1274              : {
    1275              :     Vfd        *vfdP;
    1276              : 
    1277              :     Assert(file != 0);
    1278              : 
    1279              :     DO_DB(elog(LOG, "LruDelete %d (%s)",
    1280              :                file, VfdCache[file].fileName));
    1281              : 
    1282         4650 :     vfdP = &VfdCache[file];
    1283              : 
    1284         4650 :     pgaio_closing_fd(vfdP->fd);
    1285              : 
    1286              :     /*
    1287              :      * Close the file.  We aren't expecting this to fail; if it does, better
    1288              :      * to leak the FD than to mess up our internal state.
    1289              :      */
    1290         4650 :     if (close(vfdP->fd) != 0)
    1291            0 :         elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
    1292              :              "could not close file \"%s\": %m", vfdP->fileName);
    1293         4650 :     vfdP->fd = VFD_CLOSED;
    1294         4650 :     --nfile;
    1295              : 
    1296              :     /* delete the vfd record from the LRU ring */
    1297         4650 :     Delete(file);
    1298         4650 : }
    1299              : 
    1300              : static void
    1301      2143346 : Insert(File file)
    1302              : {
    1303              :     Vfd        *vfdP;
    1304              : 
    1305              :     Assert(file != 0);
    1306              : 
    1307              :     DO_DB(elog(LOG, "Insert %d (%s)",
    1308              :                file, VfdCache[file].fileName));
    1309              :     DO_DB(_dump_lru());
    1310              : 
    1311      2143346 :     vfdP = &VfdCache[file];
    1312              : 
    1313      2143346 :     vfdP->lruMoreRecently = 0;
    1314      2143346 :     vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
    1315      2143346 :     VfdCache[0].lruLessRecently = file;
    1316      2143346 :     VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
    1317              : 
    1318              :     DO_DB(_dump_lru());
    1319      2143346 : }
    1320              : 
    1321              : /* returns 0 on success, -1 on re-open failure (with errno set) */
    1322              : static int
    1323           41 : LruInsert(File file)
    1324              : {
    1325              :     Vfd        *vfdP;
    1326              : 
    1327              :     Assert(file != 0);
    1328              : 
    1329              :     DO_DB(elog(LOG, "LruInsert %d (%s)",
    1330              :                file, VfdCache[file].fileName));
    1331              : 
    1332           41 :     vfdP = &VfdCache[file];
    1333              : 
    1334           41 :     if (FileIsNotOpen(file))
    1335              :     {
    1336              :         /* Close excess kernel FDs. */
    1337           41 :         ReleaseLruFiles();
    1338              : 
    1339              :         /*
    1340              :          * The open could still fail for lack of file descriptors, eg due to
    1341              :          * overall system file table being full.  So, be prepared to release
    1342              :          * another FD if necessary...
    1343              :          */
    1344           41 :         vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
    1345              :                                      vfdP->fileMode);
    1346           41 :         if (vfdP->fd < 0)
    1347              :         {
    1348              :             DO_DB(elog(LOG, "re-open failed: %m"));
    1349            0 :             return -1;
    1350              :         }
    1351              :         else
    1352              :         {
    1353           41 :             ++nfile;
    1354              :         }
    1355              :     }
    1356              : 
    1357              :     /*
    1358              :      * put it at the head of the Lru ring
    1359              :      */
    1360              : 
    1361           41 :     Insert(file);
    1362              : 
    1363           41 :     return 0;
    1364              : }
    1365              : 
    1366              : /*
    1367              :  * Release one kernel FD by closing the least-recently-used VFD.
    1368              :  */
    1369              : static bool
    1370         4502 : ReleaseLruFile(void)
    1371              : {
    1372              :     DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
    1373              : 
    1374         4502 :     if (nfile > 0)
    1375              :     {
    1376              :         /*
    1377              :          * There are opened files and so there should be at least one used vfd
    1378              :          * in the ring.
    1379              :          */
    1380              :         Assert(VfdCache[0].lruMoreRecently != 0);
    1381         4502 :         LruDelete(VfdCache[0].lruMoreRecently);
    1382         4502 :         return true;            /* freed a file */
    1383              :     }
    1384            0 :     return false;               /* no files available to free */
    1385              : }
    1386              : 
    1387              : /*
    1388              :  * Release kernel FDs as needed to get under the max_safe_fds limit.
    1389              :  * After calling this, it's OK to try to open another file.
    1390              :  */
    1391              : static void
    1392      9948736 : ReleaseLruFiles(void)
    1393              : {
    1394      9953238 :     while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
    1395              :     {
    1396         4502 :         if (!ReleaseLruFile())
    1397            0 :             break;
    1398              :     }
    1399      9948736 : }
    1400              : 
    1401              : static File
    1402      1781520 : AllocateVfd(void)
    1403              : {
    1404              :     Index       i;
    1405              :     File        file;
    1406              : 
    1407              :     DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
    1408              : 
    1409              :     Assert(SizeVfdCache > 0);    /* InitFileAccess not called? */
    1410              : 
    1411      1781520 :     if (VfdCache[0].nextFree == 0)
    1412              :     {
    1413              :         /*
    1414              :          * The free list is empty so it is time to increase the size of the
    1415              :          * array.  We choose to double it each time this happens. However,
    1416              :          * there's not much point in starting *real* small.
    1417              :          */
    1418        30559 :         Size        newCacheSize = SizeVfdCache * 2;
    1419              :         Vfd        *newVfdCache;
    1420              : 
    1421        30559 :         if (newCacheSize < 32)
    1422        21023 :             newCacheSize = 32;
    1423              : 
    1424              :         /*
    1425              :          * Be careful not to clobber VfdCache ptr if realloc fails.
    1426              :          */
    1427        30559 :         newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
    1428        30559 :         if (newVfdCache == NULL)
    1429            0 :             ereport(ERROR,
    1430              :                     (errcode(ERRCODE_OUT_OF_MEMORY),
    1431              :                      errmsg("out of memory")));
    1432        30559 :         VfdCache = newVfdCache;
    1433              : 
    1434              :         /*
    1435              :          * Initialize the new entries and link them into the free list.
    1436              :          */
    1437      1534048 :         for (i = SizeVfdCache; i < newCacheSize; i++)
    1438              :         {
    1439     12027912 :             MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
    1440      1503489 :             VfdCache[i].nextFree = i + 1;
    1441      1503489 :             VfdCache[i].fd = VFD_CLOSED;
    1442              :         }
    1443        30559 :         VfdCache[newCacheSize - 1].nextFree = 0;
    1444        30559 :         VfdCache[0].nextFree = SizeVfdCache;
    1445              : 
    1446              :         /*
    1447              :          * Record the new size
    1448              :          */
    1449        30559 :         SizeVfdCache = newCacheSize;
    1450              :     }
    1451              : 
    1452      1781520 :     file = VfdCache[0].nextFree;
    1453              : 
    1454      1781520 :     VfdCache[0].nextFree = VfdCache[file].nextFree;
    1455              : 
    1456      1781520 :     return file;
    1457              : }
    1458              : 
    1459              : static void
    1460      1190605 : FreeVfd(File file)
    1461              : {
    1462      1190605 :     Vfd        *vfdP = &VfdCache[file];
    1463              : 
    1464              :     DO_DB(elog(LOG, "FreeVfd: %d (%s)",
    1465              :                file, vfdP->fileName ? vfdP->fileName : ""));
    1466              : 
    1467      1190605 :     if (vfdP->fileName != NULL)
    1468              :     {
    1469       659120 :         free(vfdP->fileName);
    1470       659120 :         vfdP->fileName = NULL;
    1471              :     }
    1472      1190605 :     vfdP->fdstate = 0x0;
    1473              : 
    1474      1190605 :     vfdP->nextFree = VfdCache[0].nextFree;
    1475      1190605 :     VfdCache[0].nextFree = file;
    1476      1190605 : }
    1477              : 
    1478              : /* returns 0 on success, -1 on re-open failure (with errno set) */
    1479              : static int
    1480      3392523 : FileAccess(File file)
    1481              : {
    1482              :     int         returnValue;
    1483              : 
    1484              :     DO_DB(elog(LOG, "FileAccess %d (%s)",
    1485              :                file, VfdCache[file].fileName));
    1486              : 
    1487              :     /*
    1488              :      * Is the file open?  If not, open it and put it at the head of the LRU
    1489              :      * ring (possibly closing the least recently used file to get an FD).
    1490              :      */
    1491              : 
    1492      3392523 :     if (FileIsNotOpen(file))
    1493              :     {
    1494           41 :         returnValue = LruInsert(file);
    1495           41 :         if (returnValue != 0)
    1496            0 :             return returnValue;
    1497              :     }
    1498      3392482 :     else if (VfdCache[0].lruLessRecently != file)
    1499              :     {
    1500              :         /*
    1501              :          * We now know that the file is open and that it is not the last one
    1502              :          * accessed, so we need to move it to the head of the Lru ring.
    1503              :          */
    1504              : 
    1505       893270 :         Delete(file);
    1506       893270 :         Insert(file);
    1507              :     }
    1508              : 
    1509      3392523 :     return 0;
    1510              : }
    1511              : 
    1512              : /*
    1513              :  * Called whenever a temporary file is deleted to report its size.
    1514              :  */
    1515              : static void
    1516         3856 : ReportTemporaryFileUsage(const char *path, pgoff_t size)
    1517              : {
    1518         3856 :     pgstat_report_tempfile(size);
    1519              : 
    1520         3856 :     if (log_temp_files >= 0)
    1521              :     {
    1522          842 :         if ((size / 1024) >= log_temp_files)
    1523          132 :             ereport(LOG,
    1524              :                     (errmsg("temporary file: path \"%s\", size %lu",
    1525              :                             path, (unsigned long) size)));
    1526              :     }
    1527         3856 : }
    1528              : 
    1529              : /*
    1530              :  * Called to register a temporary file for automatic close.
    1531              :  * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
    1532              :  * before the file was opened.
    1533              :  */
    1534              : static void
    1535         6135 : RegisterTemporaryFile(File file)
    1536              : {
    1537         6135 :     ResourceOwnerRememberFile(CurrentResourceOwner, file);
    1538         6135 :     VfdCache[file].resowner = CurrentResourceOwner;
    1539              : 
    1540              :     /* Backup mechanism for closing at end of xact. */
    1541         6135 :     VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
    1542         6135 :     have_xact_temporary_files = true;
    1543         6135 : }
    1544              : 
    1545              : /*
    1546              :  *  Called when we get a shared invalidation message on some relation.
    1547              :  */
    1548              : #ifdef NOT_USED
    1549              : void
    1550              : FileInvalidate(File file)
    1551              : {
    1552              :     Assert(FileIsValid(file));
    1553              :     if (!FileIsNotOpen(file))
    1554              :         LruDelete(file);
    1555              : }
    1556              : #endif
    1557              : 
    1558              : /*
    1559              :  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
    1560              :  * fileMode parameter.
    1561              :  */
    1562              : File
    1563      1781520 : PathNameOpenFile(const char *fileName, int fileFlags)
    1564              : {
    1565      1781520 :     return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
    1566              : }
    1567              : 
    1568              : /*
    1569              :  * open a file in an arbitrary directory
    1570              :  *
    1571              :  * NB: if the passed pathname is relative (which it usually is),
    1572              :  * it will be interpreted relative to the process' working directory
    1573              :  * (which should always be $PGDATA when this code is running).
    1574              :  */
    1575              : File
    1576      1781520 : PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
    1577              : {
    1578              :     char       *fnamecopy;
    1579              :     File        file;
    1580              :     Vfd        *vfdP;
    1581              : 
    1582              :     DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
    1583              :                fileName, fileFlags, fileMode));
    1584              : 
    1585              :     /*
    1586              :      * We need a malloc'd copy of the file name; fail cleanly if no room.
    1587              :      */
    1588      1781520 :     fnamecopy = strdup(fileName);
    1589      1781520 :     if (fnamecopy == NULL)
    1590            0 :         ereport(ERROR,
    1591              :                 (errcode(ERRCODE_OUT_OF_MEMORY),
    1592              :                  errmsg("out of memory")));
    1593              : 
    1594      1781520 :     file = AllocateVfd();
    1595      1781520 :     vfdP = &VfdCache[file];
    1596              : 
    1597              :     /* Close excess kernel FDs. */
    1598      1781520 :     ReleaseLruFiles();
    1599              : 
    1600              :     /*
    1601              :      * Descriptors managed by VFDs are implicitly marked O_CLOEXEC.  The
    1602              :      * client shouldn't be expected to know which kernel descriptors are
    1603              :      * currently open, so it wouldn't make sense for them to be inherited by
    1604              :      * executed subprograms.
    1605              :      */
    1606      1781520 :     fileFlags |= O_CLOEXEC;
    1607              : 
    1608      1781520 :     vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
    1609              : 
    1610      1781520 :     if (vfdP->fd < 0)
    1611              :     {
    1612       531485 :         int         save_errno = errno;
    1613              : 
    1614       531485 :         FreeVfd(file);
    1615       531485 :         free(fnamecopy);
    1616       531485 :         errno = save_errno;
    1617       531485 :         return -1;
    1618              :     }
    1619      1250035 :     ++nfile;
    1620              :     DO_DB(elog(LOG, "PathNameOpenFile: success %d",
    1621              :                vfdP->fd));
    1622              : 
    1623      1250035 :     vfdP->fileName = fnamecopy;
    1624              :     /* Saved flags are adjusted to be OK for re-opening file */
    1625      1250035 :     vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
    1626      1250035 :     vfdP->fileMode = fileMode;
    1627      1250035 :     vfdP->fileSize = 0;
    1628      1250035 :     vfdP->fdstate = 0x0;
    1629      1250035 :     vfdP->resowner = NULL;
    1630              : 
    1631      1250035 :     Insert(file);
    1632              : 
    1633      1250035 :     return file;
    1634              : }
    1635              : 
    1636              : /*
    1637              :  * Create directory 'directory'.  If necessary, create 'basedir', which must
    1638              :  * be the directory above it.  This is designed for creating the top-level
    1639              :  * temporary directory on demand before creating a directory underneath it.
    1640              :  * Do nothing if the directory already exists.
    1641              :  *
    1642              :  * Directories created within the top-level temporary directory should begin
    1643              :  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
    1644              :  * deleted at startup by RemovePgTempFiles().  Further subdirectories below
    1645              :  * that do not need any particular prefix.
    1646              : */
    1647              : void
    1648          262 : PathNameCreateTemporaryDir(const char *basedir, const char *directory)
    1649              : {
    1650          262 :     if (MakePGDirectory(directory) < 0)
    1651              :     {
    1652           22 :         if (errno == EEXIST)
    1653            9 :             return;
    1654              : 
    1655              :         /*
    1656              :          * Failed.  Try to create basedir first in case it's missing. Tolerate
    1657              :          * EEXIST to close a race against another process following the same
    1658              :          * algorithm.
    1659              :          */
    1660           13 :         if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
    1661            0 :             ereport(ERROR,
    1662              :                     (errcode_for_file_access(),
    1663              :                      errmsg("cannot create temporary directory \"%s\": %m",
    1664              :                             basedir)));
    1665              : 
    1666              :         /* Try again. */
    1667           13 :         if (MakePGDirectory(directory) < 0 && errno != EEXIST)
    1668            0 :             ereport(ERROR,
    1669              :                     (errcode_for_file_access(),
    1670              :                      errmsg("cannot create temporary subdirectory \"%s\": %m",
    1671              :                             directory)));
    1672              :     }
    1673              : }
    1674              : 
    1675              : /*
    1676              :  * Delete a directory and everything in it, if it exists.
    1677              :  */
    1678              : void
    1679          309 : PathNameDeleteTemporaryDir(const char *dirname)
    1680              : {
    1681              :     struct stat statbuf;
    1682              : 
    1683              :     /* Silently ignore missing directory. */
    1684          309 :     if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
    1685           57 :         return;
    1686              : 
    1687              :     /*
    1688              :      * Currently, walkdir doesn't offer a way for our passed in function to
    1689              :      * maintain state.  Perhaps it should, so that we could tell the caller
    1690              :      * whether this operation succeeded or failed.  Since this operation is
    1691              :      * used in a cleanup path, we wouldn't actually behave differently: we'll
    1692              :      * just log failures.
    1693              :      */
    1694          252 :     walkdir(dirname, unlink_if_exists_fname, false, LOG);
    1695              : }
    1696              : 
    1697              : /*
    1698              :  * Open a temporary file that will disappear when we close it.
    1699              :  *
    1700              :  * This routine takes care of generating an appropriate tempfile name.
    1701              :  * There's no need to pass in fileFlags or fileMode either, since only
    1702              :  * one setting makes any sense for a temp file.
    1703              :  *
    1704              :  * Unless interXact is true, the file is remembered by CurrentResourceOwner
    1705              :  * to ensure it's closed and deleted when it's no longer needed, typically at
    1706              :  * the end-of-transaction. In most cases, you don't want temporary files to
    1707              :  * outlive the transaction that created them, so this should be false -- but
    1708              :  * if you need "somewhat" temporary storage, this might be useful. In either
    1709              :  * case, the file is removed when the File is explicitly closed.
    1710              :  */
    1711              : File
    1712         2001 : OpenTemporaryFile(bool interXact)
    1713              : {
    1714         2001 :     File        file = 0;
    1715              : 
    1716              :     Assert(temporary_files_allowed);    /* check temp file access is up */
    1717              : 
    1718              :     /*
    1719              :      * Make sure the current resource owner has space for this File before we
    1720              :      * open it, if we'll be registering it below.
    1721              :      */
    1722         2001 :     if (!interXact)
    1723         2001 :         ResourceOwnerEnlarge(CurrentResourceOwner);
    1724              : 
    1725              :     /*
    1726              :      * If some temp tablespace(s) have been given to us, try to use the next
    1727              :      * one.  If a given tablespace can't be found, we silently fall back to
    1728              :      * the database's default tablespace.
    1729              :      *
    1730              :      * BUT: if the temp file is slated to outlive the current transaction,
    1731              :      * force it into the database's default tablespace, so that it will not
    1732              :      * pose a threat to possible tablespace drop attempts.
    1733              :      */
    1734         2001 :     if (numTempTableSpaces > 0 && !interXact)
    1735              :     {
    1736            1 :         Oid         tblspcOid = GetNextTempTableSpace();
    1737              : 
    1738            1 :         if (OidIsValid(tblspcOid))
    1739            1 :             file = OpenTemporaryFileInTablespace(tblspcOid, false);
    1740              :     }
    1741              : 
    1742              :     /*
    1743              :      * If not, or if tablespace is bad, create in database's default
    1744              :      * tablespace.  MyDatabaseTableSpace should normally be set before we get
    1745              :      * here, but just in case it isn't, fall back to pg_default tablespace.
    1746              :      */
    1747         2001 :     if (file <= 0)
    1748         2000 :         file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
    1749              :                                              MyDatabaseTableSpace :
    1750              :                                              DEFAULTTABLESPACE_OID,
    1751              :                                              true);
    1752              : 
    1753              :     /* Mark it for deletion at close and temporary file size limit */
    1754         2001 :     VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
    1755              : 
    1756              :     /* Register it with the current resource owner */
    1757         2001 :     if (!interXact)
    1758         2001 :         RegisterTemporaryFile(file);
    1759              : 
    1760         2001 :     return file;
    1761              : }
    1762              : 
    1763              : /*
    1764              :  * Return the path of the temp directory in a given tablespace.
    1765              :  */
    1766              : void
    1767        11766 : TempTablespacePath(char *path, Oid tablespace)
    1768              : {
    1769              :     /*
    1770              :      * Identify the tempfile directory for this tablespace.
    1771              :      *
    1772              :      * If someone tries to specify pg_global, use pg_default instead.
    1773              :      */
    1774        11766 :     if (tablespace == InvalidOid ||
    1775            1 :         tablespace == DEFAULTTABLESPACE_OID ||
    1776              :         tablespace == GLOBALTABLESPACE_OID)
    1777        11765 :         snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
    1778              :     else
    1779              :     {
    1780              :         /* All other tablespaces are accessed via symlinks */
    1781            1 :         snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
    1782              :                  PG_TBLSPC_DIR, tablespace, TABLESPACE_VERSION_DIRECTORY,
    1783              :                  PG_TEMP_FILES_DIR);
    1784              :     }
    1785        11766 : }
    1786              : 
    1787              : /*
    1788              :  * Open a temporary file in a specific tablespace.
    1789              :  * Subroutine for OpenTemporaryFile, which see for details.
    1790              :  */
    1791              : static File
    1792         2001 : OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
    1793              : {
    1794              :     char        tempdirpath[MAXPGPATH];
    1795              :     char        tempfilepath[MAXPGPATH];
    1796              :     File        file;
    1797              : 
    1798         2001 :     TempTablespacePath(tempdirpath, tblspcOid);
    1799              : 
    1800              :     /*
    1801              :      * Generate a tempfile name that should be unique within the current
    1802              :      * database instance.
    1803              :      */
    1804         2001 :     snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
    1805              :              tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
    1806              : 
    1807              :     /*
    1808              :      * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
    1809              :      * temp file that can be reused.
    1810              :      */
    1811         2001 :     file = PathNameOpenFile(tempfilepath,
    1812              :                             O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
    1813         2001 :     if (file <= 0)
    1814              :     {
    1815              :         /*
    1816              :          * We might need to create the tablespace's tempfile directory, if no
    1817              :          * one has yet done so.
    1818              :          *
    1819              :          * Don't check for an error from MakePGDirectory; it could fail if
    1820              :          * someone else just did the same thing.  If it doesn't work then
    1821              :          * we'll bomb out on the second create attempt, instead.
    1822              :          */
    1823          103 :         (void) MakePGDirectory(tempdirpath);
    1824              : 
    1825          103 :         file = PathNameOpenFile(tempfilepath,
    1826              :                                 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
    1827          103 :         if (file <= 0 && rejectError)
    1828            0 :             elog(ERROR, "could not create temporary file \"%s\": %m",
    1829              :                  tempfilepath);
    1830              :     }
    1831              : 
    1832         2001 :     return file;
    1833              : }
    1834              : 
    1835              : 
    1836              : /*
    1837              :  * Create a new file.  The directory containing it must already exist.  Files
    1838              :  * created this way are subject to temp_file_limit and are automatically
    1839              :  * closed at end of transaction, but are not automatically deleted on close
    1840              :  * because they are intended to be shared between cooperating backends.
    1841              :  *
    1842              :  * If the file is inside the top-level temporary directory, its name should
    1843              :  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
    1844              :  * and deleted at startup by RemovePgTempFiles().  Alternatively, it can be
    1845              :  * inside a directory created with PathNameCreateTemporaryDir(), in which case
    1846              :  * the prefix isn't needed.
    1847              :  */
    1848              : File
    1849         2117 : PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
    1850              : {
    1851              :     File        file;
    1852              : 
    1853              :     Assert(temporary_files_allowed);    /* check temp file access is up */
    1854              : 
    1855         2117 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    1856              : 
    1857              :     /*
    1858              :      * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
    1859              :      * temp file that can be reused.
    1860              :      */
    1861         2117 :     file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
    1862         2117 :     if (file <= 0)
    1863              :     {
    1864          262 :         if (error_on_failure)
    1865            0 :             ereport(ERROR,
    1866              :                     (errcode_for_file_access(),
    1867              :                      errmsg("could not create temporary file \"%s\": %m",
    1868              :                             path)));
    1869              :         else
    1870          262 :             return file;
    1871              :     }
    1872              : 
    1873              :     /* Mark it for temp_file_limit accounting. */
    1874         1855 :     VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
    1875              : 
    1876              :     /* Register it for automatic close. */
    1877         1855 :     RegisterTemporaryFile(file);
    1878              : 
    1879         1855 :     return file;
    1880              : }
    1881              : 
    1882              : /*
    1883              :  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
    1884              :  * another backend.  Files opened this way don't count against the
    1885              :  * temp_file_limit of the caller, are automatically closed at the end of the
    1886              :  * transaction but are not deleted on close.
    1887              :  */
    1888              : File
    1889         4831 : PathNameOpenTemporaryFile(const char *path, int mode)
    1890              : {
    1891              :     File        file;
    1892              : 
    1893              :     Assert(temporary_files_allowed);    /* check temp file access is up */
    1894              : 
    1895         4831 :     ResourceOwnerEnlarge(CurrentResourceOwner);
    1896              : 
    1897         4831 :     file = PathNameOpenFile(path, mode | PG_BINARY);
    1898              : 
    1899              :     /* If no such file, then we don't raise an error. */
    1900         4831 :     if (file <= 0 && errno != ENOENT)
    1901            0 :         ereport(ERROR,
    1902              :                 (errcode_for_file_access(),
    1903              :                  errmsg("could not open temporary file \"%s\": %m",
    1904              :                         path)));
    1905              : 
    1906         4831 :     if (file > 0)
    1907              :     {
    1908              :         /* Register it for automatic close. */
    1909         2279 :         RegisterTemporaryFile(file);
    1910              :     }
    1911              : 
    1912         4831 :     return file;
    1913              : }
    1914              : 
    1915              : /*
    1916              :  * Delete a file by pathname.  Return true if the file existed, false if
    1917              :  * didn't.
    1918              :  */
    1919              : bool
    1920         4062 : PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
    1921              : {
    1922              :     struct stat filestats;
    1923              :     int         stat_errno;
    1924              : 
    1925              :     /* Get the final size for pgstat reporting. */
    1926         4062 :     if (stat(path, &filestats) != 0)
    1927         2207 :         stat_errno = errno;
    1928              :     else
    1929         1855 :         stat_errno = 0;
    1930              : 
    1931              :     /*
    1932              :      * Unlike FileClose's automatic file deletion code, we tolerate
    1933              :      * non-existence to support BufFileDeleteFileSet which doesn't know how
    1934              :      * many segments it has to delete until it runs out.
    1935              :      */
    1936         4062 :     if (stat_errno == ENOENT)
    1937         2207 :         return false;
    1938              : 
    1939         1855 :     if (unlink(path) < 0)
    1940              :     {
    1941            0 :         if (errno != ENOENT)
    1942            0 :             ereport(error_on_failure ? ERROR : LOG,
    1943              :                     (errcode_for_file_access(),
    1944              :                      errmsg("could not unlink temporary file \"%s\": %m",
    1945              :                             path)));
    1946            0 :         return false;
    1947              :     }
    1948              : 
    1949         1855 :     if (stat_errno == 0)
    1950         1855 :         ReportTemporaryFileUsage(path, filestats.st_size);
    1951              :     else
    1952              :     {
    1953            0 :         errno = stat_errno;
    1954            0 :         ereport(LOG,
    1955              :                 (errcode_for_file_access(),
    1956              :                  errmsg("could not stat file \"%s\": %m", path)));
    1957              :     }
    1958              : 
    1959         1855 :     return true;
    1960              : }
    1961              : 
    1962              : /*
    1963              :  * close a file when done with it
    1964              :  */
    1965              : void
    1966       659120 : FileClose(File file)
    1967              : {
    1968              :     Vfd        *vfdP;
    1969              : 
    1970              :     Assert(FileIsValid(file));
    1971              : 
    1972              :     DO_DB(elog(LOG, "FileClose: %d (%s)",
    1973              :                file, VfdCache[file].fileName));
    1974              : 
    1975       659120 :     vfdP = &VfdCache[file];
    1976              : 
    1977       659120 :     if (!FileIsNotOpen(file))
    1978              :     {
    1979       658694 :         pgaio_closing_fd(vfdP->fd);
    1980              : 
    1981              :         /* close the file */
    1982       658694 :         if (close(vfdP->fd) != 0)
    1983              :         {
    1984              :             /*
    1985              :              * We may need to panic on failure to close non-temporary files;
    1986              :              * see LruDelete.
    1987              :              */
    1988            0 :             elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
    1989              :                  "could not close file \"%s\": %m", vfdP->fileName);
    1990              :         }
    1991              : 
    1992       658694 :         --nfile;
    1993       658694 :         vfdP->fd = VFD_CLOSED;
    1994              : 
    1995              :         /* remove the file from the lru ring */
    1996       658694 :         Delete(file);
    1997              :     }
    1998              : 
    1999       659120 :     if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
    2000              :     {
    2001              :         /* Subtract its size from current usage (do first in case of error) */
    2002         3856 :         temporary_files_size -= vfdP->fileSize;
    2003         3856 :         vfdP->fileSize = 0;
    2004              :     }
    2005              : 
    2006              :     /*
    2007              :      * Delete the file if it was temporary, and make a log entry if wanted
    2008              :      */
    2009       659120 :     if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
    2010              :     {
    2011              :         struct stat filestats;
    2012              :         int         stat_errno;
    2013              : 
    2014              :         /*
    2015              :          * If we get an error, as could happen within the ereport/elog calls,
    2016              :          * we'll come right back here during transaction abort.  Reset the
    2017              :          * flag to ensure that we can't get into an infinite loop.  This code
    2018              :          * is arranged to ensure that the worst-case consequence is failing to
    2019              :          * emit log message(s), not failing to attempt the unlink.
    2020              :          */
    2021         2001 :         vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
    2022              : 
    2023              : 
    2024              :         /* first try the stat() */
    2025         2001 :         if (stat(vfdP->fileName, &filestats))
    2026            0 :             stat_errno = errno;
    2027              :         else
    2028         2001 :             stat_errno = 0;
    2029              : 
    2030              :         /* in any case do the unlink */
    2031         2001 :         if (unlink(vfdP->fileName))
    2032            0 :             ereport(LOG,
    2033              :                     (errcode_for_file_access(),
    2034              :                      errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
    2035              : 
    2036              :         /* and last report the stat results */
    2037         2001 :         if (stat_errno == 0)
    2038         2001 :             ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
    2039              :         else
    2040              :         {
    2041            0 :             errno = stat_errno;
    2042            0 :             ereport(LOG,
    2043              :                     (errcode_for_file_access(),
    2044              :                      errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
    2045              :         }
    2046              :     }
    2047              : 
    2048              :     /* Unregister it from the resource owner */
    2049       659120 :     if (vfdP->resowner)
    2050         6130 :         ResourceOwnerForgetFile(vfdP->resowner, file);
    2051              : 
    2052              :     /*
    2053              :      * Return the Vfd slot to the free list
    2054              :      */
    2055       659120 :     FreeVfd(file);
    2056       659120 : }
    2057              : 
    2058              : /*
    2059              :  * FilePrefetch - initiate asynchronous read of a given range of the file.
    2060              :  *
    2061              :  * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
    2062              :  *
    2063              :  * posix_fadvise() is the simplest standardized interface that accomplishes
    2064              :  * this.
    2065              :  */
    2066              : int
    2067         8778 : FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
    2068              : {
    2069              :     Assert(FileIsValid(file));
    2070              : 
    2071              :     DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
    2072              :                file, VfdCache[file].fileName,
    2073              :                (int64) offset, (int64) amount));
    2074              : 
    2075              : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
    2076              :     {
    2077              :         int         returnCode;
    2078              : 
    2079         8778 :         returnCode = FileAccess(file);
    2080         8778 :         if (returnCode < 0)
    2081            0 :             return returnCode;
    2082              : 
    2083         8778 : retry:
    2084         8778 :         pgstat_report_wait_start(wait_event_info);
    2085         8778 :         returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
    2086              :                                    POSIX_FADV_WILLNEED);
    2087         8778 :         pgstat_report_wait_end();
    2088              : 
    2089         8778 :         if (returnCode == EINTR)
    2090            0 :             goto retry;
    2091              : 
    2092         8778 :         return returnCode;
    2093              :     }
    2094              : #elif defined(__darwin__)
    2095              :     {
    2096              :         struct radvisory
    2097              :         {
    2098              :             off_t       ra_offset;  /* offset into the file */
    2099              :             int         ra_count;   /* size of the read     */
    2100              :         }           ra;
    2101              :         int         returnCode;
    2102              : 
    2103              :         returnCode = FileAccess(file);
    2104              :         if (returnCode < 0)
    2105              :             return returnCode;
    2106              : 
    2107              :         ra.ra_offset = offset;
    2108              :         ra.ra_count = amount;
    2109              :         pgstat_report_wait_start(wait_event_info);
    2110              :         returnCode = fcntl(VfdCache[file].fd, F_RDADVISE, &ra);
    2111              :         pgstat_report_wait_end();
    2112              :         if (returnCode != -1)
    2113              :             return 0;
    2114              :         else
    2115              :             return errno;
    2116              :     }
    2117              : #else
    2118              :     return 0;
    2119              : #endif
    2120              : }
    2121              : 
    2122              : void
    2123            0 : FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
    2124              : {
    2125              :     int         returnCode;
    2126              : 
    2127              :     Assert(FileIsValid(file));
    2128              : 
    2129              :     DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
    2130              :                file, VfdCache[file].fileName,
    2131              :                (int64) offset, (int64) nbytes));
    2132              : 
    2133            0 :     if (nbytes <= 0)
    2134            0 :         return;
    2135              : 
    2136            0 :     if (VfdCache[file].fileFlags & PG_O_DIRECT)
    2137            0 :         return;
    2138              : 
    2139            0 :     returnCode = FileAccess(file);
    2140            0 :     if (returnCode < 0)
    2141            0 :         return;
    2142              : 
    2143            0 :     pgstat_report_wait_start(wait_event_info);
    2144            0 :     pg_flush_data(VfdCache[file].fd, offset, nbytes);
    2145            0 :     pgstat_report_wait_end();
    2146              : }
    2147              : 
    2148              : ssize_t
    2149       420091 : FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
    2150              :           uint32 wait_event_info)
    2151              : {
    2152              :     ssize_t     returnCode;
    2153              :     Vfd        *vfdP;
    2154              : 
    2155              :     Assert(FileIsValid(file));
    2156              : 
    2157              :     DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
    2158              :                file, VfdCache[file].fileName,
    2159              :                (int64) offset,
    2160              :                iovcnt));
    2161              : 
    2162       420091 :     returnCode = FileAccess(file);
    2163       420091 :     if (returnCode < 0)
    2164            0 :         return returnCode;
    2165              : 
    2166       420091 :     vfdP = &VfdCache[file];
    2167              : 
    2168       420091 : retry:
    2169       420091 :     pgstat_report_wait_start(wait_event_info);
    2170       420091 :     returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
    2171       420091 :     pgstat_report_wait_end();
    2172              : 
    2173       420091 :     if (returnCode < 0)
    2174              :     {
    2175              :         /*
    2176              :          * Windows may run out of kernel buffers and return "Insufficient
    2177              :          * system resources" error.  Wait a bit and retry to solve it.
    2178              :          *
    2179              :          * It is rumored that EINTR is also possible on some Unix filesystems,
    2180              :          * in which case immediate retry is indicated.
    2181              :          */
    2182              : #ifdef WIN32
    2183              :         DWORD       error = GetLastError();
    2184              : 
    2185              :         switch (error)
    2186              :         {
    2187              :             case ERROR_NO_SYSTEM_RESOURCES:
    2188              :                 pg_usleep(1000L);
    2189              :                 errno = EINTR;
    2190              :                 break;
    2191              :             default:
    2192              :                 _dosmaperr(error);
    2193              :                 break;
    2194              :         }
    2195              : #endif
    2196              :         /* OK to retry if interrupted */
    2197            0 :         if (errno == EINTR)
    2198            0 :             goto retry;
    2199              :     }
    2200              : 
    2201       420091 :     return returnCode;
    2202              : }
    2203              : 
    2204              : int
    2205      1378704 : FileStartReadV(PgAioHandle *ioh, File file,
    2206              :                int iovcnt, pgoff_t offset,
    2207              :                uint32 wait_event_info)
    2208              : {
    2209              :     int         returnCode;
    2210              :     Vfd        *vfdP;
    2211              : 
    2212              :     Assert(FileIsValid(file));
    2213              : 
    2214              :     DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
    2215              :                file, VfdCache[file].fileName,
    2216              :                (int64) offset,
    2217              :                iovcnt));
    2218              : 
    2219      1378704 :     returnCode = FileAccess(file);
    2220      1378704 :     if (returnCode < 0)
    2221            0 :         return returnCode;
    2222              : 
    2223      1378704 :     vfdP = &VfdCache[file];
    2224              : 
    2225      1378704 :     pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
    2226              : 
    2227      1378704 :     return 0;
    2228              : }
    2229              : 
    2230              : ssize_t
    2231       831411 : FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
    2232              :            uint32 wait_event_info)
    2233              : {
    2234              :     ssize_t     returnCode;
    2235              :     Vfd        *vfdP;
    2236              : 
    2237              :     Assert(FileIsValid(file));
    2238              : 
    2239              :     DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
    2240              :                file, VfdCache[file].fileName,
    2241              :                (int64) offset,
    2242              :                iovcnt));
    2243              : 
    2244       831411 :     returnCode = FileAccess(file);
    2245       831411 :     if (returnCode < 0)
    2246            0 :         return returnCode;
    2247              : 
    2248       831411 :     vfdP = &VfdCache[file];
    2249              : 
    2250              :     /*
    2251              :      * If enforcing temp_file_limit and it's a temp file, check to see if the
    2252              :      * write would overrun temp_file_limit, and throw error if so.  Note: it's
    2253              :      * really a modularity violation to throw error here; we should set errno
    2254              :      * and return -1.  However, there's no way to report a suitable error
    2255              :      * message if we do that.  All current callers would just throw error
    2256              :      * immediately anyway, so this is safe at present.
    2257              :      */
    2258       831411 :     if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
    2259              :     {
    2260            0 :         pgoff_t     past_write = offset;
    2261              : 
    2262            0 :         for (int i = 0; i < iovcnt; ++i)
    2263            0 :             past_write += iov[i].iov_len;
    2264              : 
    2265            0 :         if (past_write > vfdP->fileSize)
    2266              :         {
    2267            0 :             uint64      newTotal = temporary_files_size;
    2268              : 
    2269            0 :             newTotal += past_write - vfdP->fileSize;
    2270            0 :             if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
    2271            0 :                 ereport(ERROR,
    2272              :                         (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
    2273              :                          errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
    2274              :                                 temp_file_limit)));
    2275              :         }
    2276              :     }
    2277              : 
    2278       831411 : retry:
    2279       831411 :     pgstat_report_wait_start(wait_event_info);
    2280       831411 :     returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
    2281       831411 :     pgstat_report_wait_end();
    2282              : 
    2283       831411 :     if (returnCode >= 0)
    2284              :     {
    2285              :         /*
    2286              :          * Some callers expect short writes to set errno, and traditionally we
    2287              :          * have assumed that they imply disk space shortage.  We don't want to
    2288              :          * waste CPU cycles adding up the total size here, so we'll just set
    2289              :          * it for all successful writes in case such a caller determines that
    2290              :          * the write was short and ereports "%m".
    2291              :          */
    2292       831411 :         errno = ENOSPC;
    2293              : 
    2294              :         /*
    2295              :          * Maintain fileSize and temporary_files_size if it's a temp file.
    2296              :          */
    2297       831411 :         if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
    2298              :         {
    2299        71442 :             pgoff_t     past_write = offset + returnCode;
    2300              : 
    2301        71442 :             if (past_write > vfdP->fileSize)
    2302              :             {
    2303        49382 :                 temporary_files_size += past_write - vfdP->fileSize;
    2304        49382 :                 vfdP->fileSize = past_write;
    2305              :             }
    2306              :         }
    2307              :     }
    2308              :     else
    2309              :     {
    2310              :         /*
    2311              :          * See comments in FileReadV()
    2312              :          */
    2313              : #ifdef WIN32
    2314              :         DWORD       error = GetLastError();
    2315              : 
    2316              :         switch (error)
    2317              :         {
    2318              :             case ERROR_NO_SYSTEM_RESOURCES:
    2319              :                 pg_usleep(1000L);
    2320              :                 errno = EINTR;
    2321              :                 break;
    2322              :             default:
    2323              :                 _dosmaperr(error);
    2324              :                 break;
    2325              :         }
    2326              : #endif
    2327              :         /* OK to retry if interrupted */
    2328            0 :         if (errno == EINTR)
    2329            0 :             goto retry;
    2330              :     }
    2331              : 
    2332       831411 :     return returnCode;
    2333              : }
    2334              : 
    2335              : int
    2336          413 : FileSync(File file, uint32 wait_event_info)
    2337              : {
    2338              :     int         returnCode;
    2339              : 
    2340              :     Assert(FileIsValid(file));
    2341              : 
    2342              :     DO_DB(elog(LOG, "FileSync: %d (%s)",
    2343              :                file, VfdCache[file].fileName));
    2344              : 
    2345          413 :     returnCode = FileAccess(file);
    2346          413 :     if (returnCode < 0)
    2347            0 :         return returnCode;
    2348              : 
    2349          413 :     pgstat_report_wait_start(wait_event_info);
    2350          413 :     returnCode = pg_fsync(VfdCache[file].fd);
    2351          413 :     pgstat_report_wait_end();
    2352              : 
    2353          413 :     return returnCode;
    2354              : }
    2355              : 
    2356              : /*
    2357              :  * Zero a region of the file.
    2358              :  *
    2359              :  * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
    2360              :  * appropriate error.
                      :  *
                      :  * The region starts at 'offset' and is 'amount' bytes long; it is written
                      :  * with pg_pwrite_zeros(), bracketed by
                      :  * pgstat_report_wait_start(wait_event_info) / pgstat_report_wait_end().
                      :  *
                      :  * If FileAccess() fails, its negative result is handed back as-is.  A
                      :  * write that covers fewer than 'amount' bytes is treated as a failure as
                      :  * well; errno is set to ENOSPC in that case unless it is already nonzero.
    2361              :  */
    2362              : int
    2363       245333 : FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
    2364              : {
    2365              :     int         returnCode;
    2366              :     ssize_t     written;
    2367              : 
    2368              :     Assert(FileIsValid(file));
    2369              : 
    2370              :     DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
    2371              :                file, VfdCache[file].fileName,
    2372              :                (int64) offset, (int64) amount));
    2373              : 
    2374       245333 :     returnCode = FileAccess(file);
    2375       245333 :     if (returnCode < 0)
    2376            0 :         return returnCode;
    2377              : 
    2378       245333 :     pgstat_report_wait_start(wait_event_info);
    2379       245333 :     written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
    2380       245333 :     pgstat_report_wait_end();
    2381              : 
    2382       245333 :     if (written < 0)
    2383            0 :         return -1;
    2384       245333 :     else if (written != amount)
    2385              :     {
    2386              :         /* if errno is unset, assume problem is no disk space */
    2387            0 :         if (errno == 0)
    2388            0 :             errno = ENOSPC;
    2389            0 :         return -1;
    2390              :     }
    2391              : 
    2392       245333 :     return 0;
    2393              : }
    2394              : 
    2395              : /*
    2396              :  * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
    2397              :  * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
    2398              :  * use FileZero() instead.
    2399              :  *
    2400              :  * Note that at least glibc implements posix_fallocate() in userspace if not
    2401              :  * implemented by the filesystem. That's not the case for all environments
    2402              :  * though.
    2403              :  *
    2404              :  * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
    2405              :  * appropriate error.
                      :  *
                      :  * posix_fallocate() reports failure through its return value (an errno
                      :  * code), which is why the code below compares the result against EINTR /
                      :  * EINVAL / EOPNOTSUPP and copies it into errno by hand.  An EINTR result
                      :  * is simply retried.  When HAVE_POSIX_FALLOCATE is not defined, the whole
                      :  * function reduces to a call of FileZero().
    2406              :  */
    2407              : int
    2408          694 : FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
    2409              : {
    2410              : #ifdef HAVE_POSIX_FALLOCATE
    2411              :     int         returnCode;
    2412              : 
    2413              :     Assert(FileIsValid(file));
    2414              : 
    2415              :     DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
    2416              :                file, VfdCache[file].fileName,
    2417              :                (int64) offset, (int64) amount));
    2418              : 
    2419          694 :     returnCode = FileAccess(file);
    2420          694 :     if (returnCode < 0)
    2421            0 :         return -1;
    2422              : 
    2423          694 : retry:
    2424          694 :     pgstat_report_wait_start(wait_event_info);
    2425          694 :     returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
    2426          694 :     pgstat_report_wait_end();
    2427              : 
    2428          694 :     if (returnCode == 0)
    2429          694 :         return 0;
    2430            0 :     else if (returnCode == EINTR)
    2431            0 :         goto retry;
    2432              : 
    2433              :     /* for compatibility with %m printing etc */
    2434            0 :     errno = returnCode;
    2435              : 
    2436              :     /*
    2437              :      * Return in cases of a "real" failure, if fallocate is not supported,
    2438              :      * fall through to the FileZero() backed implementation.
    2439              :      */
    2440            0 :     if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
    2441            0 :         return -1;
    2442              : #endif
    2443              : 
    2444            0 :     return FileZero(file, offset, amount, wait_event_info);
    2445              : }
    2446              : 
    /*                :
                      :  * Return the size of the file, as reported by
                      :  * lseek(fd, 0, SEEK_END).
                      :  *
                      :  * If the VFD is not currently open (FileIsNotOpen()), FileAccess() is
                      :  * called first; should that fail, (pgoff_t) -1 is returned.  Otherwise
                      :  * the result is whatever lseek() returns.  Note that seeking to
                      :  * SEEK_END moves the descriptor's file position to end-of-file.
                      :  */
    2447              : pgoff_t
    2448      3457890 : FileSize(File file)
    2449              : {
    2450              :     Assert(FileIsValid(file));
    2451              : 
    2452              :     DO_DB(elog(LOG, "FileSize %d (%s)",
    2453              :                file, VfdCache[file].fileName));
    2454              : 
    2455      3457890 :     if (FileIsNotOpen(file))
    2456              :     {
    2457           29 :         if (FileAccess(file) < 0)
    2458            0 :             return (pgoff_t) -1;
    2459              :     }
    2460              : 
    2461      3457890 :     return lseek(VfdCache[file].fd, 0, SEEK_END);
    2462              : }
    2463              : 
    /*                :
                      :  * Truncate the file to 'offset' bytes using pg_ftruncate().
                      :  *
                      :  * Returns FileAccess()'s negative result if the file cannot be accessed,
                      :  * otherwise the return code of pg_ftruncate().  The truncation itself is
                      :  * bracketed by pgstat_report_wait_start(wait_event_info) /
                      :  * pgstat_report_wait_end().
                      :  *
                      :  * When the truncation succeeds and the VFD's recorded fileSize is larger
                      :  * than 'offset', temporary_files_size is reduced by the difference and
                      :  * fileSize is lowered to 'offset'.
                      :  */
    2464              : int
    2465          672 : FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
    2466              : {
    2467              :     int         returnCode;
    2468              : 
    2469              :     Assert(FileIsValid(file));
    2470              : 
    2471              :     DO_DB(elog(LOG, "FileTruncate %d (%s)",
    2472              :                file, VfdCache[file].fileName));
    2473              : 
    2474          672 :     returnCode = FileAccess(file);
    2475          672 :     if (returnCode < 0)
    2476            0 :         return returnCode;
    2477              : 
    2478          672 :     pgstat_report_wait_start(wait_event_info);
    2479          672 :     returnCode = pg_ftruncate(VfdCache[file].fd, offset);
    2480          672 :     pgstat_report_wait_end();
    2481              : 
    2482          672 :     if (returnCode == 0 && VfdCache[file].fileSize > offset)
    2483              :     {
    2484              :         /* adjust our state for truncation of a temp file; the Assert
                      :          * below expects such a VFD to carry FD_TEMP_FILE_LIMIT */
    2485              :         Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
    2486            0 :         temporary_files_size -= VfdCache[file].fileSize - offset;
    2487            0 :         VfdCache[file].fileSize = offset;
    2488              :     }
    2489              : 
    2490          672 :     return returnCode;
    2491              : }
    2492              : 
    2493              : /*
    2494              :  * Return the pathname associated with an open file.
    2495              :  *
    2496              :  * The returned string points to an internal buffer, which is valid until
    2497              :  * the file is closed.
                      :  *
                      :  * The pointer is the VfdCache entry's fileName field itself, not a copy,
                      :  * and no FileAccess() call is made here.
    2498              :  */
    2499              : char *
    2500           32 : FilePathName(File file)
    2501              : {
    2502              :     Assert(FileIsValid(file));
    2503              : 
    2504           32 :     return VfdCache[file].fileName;
    2505              : }
    2506              : 
    2507              : /*
    2508              :  * Return the raw file descriptor of an opened file.
    2509              :  *
    2510              :  * The returned file descriptor will be valid until the file is closed, but
    2511              :  * there are a lot of things that can make that happen.  So the caller should
    2512              :  * be careful not to do much of anything else before it finishes using the
    2513              :  * returned file descriptor.
                      :  *
                      :  * FileAccess() is called first; if it fails, its negative result is
                      :  * returned in place of a descriptor, so callers must check for < 0.
                      :  *
                      :  * NOTE(review): unlike the neighbouring functions, the FileIsValid()
                      :  * assertion here runs only after FileAccess() has already been handed
                      :  * 'file' -- confirm whether that ordering is intentional.
    2514              :  */
    2515              : int
    2516       506398 : FileGetRawDesc(File file)
    2517              : {
    2518              :     int         returnCode;
    2519              : 
    2520       506398 :     returnCode = FileAccess(file);
    2521       506398 :     if (returnCode < 0)
    2522            0 :         return returnCode;
    2523              : 
    2524              :     Assert(FileIsValid(file));
    2525       506398 :     return VfdCache[file].fd;
    2526              : }
    2527              : 
    2528              : /*
    2529              :  * FileGetRawFlags - returns the file flags on open(2)
                      :  *
                      :  * The value is read from the VFD's fileFlags field; the file itself is
                      :  * not accessed or reopened by this call.
    2530              :  */
    2531              : int
    2532            0 : FileGetRawFlags(File file)
    2533              : {
    2534              :     Assert(FileIsValid(file));
    2535            0 :     return VfdCache[file].fileFlags;
    2536              : }
    2537              : 
    2538              : /*
    2539              :  * FileGetRawMode - returns the mode bitmask passed to open(2)
                      :  *
                      :  * The value is read from the VFD's fileMode field; the file itself is
                      :  * not accessed or reopened by this call.
    2540              :  */
    2541              : mode_t
    2542            0 : FileGetRawMode(File file)
    2543              : {
    2544              :     Assert(FileIsValid(file));
    2545            0 :     return VfdCache[file].fileMode;
    2546              : }
    2547              : 
    2548              : /*
    2549              :  * Make room for another allocatedDescs[] array entry if needed and possible.
    2550              :  * Returns true if an array element is available.
                      :  *
                      :  * Three cases are handled, in this order:
                      :  *   - a free slot already exists: return true immediately;
                      :  *   - the array does not exist yet: malloc() FD_MINFREE / 3 entries,
                      :  *     raising ERROR if that allocation fails;
                      :  *   - the array is full: try to realloc() it to max_safe_fds / 3 entries,
                      :  *     returning false if that is not larger than the current size or if
                      :  *     realloc() fails.
                      :  *
                      :  * This function only guarantees capacity; it does not change
                      :  * numAllocatedDescs.
    2551              :  */
    2552              : static bool
    2553      7936490 : reserveAllocatedDesc(void)
    2554              : {
    2555              :     AllocateDesc *newDescs;
    2556              :     int         newMax;
    2557              : 
    2558              :     /* Quick out if array already has a free slot. */
    2559      7936490 :     if (numAllocatedDescs < maxAllocatedDescs)
    2560      7935292 :         return true;
    2561              : 
    2562              :     /*
    2563              :      * If the array hasn't yet been created in the current process, initialize
    2564              :      * it with FD_MINFREE / 3 elements.  In many scenarios this is as many as
    2565              :      * we will ever need, anyway.  We don't want to look at max_safe_fds
    2566              :      * immediately because set_max_safe_fds() may not have run yet.
    2567              :      */
    2568         1198 :     if (allocatedDescs == NULL)
    2569              :     {
    2570         1198 :         newMax = FD_MINFREE / 3;
    2571         1198 :         newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
    2572              :         /* Out of memory already?  Treat as fatal error. */
    2573         1198 :         if (newDescs == NULL)
    2574            0 :             ereport(ERROR,
    2575              :                     (errcode(ERRCODE_OUT_OF_MEMORY),
    2576              :                      errmsg("out of memory")));
    2577         1198 :         allocatedDescs = newDescs;
    2578         1198 :         maxAllocatedDescs = newMax;
    2579         1198 :         return true;
    2580              :     }
    2581              : 
    2582              :     /*
    2583              :      * Consider enlarging the array beyond the initial allocation used above.
    2584              :      * By the time this happens, max_safe_fds should be known accurately.
    2585              :      *
    2586              :      * We mustn't let allocated descriptors hog all the available FDs, and in
    2587              :      * practice we'd better leave a reasonable number of FDs for VFD use.  So
    2588              :      * set the maximum to max_safe_fds / 3.  (This should certainly be at
    2589              :      * least as large as the initial size, FD_MINFREE / 3, so we aren't
    2590              :      * tightening the restriction here.)  Recall that "external" FDs are
    2591              :      * allowed to consume another third of max_safe_fds.
    2592              :      */
    2593            0 :     newMax = max_safe_fds / 3;
    2594            0 :     if (newMax > maxAllocatedDescs)
    2595              :     {
    2596            0 :         newDescs = (AllocateDesc *) realloc(allocatedDescs,
    2597              :                                             newMax * sizeof(AllocateDesc));
    2598              :         /* Treat out-of-memory as a non-fatal error; the old array is kept. */
    2599            0 :         if (newDescs == NULL)
    2600            0 :             return false;
    2601            0 :         allocatedDescs = newDescs;
    2602            0 :         maxAllocatedDescs = newMax;
    2603            0 :         return true;
    2604              :     }
    2605              : 
    2606              :     /* Can't enlarge allocatedDescs[] any more. */
    2607            0 :     return false;
    2608              : }
    2609              : 
    2610              : /*
    2611              :  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
    2612              :  * rather than plain fopen().  This lets fd.c deal with freeing FDs if
    2613              :  * necessary to open the file.  When done, call FreeFile rather than fclose.
    2614              :  *
    2615              :  * Note that files that will be open for any significant length of time
    2616              :  * should NOT be handled this way, since they cannot share kernel file
    2617              :  * descriptors with other files; there is grave risk of running out of FDs
    2618              :  * if anyone locks down too many FDs.  Most callers of this routine are
    2619              :  * simply reading a config file that they will read and close immediately.
    2620              :  *
    2621              :  * fd.c will automatically close all files opened with AllocateFile at
    2622              :  * transaction commit or abort; this prevents FD leakage if a routine
    2623              :  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
    2624              :  *
    2625              :  * Ideally this should be the *only* direct call of fopen() in the backend.
                      :  *
                      :  * Returns the FILE pointer on success, after recording it in
                      :  * allocatedDescs[] together with the current subtransaction ID.  Returns
                      :  * NULL if fopen() fails.  An EMFILE / ENFILE failure is logged and the
                      :  * fopen() is retried for as long as ReleaseLruFile() returns true; once
                      :  * it returns false, the errno of the failed fopen() is restored before
                      :  * returning.  Raises ERROR if no allocatedDescs[] slot can be reserved.
    2626              :  */
    2627              : FILE *
    2628        99141 : AllocateFile(const char *name, const char *mode)
    2629              : {
    2630              :     FILE       *file;
    2631              : 
    2632              :     DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
    2633              :                numAllocatedDescs, name));
    2634              : 
    2635              :     /* Can we allocate another non-virtual FD? */
    2636        99141 :     if (!reserveAllocatedDesc())
    2637            0 :         ereport(ERROR,
    2638              :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2639              :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
    2640              :                         maxAllocatedDescs, name)));
    2641              : 
    2642              :     /* Close excess kernel FDs. */
    2643        99141 :     ReleaseLruFiles();
    2644              : 
    2645        99141 : TryAgain:
    2646        99141 :     if ((file = fopen(name, mode)) != NULL)
    2647              :     {
    2648        91437 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2649              : 
    2650        91437 :         desc->kind = AllocateDescFile;
    2651        91437 :         desc->desc.file = file;
    2652        91437 :         desc->create_subid = GetCurrentSubTransactionId();
    2653        91437 :         numAllocatedDescs++;
    2654        91437 :         return desc->desc.file;
    2655              :     }
    2656              : 
    2657         7704 :     if (errno == EMFILE || errno == ENFILE)
    2658              :     {
    2659            0 :         int         save_errno = errno;
    2660              : 
    2661            0 :         ereport(LOG,
    2662              :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2663              :                  errmsg("out of file descriptors: %m; release and retry")));
    2664            0 :         errno = 0;
    2665            0 :         if (ReleaseLruFile())
    2666            0 :             goto TryAgain;
    2667            0 :         errno = save_errno;
    2668              :     }
    2669              : 
    2670         7704 :     return NULL;
    2671              : }
    2672              : 
    2673              : /*
    2674              :  * Open a file with OpenTransientFilePerm() and pass default file mode for
    2675              :  * the fileMode parameter.
                      :  *
                      :  * The default mode is the current value of pg_file_create_mode; the
                      :  * result is whatever OpenTransientFilePerm() returns.
    2676              :  */
    2677              : int
    2678      7786024 : OpenTransientFile(const char *fileName, int fileFlags)
    2679              : {
    2680      7786024 :     return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
    2681              : }
    2682              : 
    2683              : /*
    2684              :  * Like AllocateFile, but returns an unbuffered fd like open(2)
                      :  *
                      :  * The file is opened with BasicOpenFilePerm().  On success the fd
                      :  * (>= 0) is recorded in allocatedDescs[] with the current subtransaction
                      :  * ID and returned; if the open fails, -1 is returned and nothing is
                      :  * recorded.  Raises ERROR if no allocatedDescs[] slot can be reserved.
    2685              :  */
    2686              : int
    2687      7786032 : OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
    2688              : {
    2689              :     int         fd;
    2690              : 
    2691              :     DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
    2692              :                numAllocatedDescs, fileName));
    2693              : 
    2694              :     /* Can we allocate another non-virtual FD? */
    2695      7786032 :     if (!reserveAllocatedDesc())
    2696            0 :         ereport(ERROR,
    2697              :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2698              :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
    2699              :                         maxAllocatedDescs, fileName)));
    2700              : 
    2701              :     /* Close excess kernel FDs. */
    2702      7786032 :     ReleaseLruFiles();
    2703              : 
    2704      7786032 :     fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
    2705              : 
    2706      7786032 :     if (fd >= 0)
    2707              :     {
    2708      7780914 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2709              : 
    2710      7780914 :         desc->kind = AllocateDescRawFD;
    2711      7780914 :         desc->desc.fd = fd;
    2712      7780914 :         desc->create_subid = GetCurrentSubTransactionId();
    2713      7780914 :         numAllocatedDescs++;
    2714              : 
    2715      7780914 :         return fd;
    2716              :     }
    2717              : 
    2718         5118 :     return -1;                  /* failure */
    2719              : }
    2720              : 
    2721              : /*
    2722              :  * Routines that want to initiate a pipe stream should use OpenPipeStream
    2723              :  * rather than plain popen().  This lets fd.c deal with freeing FDs if
    2724              :  * necessary.  When done, call ClosePipeStream rather than pclose.
    2725              :  *
    2726              :  * This function also ensures that the popen'd program is run with default
    2727              :  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
    2728              :  * uses.  This ensures desirable response to, eg, closing a read pipe early.
                      :  *
                      :  * Returns the stream on success, after recording it in allocatedDescs[];
                      :  * returns NULL if popen() fails.  popen()'s errno is saved before SIGPIPE
                      :  * is set back to SIG_IGN and restored afterwards, so the pqsignal() call
                      :  * cannot disturb it.  An EMFILE / ENFILE failure is logged and the whole
                      :  * sequence is retried for as long as ReleaseLruFile() returns true.
                      :  * Raises ERROR if no allocatedDescs[] slot can be reserved.
                      :  *
                      :  * fflush(NULL) is issued before every popen() attempt (presumably so that
                      :  * pending stdio output is not duplicated by the child -- TODO confirm).
    2729              :  */
    2730              : FILE *
    2731           69 : OpenPipeStream(const char *command, const char *mode)
    2732              : {
    2733              :     FILE       *file;
    2734              :     int         save_errno;
    2735              : 
    2736              :     DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
    2737              :                numAllocatedDescs, command));
    2738              : 
    2739              :     /* Can we allocate another non-virtual FD? */
    2740           69 :     if (!reserveAllocatedDesc())
    2741            0 :         ereport(ERROR,
    2742              :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2743              :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
    2744              :                         maxAllocatedDescs, command)));
    2745              : 
    2746              :     /* Close excess kernel FDs. */
    2747           69 :     ReleaseLruFiles();
    2748              : 
    2749           69 : TryAgain:
    2750           69 :     fflush(NULL);
    2751           69 :     pqsignal(SIGPIPE, SIG_DFL);
    2752           69 :     errno = 0;
    2753           69 :     file = popen(command, mode);
    2754           69 :     save_errno = errno;
    2755           69 :     pqsignal(SIGPIPE, SIG_IGN);
    2756           69 :     errno = save_errno;
    2757           69 :     if (file != NULL)
    2758              :     {
    2759           69 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2760              : 
    2761           69 :         desc->kind = AllocateDescPipe;
    2762           69 :         desc->desc.file = file;
    2763           69 :         desc->create_subid = GetCurrentSubTransactionId();
    2764           69 :         numAllocatedDescs++;
    2765           69 :         return desc->desc.file;
    2766              :     }
    2767              : 
    2768            0 :     if (errno == EMFILE || errno == ENFILE)
    2769              :     {
    2770            0 :         ereport(LOG,
    2771              :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2772              :                  errmsg("out of file descriptors: %m; release and retry")));
    2773            0 :         if (ReleaseLruFile())
    2774            0 :             goto TryAgain;
    2775            0 :         errno = save_errno;
    2776              :     }
    2777              : 
    2778            0 :     return NULL;
    2779              : }
    2780              : 
    2781              : /*
    2782              :  * Free an AllocateDesc of any type.
    2783              :  *
    2784              :  * The argument *must* point into the allocatedDescs[] array.
                      :  *
                      :  * The underlying object is closed with the function matching its kind
                      :  * (fclose, pclose, closedir, or close preceded by pgaio_closing_fd), and
                      :  * that function's result is returned.  The freed slot is then overwritten
                      :  * with the last in-use entry of the array, so the order of
                      :  * allocatedDescs[] is not preserved and 'desc' afterwards refers to a
                      :  * different descriptor (or to the now-unused last slot).
    2785              :  */
    2786              : static int
    2787      7922691 : FreeDesc(AllocateDesc *desc)
    2788              : {
    2789              :     int         result;
    2790              : 
    2791              :     /* Close the underlying object */
    2792      7922691 :     switch (desc->kind)
    2793              :     {
    2794        91440 :         case AllocateDescFile:
    2795        91440 :             result = fclose(desc->desc.file);
    2796        91440 :             break;
    2797           69 :         case AllocateDescPipe:
    2798           69 :             result = pclose(desc->desc.file);
    2799           69 :             break;
    2800        50266 :         case AllocateDescDir:
    2801        50266 :             result = closedir(desc->desc.dir);
    2802        50266 :             break;
    2803      7780916 :         case AllocateDescRawFD:
    2804      7780916 :             pgaio_closing_fd(desc->desc.fd);
    2805      7780916 :             result = close(desc->desc.fd);
    2806      7780916 :             break;
    2807            0 :         default:
    2808            0 :             elog(ERROR, "AllocateDesc kind not recognized");
    2809              :             result = 0;         /* keep compiler quiet */
    2810              :             break;
    2811              :     }
    2812              : 
    2813              :     /* Compact storage in the allocatedDescs array */
    2814      7922691 :     numAllocatedDescs--;
    2815      7922691 :     *desc = allocatedDescs[numAllocatedDescs];
    2816              : 
    2817      7922691 :     return result;
    2818              : }
    2819              : 
    2820              : /*
    2821              :  * Close a file returned by AllocateFile.
    2822              :  *
    2823              :  * Note we do not check fclose's return value --- it is up to the caller
    2824              :  * to handle close errors.
                      :  *
                      :  * allocatedDescs[] is searched from the most recently added entry
                      :  * backwards for an AllocateDescFile entry holding 'file'; the match is
                      :  * released through FreeDesc() and its result returned.  If no entry
                      :  * matches, a WARNING is emitted and the file is fclose()d directly.
    2825              :  */
    2826              : int
    2827        91420 : FreeFile(FILE *file)
    2828              : {
    2829              :     int         i;
    2830              : 
    2831              :     DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
    2832              : 
    2833              :     /* Remove file from list of allocated files, if it's present */
    2834        91423 :     for (i = numAllocatedDescs; --i >= 0;)
    2835              :     {
    2836        91423 :         AllocateDesc *desc = &allocatedDescs[i];
    2837              : 
    2838        91423 :         if (desc->kind == AllocateDescFile && desc->desc.file == file)
    2839        91420 :             return FreeDesc(desc);
    2840              :     }
    2841              : 
    2842              :     /* Only get here if someone passes us a file not in allocatedDescs */
    2843            0 :     elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
    2844              : 
    2845            0 :     return fclose(file);
    2846              : }
    2847              : 
    2848              : /*
    2849              :  * Close a file returned by OpenTransientFile.
    2850              :  *
    2851              :  * Note we do not check close's return value --- it is up to the caller
    2852              :  * to handle close errors.
                      :  *
                      :  * allocatedDescs[] is searched from the most recently added entry
                      :  * backwards for an AllocateDescRawFD entry holding 'fd'; the match is
                      :  * released through FreeDesc() and its result returned.  If no entry
                      :  * matches, a WARNING is emitted and the fd is closed directly, with
                      :  * pgaio_closing_fd() called first just as FreeDesc() does for raw FDs.
    2853              :  */
    2854              : int
    2855      7780915 : CloseTransientFile(int fd)
    2856              : {
    2857              :     int         i;
    2858              : 
    2859              :     DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
    2860              : 
    2861              :     /* Remove fd from list of allocated files, if it's present */
    2862      7780925 :     for (i = numAllocatedDescs; --i >= 0;)
    2863              :     {
    2864      7780925 :         AllocateDesc *desc = &allocatedDescs[i];
    2865              : 
    2866      7780925 :         if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
    2867      7780915 :             return FreeDesc(desc);
    2868              :     }
    2869              : 
    2870              :     /* Only get here if someone passes us a file not in allocatedDescs */
    2871            0 :     elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
    2872              : 
    2873            0 :     pgaio_closing_fd(fd);
    2874              : 
    2875            0 :     return close(fd);
    2876              : }
    2877              : 
    2878              : /*
    2879              :  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
    2880              :  * rather than plain opendir().  This lets fd.c deal with freeing FDs if
    2881              :  * necessary to open the directory, and with closing it after an elog.
    2882              :  * When done, call FreeDir rather than closedir.
    2883              :  *
    2884              :  * Returns NULL, with errno set, on failure.  Note that failure detection
    2885              :  * is commonly left to the following call of ReadDir or ReadDirExtended;
    2886              :  * see the comments for ReadDir.
    2887              :  *
    2888              :  * Ideally this should be the *only* direct call of opendir() in the backend.
                      :  *
                      :  * On success the DIR pointer is recorded in allocatedDescs[] together
                      :  * with the current subtransaction ID.  An EMFILE / ENFILE failure is
                      :  * logged and opendir() is retried for as long as ReleaseLruFile() returns
                      :  * true; once it returns false, the errno of the failed opendir() is
                      :  * restored before returning NULL.  Raises ERROR if no allocatedDescs[]
                      :  * slot can be reserved.
    2889              :  */
    2890              : DIR *
    2891        51248 : AllocateDir(const char *dirname)
    2892              : {
    2893              :     DIR        *dir;
    2894              : 
    2895              :     DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
    2896              :                numAllocatedDescs, dirname));
    2897              : 
    2898              :     /* Can we allocate another non-virtual FD? */
    2899        51248 :     if (!reserveAllocatedDesc())
    2900            0 :         ereport(ERROR,
    2901              :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2902              :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
    2903              :                         maxAllocatedDescs, dirname)));
    2904              : 
    2905              :     /* Close excess kernel FDs. */
    2906        51248 :     ReleaseLruFiles();
    2907              : 
    2908        51248 : TryAgain:
    2909        51248 :     if ((dir = opendir(dirname)) != NULL)
    2910              :     {
    2911        50266 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2912              : 
    2913        50266 :         desc->kind = AllocateDescDir;
    2914        50266 :         desc->desc.dir = dir;
    2915        50266 :         desc->create_subid = GetCurrentSubTransactionId();
    2916        50266 :         numAllocatedDescs++;
    2917        50266 :         return desc->desc.dir;
    2918              :     }
    2919              : 
    2920          982 :     if (errno == EMFILE || errno == ENFILE)
    2921              :     {
    2922            0 :         int         save_errno = errno;
    2923              : 
    2924            0 :         ereport(LOG,
    2925              :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2926              :                  errmsg("out of file descriptors: %m; release and retry")));
    2927            0 :         errno = 0;
    2928            0 :         if (ReleaseLruFile())
    2929            0 :             goto TryAgain;
    2930            0 :         errno = save_errno;
    2931              :     }
    2932              : 
    2933          982 :     return NULL;
    2934              : }
    2935              : 
    2936              : /*
    2937              :  * Read a directory opened with AllocateDir, ereport'ing any error.
    2938              :  *
    2939              :  * This is easier to use than raw readdir() since it takes care of some
    2940              :  * otherwise rather tedious and error-prone manipulation of errno.  Also,
    2941              :  * if you are happy with a generic error message for AllocateDir failure,
    2942              :  * you can just do
    2943              :  *
    2944              :  *      dir = AllocateDir(path);
    2945              :  *      while ((dirent = ReadDir(dir, path)) != NULL)
    2946              :  *          process dirent;
    2947              :  *      FreeDir(dir);
    2948              :  *
    2949              :  * since a NULL dir parameter is taken as indicating AllocateDir failed.
    2950              :  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
    2951              :  * use this shortcut.)
    2952              :  *
    2953              :  * The pathname passed to AllocateDir must be passed to this routine too,
    2954              :  * but it is only used for error reporting.
                      :  *
                      :  * This is exactly ReadDirExtended(dir, dirname, ERROR): the next entry is
                      :  * returned, or NULL once readdir() reports no further entries.
    2955              :  */
    2956              : struct dirent *
    2957      2545582 : ReadDir(DIR *dir, const char *dirname)
    2958              : {
    2959      2545582 :     return ReadDirExtended(dir, dirname, ERROR);
    2960              : }
    2961              : 
    2962              : /*
    2963              :  * Alternate version of ReadDir that allows caller to specify the elevel
    2964              :  * for any error report (whether it's reporting an initial failure of
    2965              :  * AllocateDir or a subsequent directory read failure).
    2966              :  *
    2967              :  * If elevel < ERROR, returns NULL after any error.  With the normal coding
    2968              :  * pattern, this will result in falling out of the loop immediately as
    2969              :  * though the directory contained no (more) entries.
                      :  *
                      :  * errno is cleared before calling readdir() so that a NULL result can be
                      :  * classified afterwards: errno still zero means the end of the directory
                      :  * was reached (no report), nonzero means the read failed and is reported
                      :  * at 'elevel'.  For dir == NULL the report relies on errno still holding
                      :  * the value left by the failed AllocateDir().
    2970              :  */
    2971              : struct dirent *
    2972      4500446 : ReadDirExtended(DIR *dir, const char *dirname, int elevel)
    2973              : {
    2974              :     struct dirent *dent;
    2975              : 
    2976              :     /* Give a generic message for AllocateDir failure, if caller didn't */
    2977      4500446 :     if (dir == NULL)
    2978              :     {
    2979            4 :         ereport(elevel,
    2980              :                 (errcode_for_file_access(),
    2981              :                  errmsg("could not open directory \"%s\": %m",
    2982              :                         dirname)));
    2983            0 :         return NULL;
    2984              :     }
    2985              : 
    2986      4500442 :     errno = 0;
    2987      4500442 :     if ((dent = readdir(dir)) != NULL)
    2988      4464025 :         return dent;
    2989              : 
    2990        36417 :     if (errno)
    2991            0 :         ereport(elevel,
    2992              :                 (errcode_for_file_access(),
    2993              :                  errmsg("could not read directory \"%s\": %m",
    2994              :                         dirname)));
    2995        36417 :     return NULL;
    2996              : }
    2997              : 
    2998              : /*
    2999              :  * Close a directory opened with AllocateDir.
    3000              :  *
    3001              :  * Returns closedir's return value (with errno set if it's not 0).
    3002              :  * Note we do not check the return value --- it is up to the caller
    3003              :  * to handle close errors if wanted.
    3004              :  *
    3005              :  * Does nothing if dir == NULL; we assume that directory open failure was
    3006              :  * already reported if desired.
    3007              :  */
    3008              : int
    3009        50133 : FreeDir(DIR *dir)
    3010              : {
    3011              :     int         i;
    3012              : 
    3013              :     /* Nothing to do if AllocateDir failed */
    3014        50133 :     if (dir == NULL)
    3015            0 :         return 0;
    3016              : 
    3017              :     DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
    3018              : 
    3019              :     /* Remove dir from list of allocated dirs, if it's present */
    3020        50133 :     for (i = numAllocatedDescs; --i >= 0;)
    3021              :     {
    3022        50133 :         AllocateDesc *desc = &allocatedDescs[i];
    3023              : 
    3024        50133 :         if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
    3025        50133 :             return FreeDesc(desc);
    3026              :     }
    3027              : 
    3028              :     /* Only get here if someone passes us a dir not in allocatedDescs */
    3029            0 :     elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
    3030              : 
    3031            0 :     return closedir(dir);
    3032              : }
    3033              : 
    3034              : 
    3035              : /*
    3036              :  * Close a pipe stream returned by OpenPipeStream.
    3037              :  */
    3038              : int
    3039           69 : ClosePipeStream(FILE *file)
    3040              : {
    3041              :     int         i;
    3042              : 
    3043              :     DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
    3044              : 
    3045              :     /* Remove file from list of allocated files, if it's present */
    3046           69 :     for (i = numAllocatedDescs; --i >= 0;)
    3047              :     {
    3048           69 :         AllocateDesc *desc = &allocatedDescs[i];
    3049              : 
    3050           69 :         if (desc->kind == AllocateDescPipe && desc->desc.file == file)
    3051           69 :             return FreeDesc(desc);
    3052              :     }
    3053              : 
    3054              :     /* Only get here if someone passes us a file not in allocatedDescs */
    3055            0 :     elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
    3056              : 
    3057            0 :     return pclose(file);
    3058              : }
    3059              : 
    3060              : /*
    3061              :  * closeAllVfds
    3062              :  *
    3063              :  * Force all VFDs into the physically-closed state, so that the fewest
    3064              :  * possible number of kernel file descriptors are in use.  There is no
    3065              :  * change in the logical state of the VFDs.
    3066              :  */
    3067              : void
    3068           43 : closeAllVfds(void)
    3069              : {
    3070              :     Index       i;
    3071              : 
    3072           43 :     if (SizeVfdCache > 0)
    3073              :     {
    3074              :         Assert(FileIsNotOpen(0));   /* Make sure ring not corrupted */
    3075         1376 :         for (i = 1; i < SizeVfdCache; i++)
    3076              :         {
    3077         1333 :             if (!FileIsNotOpen(i))
    3078          148 :                 LruDelete(i);
    3079              :         }
    3080              :     }
    3081           43 : }
    3082              : 
    3083              : 
    3084              : /*
    3085              :  * SetTempTablespaces
    3086              :  *
    3087              :  * Define a list (actually an array) of OIDs of tablespaces to use for
    3088              :  * temporary files.  This list will be used until end of transaction,
    3089              :  * unless this function is called again before then.  It is the caller's
    3090              :  * responsibility to ensure the passed-in array has adequate lifespan (typically
    3091              :  * it'd be allocated in TopTransactionContext).
    3092              :  *
    3093              :  * Some entries of the array may be InvalidOid, indicating that the current
    3094              :  * database's default tablespace should be used.
    3095              :  */
    3096              : void
    3097         4078 : SetTempTablespaces(Oid *tableSpaces, int numSpaces)
    3098              : {
    3099              :     Assert(numSpaces >= 0);
    3100         4078 :     tempTableSpaces = tableSpaces;
    3101         4078 :     numTempTableSpaces = numSpaces;
    3102              : 
    3103              :     /*
    3104              :      * Select a random starting point in the list.  This is to minimize
    3105              :      * conflicts between backends that are most likely sharing the same list
    3106              :      * of temp tablespaces.  Note that if we create multiple temp files in the
    3107              :      * same transaction, we'll advance circularly through the list --- this
    3108              :      * ensures that large temporary sort files are nicely spread across all
    3109              :      * available tablespaces.
    3110              :      */
    3111         4078 :     if (numSpaces > 1)
    3112            0 :         nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state,
    3113            0 :                                                   0, numSpaces - 1);
    3114              :     else
    3115         4078 :         nextTempTableSpace = 0;
    3116         4078 : }
    3117              : 
    3118              : /*
    3119              :  * TempTablespacesAreSet
    3120              :  *
    3121              :  * Returns true if SetTempTablespaces has been called in the current
    3122              :  * transaction.  (This is just so that tablespaces.c doesn't need its own
    3123              :  * per-transaction state.)
    3124              :  */
    3125              : bool
    3126         6114 : TempTablespacesAreSet(void)
    3127              : {
    3128         6114 :     return (numTempTableSpaces >= 0);
    3129              : }
    3130              : 
    3131              : /*
    3132              :  * GetTempTablespaces
    3133              :  *
    3134              :  * Populate an array with the OIDs of the tablespaces that should be used for
    3135              :  * temporary files.  (Some entries may be InvalidOid, indicating that the
    3136              :  * current database's default tablespace should be used.)  At most numSpaces
    3137              :  * entries will be filled.
    3138              :  * Returns the number of OIDs that were copied into the output array.
    3139              :  */
    3140              : int
    3141          277 : GetTempTablespaces(Oid *tableSpaces, int numSpaces)
    3142              : {
    3143              :     int         i;
    3144              : 
    3145              :     Assert(TempTablespacesAreSet());
    3146          277 :     for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
    3147            0 :         tableSpaces[i] = tempTableSpaces[i];
    3148              : 
    3149          277 :     return i;
    3150              : }
    3151              : 
    3152              : /*
    3153              :  * GetNextTempTableSpace
    3154              :  *
    3155              :  * Select the next temp tablespace to use.  A result of InvalidOid means
    3156              :  * to use the current database's default tablespace.
    3157              :  */
    3158              : Oid
    3159         3023 : GetNextTempTableSpace(void)
    3160              : {
    3161         3023 :     if (numTempTableSpaces > 0)
    3162              :     {
    3163              :         /* Advance nextTempTableSpace counter with wraparound */
    3164            1 :         if (++nextTempTableSpace >= numTempTableSpaces)
    3165            1 :             nextTempTableSpace = 0;
    3166            1 :         return tempTableSpaces[nextTempTableSpace];
    3167              :     }
    3168         3022 :     return InvalidOid;
    3169              : }
    3170              : 
    3171              : 
    3172              : /*
    3173              :  * AtEOSubXact_Files
    3174              :  *
    3175              :  * Take care of subtransaction commit/abort.  At abort, we close AllocateDescs
    3176              :  * that the subtransaction may have opened.  At commit, we reassign them to
    3177              :  * the parent subtransaction.  (Temporary files are tracked by ResourceOwners
    3178              :  * instead.)
    3179              :  */
    3180              : void
    3181        11803 : AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
    3182              :                   SubTransactionId parentSubid)
    3183              : {
    3184              :     Index       i;
    3185              : 
    3186        11803 :     for (i = 0; i < numAllocatedDescs; i++)
    3187              :     {
    3188            0 :         if (allocatedDescs[i].create_subid == mySubid)
    3189              :         {
    3190            0 :             if (isCommit)
    3191            0 :                 allocatedDescs[i].create_subid = parentSubid;
    3192              :             else
    3193              :             {
    3194              :                 /* have to recheck the item after FreeDesc (ugly) */
    3195            0 :                 FreeDesc(&allocatedDescs[i--]);
    3196              :             }
    3197              :         }
    3198              :     }
    3199        11803 : }
    3200              : 
    3201              : /*
    3202              :  * AtEOXact_Files
    3203              :  *
    3204              :  * This routine is called during transaction commit or abort.  All still-open
    3205              :  * per-transaction temporary file VFDs are closed, which also causes the
    3206              :  * underlying files to be deleted (although they should've been closed already
    3207              :  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files, dirs
    3208              :  * and fds are closed. We also forget any transaction-local temp tablespace list.
    3209              :  *
    3210              :  * The isCommit flag is used only to decide whether to emit warnings about
    3211              :  * unclosed files.
    3212              :  */
    3213              : void
    3214       622864 : AtEOXact_Files(bool isCommit)
    3215              : {
    3216       622864 :     CleanupTempFiles(isCommit, false);
    3217       622864 :     tempTableSpaces = NULL;
    3218       622864 :     numTempTableSpaces = -1;
    3219       622864 : }
    3220              : 
    3221              : /*
    3222              :  * BeforeShmemExit_Files
    3223              :  *
    3224              :  * before_shmem_exit hook to clean up temp files during backend shutdown.
    3225              :  * Here, we want to clean up *all* temp files including interXact ones.
    3226              :  */
    3227              : static void
    3228        24252 : BeforeShmemExit_Files(int code, Datum arg)
    3229              : {
    3230        24252 :     CleanupTempFiles(false, true);
    3231              : 
    3232              :     /* prevent further temp files from being created */
    3233              : #ifdef USE_ASSERT_CHECKING
    3234              :     temporary_files_allowed = false;
    3235              : #endif
    3236        24252 : }
    3237              : 
    3238              : /*
    3239              :  * Close temporary files and delete their underlying files.
    3240              :  *
    3241              :  * isCommit: if true, this is normal transaction commit, and we don't
    3242              :  * expect any remaining files; warn if there are some.
    3243              :  *
    3244              :  * isProcExit: if true, this is being called as the backend process is
    3245              :  * exiting. If that's the case, we should remove all temporary files; if
    3246              :  * that's not the case, we are being called for transaction commit/abort
    3247              :  * and should only remove transaction-local temp files.  In either case,
    3248              :  * also clean up "allocated" stdio files, dirs and fds.
    3249              :  */
    3250              : static void
    3251       647116 : CleanupTempFiles(bool isCommit, bool isProcExit)
    3252              : {
    3253              :     Index       i;
    3254              : 
    3255              :     /*
    3256              :      * Careful here: at proc_exit we need extra cleanup, not just
    3257              :      * xact_temporary files.
    3258              :      */
    3259       647116 :     if (isProcExit || have_xact_temporary_files)
    3260              :     {
    3261              :         Assert(FileIsNotOpen(0));   /* Make sure ring not corrupted */
    3262      1575228 :         for (i = 1; i < SizeVfdCache; i++)
    3263              :         {
    3264      1550016 :             unsigned short fdstate = VfdCache[i].fdstate;
    3265              : 
    3266      1550016 :             if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
    3267            4 :                 VfdCache[i].fileName != NULL)
    3268              :             {
    3269              :                 /*
    3270              :                  * If we're in the process of exiting a backend process, close
    3271              :                  * all temporary files. Otherwise, only close temporary files
    3272              :                  * local to the current transaction. They should be closed by
    3273              :                  * the ResourceOwner mechanism already, so this is just a
    3274              :                  * debugging cross-check.
    3275              :                  */
    3276            4 :                 if (isProcExit)
    3277            4 :                     FileClose(i);
    3278            0 :                 else if (fdstate & FD_CLOSE_AT_EOXACT)
    3279              :                 {
    3280            0 :                     elog(WARNING,
    3281              :                          "temporary file %s not closed at end-of-transaction",
    3282              :                          VfdCache[i].fileName);
    3283            0 :                     FileClose(i);
    3284              :                 }
    3285              :             }
    3286              :         }
    3287              : 
    3288        25212 :         have_xact_temporary_files = false;
    3289              :     }
    3290              : 
    3291              :     /* Complain if any allocated files remain open at commit. */
    3292       647116 :     if (isCommit && numAllocatedDescs > 0)
    3293            0 :         elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
    3294              :              numAllocatedDescs);
    3295              : 
    3296              :     /* Clean up "allocated" stdio files, dirs and fds. */
    3297       647270 :     while (numAllocatedDescs > 0)
    3298          154 :         FreeDesc(&allocatedDescs[0]);
    3299       647116 : }
    3300              : 
    3301              : 
    3302              : /*
    3303              :  * Remove temporary and temporary relation files left over from a prior
    3304              :  * postmaster session
    3305              :  *
    3306              :  * This should be called during postmaster startup.  It will forcibly
    3307              :  * remove any leftover files created by OpenTemporaryFile and any leftover
    3308              :  * temporary relation files created by mdcreate.
    3309              :  *
    3310              :  * During a post-backend-crash restart cycle, this routine is called when the
    3311              :  * remove_temp_files_after_crash GUC is enabled.  Otherwise, multiple crashes
    3312              :  * while queries are using temp files could result in useless storage usage
    3313              :  * that can only be reclaimed by a service restart.  The argument against
    3314              :  * enabling it is that someone might want to examine the temporary files for
    3315              :  * debugging purposes.  Keeping them does however mean that OpenTemporaryFile
    3316              :  * had better allow for collision with an existing temp file name.
    3317              :  *
    3318              :  * NOTE: this function and its subroutines generally report syscall failures
    3319              :  * with ereport(LOG) and keep going.  Removing temp files is not so critical
    3320              :  * that we should fail to start the database when we can't do it.
    3321              :  */
    3322              : void
    3323          956 : RemovePgTempFiles(void)
    3324              : {
    3325              :     char        temp_path[MAXPGPATH + sizeof(PG_TBLSPC_DIR) + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
    3326              :     DIR        *spc_dir;
    3327              :     struct dirent *spc_de;
    3328              : 
    3329              :     /*
    3330              :      * First process temp files in pg_default ($PGDATA/base)
    3331              :      */
    3332          956 :     snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
    3333          956 :     RemovePgTempFilesInDir(temp_path, true, false);
    3334          956 :     RemovePgTempRelationFiles("base");
    3335              : 
    3336              :     /*
    3337              :      * Cycle through temp directories for all non-default tablespaces.
    3338              :      */
    3339          956 :     spc_dir = AllocateDir(PG_TBLSPC_DIR);
    3340              : 
    3341         2941 :     while ((spc_de = ReadDirExtended(spc_dir, PG_TBLSPC_DIR, LOG)) != NULL)
    3342              :     {
    3343         1985 :         if (strcmp(spc_de->d_name, ".") == 0 ||
    3344         1029 :             strcmp(spc_de->d_name, "..") == 0)
    3345         1912 :             continue;
    3346              : 
    3347           73 :         snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
    3348           73 :                  PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY,
    3349              :                  PG_TEMP_FILES_DIR);
    3350           73 :         RemovePgTempFilesInDir(temp_path, true, false);
    3351              : 
    3352           73 :         snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
    3353           73 :                  PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
    3354           73 :         RemovePgTempRelationFiles(temp_path);
    3355              :     }
    3356              : 
    3357          956 :     FreeDir(spc_dir);
    3358              : 
    3359              :     /*
    3360              :      * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
    3361              :      * DataDir as well.  However, that is *not* cleaned here because doing so
    3362              :      * would create a race condition.  It's done separately, earlier in
    3363              :      * postmaster startup.
    3364              :      */
    3365          956 : }
    3366              : 
    3367              : /*
    3368              :  * Process one pgsql_tmp directory for RemovePgTempFiles.
    3369              :  *
    3370              :  * If missing_ok is true, it's all right for the named directory to not exist.
    3371              :  * Any other problem results in a LOG message.  (missing_ok should be true at
    3372              :  * the top level, since pgsql_tmp directories are not created until needed.)
    3373              :  *
    3374              :  * At the top level, this should be called with unlink_all = false, so that
    3375              :  * only files matching the temporary name prefix will be unlinked.  When
    3376              :  * recursing it will be called with unlink_all = true to unlink everything
    3377              :  * under a top-level temporary directory.
    3378              :  *
    3379              :  * (These two flags could be replaced by one, but it seems clearer to keep
    3380              :  * them separate.)
    3381              :  */
    3382              : void
    3383         1030 : RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
    3384              : {
    3385              :     DIR        *temp_dir;
    3386              :     struct dirent *temp_de;
    3387              :     char        rm_path[MAXPGPATH * 2];
    3388              : 
    3389         1030 :     temp_dir = AllocateDir(tmpdirname);
    3390              : 
    3391         1030 :     if (temp_dir == NULL && errno == ENOENT && missing_ok)
    3392          958 :         return;
    3393              : 
    3394          219 :     while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
    3395              :     {
    3396          147 :         if (strcmp(temp_de->d_name, ".") == 0 ||
    3397           75 :             strcmp(temp_de->d_name, "..") == 0)
    3398          144 :             continue;
    3399              : 
    3400            3 :         snprintf(rm_path, sizeof(rm_path), "%s/%s",
    3401            3 :                  tmpdirname, temp_de->d_name);
    3402              : 
    3403            3 :         if (unlink_all ||
    3404            3 :             strncmp(temp_de->d_name,
    3405              :                     PG_TEMP_FILE_PREFIX,
    3406              :                     strlen(PG_TEMP_FILE_PREFIX)) == 0)
    3407            3 :         {
    3408            3 :             PGFileType  type = get_dirent_type(rm_path, temp_de, false, LOG);
    3409              : 
    3410            3 :             if (type == PGFILETYPE_ERROR)
    3411            0 :                 continue;
    3412            3 :             else if (type == PGFILETYPE_DIR)
    3413              :             {
    3414              :                 /* recursively remove contents, then directory itself */
    3415            1 :                 RemovePgTempFilesInDir(rm_path, false, true);
    3416              : 
    3417            1 :                 if (rmdir(rm_path) < 0)
    3418            0 :                     ereport(LOG,
    3419              :                             (errcode_for_file_access(),
    3420              :                              errmsg("could not remove directory \"%s\": %m",
    3421              :                                     rm_path)));
    3422              :             }
    3423              :             else
    3424              :             {
    3425            2 :                 if (unlink(rm_path) < 0)
    3426            0 :                     ereport(LOG,
    3427              :                             (errcode_for_file_access(),
    3428              :                              errmsg("could not remove file \"%s\": %m",
    3429              :                                     rm_path)));
    3430              :             }
    3431              :         }
    3432              :         else
    3433            0 :             ereport(LOG,
    3434              :                     (errmsg("unexpected file found in temporary-files directory: \"%s\"",
    3435              :                             rm_path)));
    3436              :     }
    3437              : 
    3438           72 :     FreeDir(temp_dir);
    3439              : }
    3440              : 
    3441              : /* Process one tablespace directory, look for per-DB subdirectories */
    3442              : static void
    3443         1029 : RemovePgTempRelationFiles(const char *tsdirname)
    3444              : {
    3445              :     DIR        *ts_dir;
    3446              :     struct dirent *de;
    3447              :     char        dbspace_path[MAXPGPATH * 2];
    3448              : 
    3449         1029 :     ts_dir = AllocateDir(tsdirname);
    3450              : 
    3451         6459 :     while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
    3452              :     {
    3453              :         /*
    3454              :          * We're only interested in the per-database directories, which have
    3455              :          * numeric names.  Note that this code will also (properly) ignore "."
    3456              :          * and "..".
    3457              :          */
    3458         5430 :         if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
    3459         2129 :             continue;
    3460              : 
    3461         3301 :         snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
    3462         3301 :                  tsdirname, de->d_name);
    3463         3301 :         RemovePgTempRelationFilesInDbspace(dbspace_path);
    3464              :     }
    3465              : 
    3466         1029 :     FreeDir(ts_dir);
    3467         1029 : }
    3468              : 
    3469              : /* Process one per-dbspace directory for RemovePgTempRelationFiles */
    3470              : static void
    3471         3301 : RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
    3472              : {
    3473              :     DIR        *dbspace_dir;
    3474              :     struct dirent *de;
    3475              :     char        rm_path[MAXPGPATH * 2];
    3476              : 
    3477         3301 :     dbspace_dir = AllocateDir(dbspacedirname);
    3478              : 
    3479      1066360 :     while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
    3480              :     {
    3481      1063059 :         if (!looks_like_temp_rel_name(de->d_name))
    3482      1063055 :             continue;
    3483              : 
    3484            4 :         snprintf(rm_path, sizeof(rm_path), "%s/%s",
    3485            4 :                  dbspacedirname, de->d_name);
    3486              : 
    3487            4 :         if (unlink(rm_path) < 0)
    3488            0 :             ereport(LOG,
    3489              :                     (errcode_for_file_access(),
    3490              :                      errmsg("could not remove file \"%s\": %m",
    3491              :                             rm_path)));
    3492              :     }
    3493              : 
    3494         3301 :     FreeDir(dbspace_dir);
    3495         3301 : }
    3496              : 
    3497              : /* t<digits>_<digits>, optionally followed by _<forkname> and/or .<digits> */
    3498              : bool
    3499      1400601 : looks_like_temp_rel_name(const char *name)
    3500              : {
    3501              :     int         pos;
    3502              :     int         savepos;
    3503              : 
    3504              :     /* Must start with "t". */
    3505      1400601 :     if (name[0] != 't')
    3506      1400561 :         return false;
    3507              : 
    3508              :     /* Followed by a non-empty string of digits and then an underscore. */
    3509          196 :     for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
    3510              :         ;
    3511           40 :     if (pos == 1 || name[pos] != '_')
    3512            0 :         return false;
    3513              : 
    3514              :     /* Followed by another nonempty string of digits. */
    3515          196 :     for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
    3516              :         ;
    3517           40 :     if (savepos == pos)
    3518            0 :         return false;
    3519              : 
    3520              :     /* We might have _forkname or .segment or both. */
    3521           40 :     if (name[pos] == '_')
    3522              :     {
    3523           20 :         int         forkchar = forkname_chars(&name[pos + 1], NULL);
    3524              : 
    3525           20 :         if (forkchar <= 0)
    3526            0 :             return false;
    3527           20 :         pos += forkchar + 1;
    3528              :     }
    3529           40 :     if (name[pos] == '.')
    3530              :     {
    3531              :         int         segchar;
    3532              : 
    3533           40 :         for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
    3534              :             ;
    3535           20 :         if (segchar <= 1)
    3536            0 :             return false;
    3537           20 :         pos += segchar;
    3538              :     }
    3539              : 
    3540              :     /* Now we should be at the end. */
    3541           40 :     if (name[pos] != '\0')
    3542            0 :         return false;
    3543           40 :     return true;
    3544              : }
    3545              : 
    3546              : #ifdef HAVE_SYNCFS
    3547              : static void
    3548            0 : do_syncfs(const char *path)
    3549              : {
    3550              :     int         fd;
    3551              : 
    3552            0 :     ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
    3553              :                              path);
    3554              : 
    3555            0 :     fd = OpenTransientFile(path, O_RDONLY);
    3556            0 :     if (fd < 0)
    3557              :     {
    3558            0 :         ereport(LOG,
    3559              :                 (errcode_for_file_access(),
    3560              :                  errmsg("could not open file \"%s\": %m", path)));
    3561            0 :         return;
    3562              :     }
    3563            0 :     if (syncfs(fd) < 0)
    3564            0 :         ereport(LOG,
    3565              :                 (errcode_for_file_access(),
    3566              :                  errmsg("could not synchronize file system for file \"%s\": %m", path)));
    3567            0 :     CloseTransientFile(fd);
    3568              : }
    3569              : #endif
    3570              : 
    3571              : /*
    3572              :  * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
    3573              :  * all potential filesystems, depending on recovery_init_sync_method setting.
    3574              :  *
    3575              :  * We fsync regular files and directories wherever they are, but we
    3576              :  * follow symlinks only for pg_wal and immediately under pg_tblspc.
    3577              :  * Other symlinks are presumed to point at files we're not responsible
    3578              :  * for fsyncing, and might not have privileges to write at all.
    3579              :  *
    3580              :  * Errors are logged but not considered fatal; that's because this is used
    3581              :  * only during database startup, to deal with the possibility that there are
    3582              :  * issued-but-unsynced writes pending against the data directory.  We want to
    3583              :  * ensure that such writes reach disk before anything that's done in the new
    3584              :  * run.  However, aborting on error would result in failure to start for
    3585              :  * harmless cases such as read-only files in the data directory, and that's
    3586              :  * not good either.
    3587              :  *
    3588              :  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
    3589              :  * rewriting all changes again during recovery.
    3590              :  *
    3591              :  * Note we assume we're chdir'd into PGDATA to begin with.
    3592              :  */
    3593              : void
    3594          186 : SyncDataDirectory(void)
    3595              : {
    3596              :     bool        xlog_is_symlink;
    3597              : 
    3598              :     /* We can skip this whole thing if fsync is disabled. */
    3599          186 :     if (!enableFsync)
    3600          186 :         return;
    3601              : 
    3602              :     /*
    3603              :      * If pg_wal is a symlink, we'll need to recurse into it separately,
    3604              :      * because the first walkdir below will ignore it.
    3605              :      */
    3606            0 :     xlog_is_symlink = false;
    3607              : 
    3608              :     {
    3609              :         struct stat st;
    3610              : 
    3611            0 :         if (lstat("pg_wal", &st) < 0)
    3612            0 :             ereport(LOG,
    3613              :                     (errcode_for_file_access(),
    3614              :                      errmsg("could not stat file \"%s\": %m",
    3615              :                             "pg_wal")));
    3616            0 :         else if (S_ISLNK(st.st_mode))
    3617            0 :             xlog_is_symlink = true;
    3618              :     }
    3619              : 
    3620              : #ifdef HAVE_SYNCFS
    3621            0 :     if (recovery_init_sync_method == DATA_DIR_SYNC_METHOD_SYNCFS)
    3622              :     {
    3623              :         DIR        *dir;
    3624              :         struct dirent *de;
    3625              : 
    3626              :         /*
    3627              :          * On Linux, we don't have to open every single file one by one.  We
    3628              :          * can use syncfs() to sync whole filesystems.  We only expect
    3629              :          * filesystem boundaries to exist where we tolerate symlinks, namely
    3630              :          * pg_wal and the tablespaces, so we call syncfs() for each of those
    3631              :          * directories.
    3632              :          */
    3633              : 
    3634              :         /* Prepare to report progress syncing the data directory via syncfs. */
    3635            0 :         begin_startup_progress_phase();
    3636              : 
    3637              :         /* Sync the top level pgdata directory. */
    3638            0 :         do_syncfs(".");
    3639              :         /* If any tablespaces are configured, sync each of those. */
    3640            0 :         dir = AllocateDir(PG_TBLSPC_DIR);
    3641            0 :         while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
    3642              :         {
    3643              :             char        path[MAXPGPATH];
    3644              : 
    3645            0 :             if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
    3646            0 :                 continue;
    3647              : 
    3648            0 :             snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
    3649            0 :             do_syncfs(path);
    3650              :         }
    3651            0 :         FreeDir(dir);
    3652              :         /* If pg_wal is a symlink, process that too. */
    3653            0 :         if (xlog_is_symlink)
    3654            0 :             do_syncfs("pg_wal");
    3655            0 :         return;
    3656              :     }
    3657              : #endif                          /* !HAVE_SYNCFS */
    3658              : 
    3659              : #ifdef PG_FLUSH_DATA_WORKS
    3660              :     /* Prepare to report progress of the pre-fsync phase. */
    3661            0 :     begin_startup_progress_phase();
    3662              : 
    3663              :     /*
    3664              :      * If possible, hint to the kernel that we're soon going to fsync the data
    3665              :      * directory and its contents.  Errors in this step are even less
    3666              :      * interesting than normal, so log them only at DEBUG1.
    3667              :      */
    3668            0 :     walkdir(".", pre_sync_fname, false, DEBUG1);
    3669            0 :     if (xlog_is_symlink)
    3670            0 :         walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
    3671            0 :     walkdir(PG_TBLSPC_DIR, pre_sync_fname, true, DEBUG1);
    3672              : #endif
    3673              : 
    3674              :     /* Prepare to report progress syncing the data directory via fsync. */
    3675            0 :     begin_startup_progress_phase();
    3676              : 
    3677              :     /*
    3678              :      * Now we do the fsync()s in the same order.
    3679              :      *
    3680              :      * The main call ignores symlinks, so in addition to specially processing
    3681              :      * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
    3682              :      * process_symlinks = true.  Note that if there are any plain directories
    3683              :      * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
    3684              :      * so we don't worry about optimizing it.
    3685              :      */
    3686            0 :     walkdir(".", datadir_fsync_fname, false, LOG);
    3687            0 :     if (xlog_is_symlink)
    3688            0 :         walkdir("pg_wal", datadir_fsync_fname, false, LOG);
    3689            0 :     walkdir(PG_TBLSPC_DIR, datadir_fsync_fname, true, LOG);
    3690              : }
    3691              : 
    3692              : /*
    3693              :  * walkdir: recursively walk a directory, applying the action to each
    3694              :  * regular file and directory (the named directory itself is done last).
    3695              :  *
    3696              :  * If process_symlinks is true, the action and recursion are also applied
    3697              :  * to regular files and directories that are pointed to by symlinks in the
    3698              :  * given directory; otherwise symlinks are ignored.  Symlinks are always
    3699              :  * ignored in subdirectories, ie we intentionally don't pass down the
    3700              :  * process_symlinks flag to recursive calls.
    3701              :  *
    3702              :  * Errors are reported at level elevel, which might be ERROR or less.
    3703              :  *
    3704              :  * See also walkdir in file_utils.c, which is a frontend version of this
    3705              :  * logic.
    3706              :  */
    3707              : static void
    3708          252 : walkdir(const char *path,
    3709              :         void (*action) (const char *fname, bool isdir, int elevel),
    3710              :         bool process_symlinks,
    3711              :         int elevel)
    3712              : {
    3713              :     DIR        *dir;
    3714              :     struct dirent *de;
    3715              : 
    3716          252 :     dir = AllocateDir(path);    /* a NULL result is handed to ReadDirExtended and checked at the end */
    3717              : 
    3718         2572 :     while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
    3719              :     {
    3720              :         char        subpath[MAXPGPATH * 2]; /* receives "path/d_name" built below */
    3721              : 
    3722         2320 :         CHECK_FOR_INTERRUPTS();
    3723              : 
    3724         2320 :         if (strcmp(de->d_name, ".") == 0 ||
    3725         2068 :             strcmp(de->d_name, "..") == 0)
    3726          504 :             continue;
    3727              : 
    3728         1816 :         snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
    3729              : 
    3730         1816 :         switch (get_dirent_type(subpath, de, process_symlinks, elevel))
    3731              :         {
    3732         1816 :             case PGFILETYPE_REG:
    3733         1816 :                 (*action) (subpath, false, elevel);
    3734         1816 :                 break;
    3735            0 :             case PGFILETYPE_DIR:
    3736            0 :                 walkdir(subpath, action, false, elevel);    /* process_symlinks is never passed down */
    3737            0 :                 break;
    3738            0 :             default:
    3739              : 
    3740              :                 /*
    3741              :                  * Errors are already reported directly by get_dirent_type(),
    3742              :                  * and any remaining symlinks and unknown file types are
    3743              :                  * ignored.
    3744              :                  */
    3745            0 :                 break;
    3746              :         }
    3747              :     }
    3748              : 
    3749          252 :     FreeDir(dir);               /* we ignore any error here */
    3750              : 
    3751              :     /*
    3752              :      * It's important to fsync the destination directory itself as individual
    3753              :      * file fsyncs don't guarantee that the directory entry for the file is
    3754              :      * synced.  However, skip this if AllocateDir failed; the action function
    3755              :      * might not be robust against that.
    3756              :      */
    3757          252 :     if (dir)
    3758          252 :         (*action) (path, true, elevel);
    3759          252 : }
    3760              : 
    3761              : 
    3762              : /*
    3763              :  * Hint to the OS that it should get ready to fsync() this file.
    3764              :  *
    3765              :  * Ignores errors trying to open unreadable files, and logs other errors at a
    3766              :  * caller-specified level.  Directories are skipped without being opened.
    3767              :  */
    3768              : #ifdef PG_FLUSH_DATA_WORKS
    3769              : 
    3770              : static void
    3771            0 : pre_sync_fname(const char *fname, bool isdir, int elevel)
    3772              : {
    3773              :     int         fd;
    3774              : 
    3775              :     /* Don't try to flush directories, it'll likely just fail */
    3776            0 :     if (isdir)
    3777            0 :         return;
    3778              : 
    3779            0 :     ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
    3780              :                              fname);
    3781              : 
    3782            0 :     fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
    3783              : 
    3784            0 :     if (fd < 0)
    3785              :     {
    3786            0 :         if (errno == EACCES)
    3787            0 :             return;             /* unreadable file: skip it without logging */
    3788            0 :         ereport(elevel,
    3789              :                 (errcode_for_file_access(),
    3790              :                  errmsg("could not open file \"%s\": %m", fname)));
    3791            0 :         return;
    3792              :     }
    3793              : 
    3794              :     /*
    3795              :      * pg_flush_data() ignores errors, which is ok because this is only a
    3796              :      * hint.
    3797              :      */
    3798            0 :     pg_flush_data(fd, 0, 0);    /* NOTE(review): 0, 0 presumably means the whole file -- confirm */
    3799              : 
    3800            0 :     if (CloseTransientFile(fd) != 0)
    3801            0 :         ereport(elevel,
    3802              :                 (errcode_for_file_access(),
    3803              :                  errmsg("could not close file \"%s\": %m", fname)));
    3804              : }
    3805              : 
    3806              : #endif                          /* PG_FLUSH_DATA_WORKS */
    3807              : 
    3808              : static void
    3809            0 : datadir_fsync_fname(const char *fname, bool isdir, int elevel)  /* walkdir() action: fsync one entry */
    3810              : {
    3811            0 :     ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
    3812              :                              fname);
    3813              : 
    3814              :     /*
    3815              :      * We want to silently ignore errors about unreadable files.  Pass that
    3816              :      * desire on to fsync_fname_ext().
    3817              :      */
    3818            0 :     fsync_fname_ext(fname, isdir, true, elevel);    /* its return value is not used */
    3819            0 : }
    3820              : 
    3821              : static void
    3822         2068 : unlink_if_exists_fname(const char *fname, bool isdir, int elevel)   /* same signature as walkdir()'s action */
    3823              : {
    3824         2068 :     if (isdir)
    3825              :     {
    3826          252 :         if (rmdir(fname) != 0 && errno != ENOENT)   /* an already-missing directory is not an error */
    3827            0 :             ereport(elevel,
    3828              :                     (errcode_for_file_access(),
    3829              :                      errmsg("could not remove directory \"%s\": %m", fname)));
    3830              :     }
    3831              :     else
    3832              :     {
    3833              :         /* Use PathNameDeleteTemporaryFile to report filesize */
    3834         1816 :         PathNameDeleteTemporaryFile(fname, false);  /* note: elevel is not passed along here */
    3835              :     }
    3836         2068 : }
    3837              : 
    3838              : /*
    3839              :  * fsync_fname_ext -- Try to fsync a file or directory
    3840              :  *
    3841              :  * If ignore_perm is true, ignore errors upon trying to open unreadable
    3842              :  * files. Logs other errors at a caller-specified level.
    3843              :  *
    3844              :  * Returns 0 on success or if the error was one we ignore, -1 otherwise.
    3845              :  */
    3846              : int
    3847        44396 : fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
    3848              : {
    3849              :     int         fd;
    3850              :     int         flags;
    3851              :     int         returncode;
    3852              : 
    3853              :     /*
    3854              :      * Some OSes require directories to be opened read-only whereas other
    3855              :      * systems don't allow us to fsync files opened read-only; so we need both
    3856              :      * cases here.  Using O_RDWR will cause us to fail to fsync files that are
    3857              :      * not writable by our userid, but we assume that's OK.
    3858              :      */
    3859        44396 :     flags = PG_BINARY;
    3860        44396 :     if (!isdir)
    3861        16514 :         flags |= O_RDWR;
    3862              :     else
    3863        27882 :         flags |= O_RDONLY;
    3864              : 
    3865        44396 :     fd = OpenTransientFile(fname, flags);
    3866              : 
    3867              :     /*
    3868              :      * Some OSes don't allow us to open directories at all (Windows returns
    3869              :      * EACCES); just ignore the error in that case.  If desired, also silently
    3870              :      * ignore errors about unreadable files.  Log others.
    3871              :      */
    3872        44396 :     if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
    3873            0 :         return 0;
    3874        44396 :     else if (fd < 0 && ignore_perm && errno == EACCES)
    3875            0 :         return 0;
    3876        44396 :     else if (fd < 0)
    3877              :     {
    3878            0 :         ereport(elevel,
    3879              :                 (errcode_for_file_access(),
    3880              :                  errmsg("could not open file \"%s\": %m", fname)));
    3881            0 :         return -1;
    3882              :     }
    3883              : 
    3884        44396 :     returncode = pg_fsync(fd);
    3885              : 
    3886              :     /*
    3887              :      * Some OSes don't allow us to fsync directories at all, so we can ignore
    3888              :      * those errors. Anything else needs to be logged.
    3889              :      */
    3890        44396 :     if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
    3891              :     {
    3892              :         int         save_errno;
    3893              : 
    3894              :         /* close file upon error, might not be in transaction context */
    3895            0 :         save_errno = errno;     /* keep fsync's errno for the %m in the message below */
    3896            0 :         (void) CloseTransientFile(fd);
    3897            0 :         errno = save_errno;
    3898              : 
    3899            0 :         ereport(elevel,
    3900              :                 (errcode_for_file_access(),
    3901              :                  errmsg("could not fsync file \"%s\": %m", fname)));
    3902            0 :         return -1;
    3903              :     }
    3904              : 
    3905        44396 :     if (CloseTransientFile(fd) != 0)    /* also reached after an ignored directory fsync failure */
    3906              :     {
    3907            0 :         ereport(elevel,
    3908              :                 (errcode_for_file_access(),
    3909              :                  errmsg("could not close file \"%s\": %m", fname)));
    3910            0 :         return -1;
    3911              :     }
    3912              : 
    3913        44396 :     return 0;
    3914              : }
    3915              : 
    3916              : /*
    3917              :  * fsync_parent_path -- fsync the parent path of a file or directory
    3918              :  *
    3919              :  * This is aimed at making file operations persistent on disk in case of an
    3920              :  * OS crash or power failure.  Returns 0 on success, -1 on failure.
    3921              :  */
    3922              : static int
    3923         8036 : fsync_parent_path(const char *fname, int elevel)
    3924              : {
    3925              :     char        parentpath[MAXPGPATH];
    3926              : 
    3927         8036 :     strlcpy(parentpath, fname, MAXPGPATH);  /* work on a copy; it is modified in place below */
    3928         8036 :     get_parent_directory(parentpath);
    3929              : 
    3930              :     /*
    3931              :      * get_parent_directory() returns an empty string if the input argument is
    3932              :      * just a file name (see comments in path.c), so handle that as being the
    3933              :      * current directory.
    3934              :      */
    3935         8036 :     if (strlen(parentpath) == 0)
    3936          223 :         strlcpy(parentpath, ".", MAXPGPATH);
    3937              : 
    3938         8036 :     if (fsync_fname_ext(parentpath, true, false, elevel) != 0)  /* isdir = true, ignore_perm = false */
    3939            0 :         return -1;
    3940              : 
    3941         8036 :     return 0;
    3942              : }
    3943              : 
    3944              : /*
    3945              :  * Create a PostgreSQL data sub-directory
    3946              :  *
    3947              :  * The data directory itself, and most of its sub-directories, are created at
    3948              :  * initdb time, but we do have some occasions when we create directories in
    3949              :  * the backend (CREATE TABLESPACE, for example).  In those cases, we want to
    3950              :  * make sure that those directories are created consistently.  Today, that means
    3951              :  * making sure that the created directory has the correct permissions, which is
    3952              :  * what pg_dir_create_mode tracks for us.
    3953              :  *
    3954              :  * Note that we also set the umask() based on what we understand the correct
    3955              :  * permissions to be (see file_perm.c).
    3956              :  *
    3957              :  * For permissions other than the default, mkdir() can be used directly, but
    3958              :  * be sure to consider carefully such cases -- a sub-directory with incorrect
    3959              :  * permissions in a PostgreSQL data directory could cause backups and other
    3960              :  * processes to fail.
    3961              :  */
    3962              : int
    3963         1690 : MakePGDirectory(const char *directoryName)
    3964              : {
    3965         1690 :     return mkdir(directoryName, pg_dir_create_mode);    /* mkdir()'s result is returned unchanged */
    3966              : }
    3967              : 
    3968              : /*
    3969              :  * Return the passed-in error level, or PANIC if data_sync_retry is off.
    3970              :  *
    3971              :  * Failure to fsync any data file is cause for immediate panic, unless
    3972              :  * data_sync_retry is enabled.  Data may have been written to the operating
    3973              :  * system and removed from our buffer pool already, and if we are running on
    3974              :  * an operating system that forgets dirty data on write-back failure, there
    3975              :  * may be only one copy of the data remaining: in the WAL.  A later attempt to
    3976              :  * fsync again might falsely report success.  Therefore we must not allow any
    3977              :  * further checkpoints to be attempted.  data_sync_retry can in theory be
    3978              :  * enabled on systems known not to drop dirty buffered data on write-back
    3979              :  * failure (with the likely outcome that checkpoints will continue to fail
    3980              :  * until the underlying problem is fixed).
    3981              :  *
    3982              :  * Any code that reports a failure from fsync() or related functions should
    3983              :  * filter the error level with this function.
    3984              :  */
    3985              : int
    3986        22892 : data_sync_elevel(int elevel)
    3987              : {
    3988        22892 :     return data_sync_retry ? elevel : PANIC;    /* the caller's level is used only in retry mode */
    3989              : }
    3990              : 
    3991              : bool
    3992         1224 : check_debug_io_direct(char **newval, void **extra, GucSource source)    /* validate a debug_io_direct value; flags go to *extra */
    3993              : {
    3994         1224 :     bool        result = true;
    3995              :     int         flags;
    3996              : 
    3997              : #if PG_O_DIRECT == 0
    3998              :     if (strcmp(*newval, "") != 0)   /* only the empty setting is accepted on this branch */
    3999              :     {
    4000              :         GUC_check_errdetail("\"%s\" is not supported on this platform.",
    4001              :                             "debug_io_direct");
    4002              :         result = false;
    4003              :     }
    4004              :     flags = 0;
    4005              : #else                           /* PG_O_DIRECT != 0 */
    4006              :     List       *elemlist;
    4007              :     ListCell   *l;
    4008              :     char       *rawstring;
    4009              : 
    4010              :     /* Need a modifiable copy of string */
    4011         1224 :     rawstring = pstrdup(*newval);
    4012              : 
    4013         1224 :     if (!SplitGUCList(rawstring, ',', &elemlist))
    4014              :     {
    4015            0 :         GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
    4016              :                             "debug_io_direct");
    4017            0 :         pfree(rawstring);
    4018            0 :         list_free(elemlist);
    4019            0 :         return false;
    4020              :     }
    4021              : 
    4022         1224 :     flags = 0;
    4023         1230 :     foreach(l, elemlist)        /* OR together one IO_DIRECT_* bit per recognized item */
    4024              :     {
    4025            6 :         char       *item = (char *) lfirst(l);
    4026              : 
    4027            6 :         if (pg_strcasecmp(item, "data") == 0)
    4028            2 :             flags |= IO_DIRECT_DATA;
    4029            4 :         else if (pg_strcasecmp(item, "wal") == 0)
    4030            2 :             flags |= IO_DIRECT_WAL;
    4031            2 :         else if (pg_strcasecmp(item, "wal_init") == 0)
    4032            2 :             flags |= IO_DIRECT_WAL_INIT;
    4033              :         else
    4034              :         {
    4035            0 :             GUC_check_errdetail("Invalid option \"%s\".", item);
    4036            0 :             result = false;
    4037            0 :             break;              /* stop at the first unrecognized item */
    4038              :         }
    4039              :     }
    4040              : 
    4041              :     /*
    4042              :      * It's possible to configure block sizes smaller than our assumed I/O
    4043              :      * alignment size, which could result in invalid I/O requests.
    4044              :      */
    4045              : #if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
    4046              :     if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
    4047              :     {
    4048              :         GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
    4049              :                             "debug_io_direct", "XLOG_BLCKSZ");
    4050              :         result = false;
    4051              :     }
    4052              : #endif
    4053              : #if BLCKSZ < PG_IO_ALIGN_SIZE
    4054              :     if (result && (flags & IO_DIRECT_DATA))
    4055              :     {
    4056              :         GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
    4057              :                             "debug_io_direct", "BLCKSZ");
    4058              :         result = false;
    4059              :     }
    4060              : #endif
    4061              : 
    4062         1224 :     pfree(rawstring);
    4063         1224 :     list_free(elemlist);
    4064              : #endif                          /* PG_O_DIRECT == 0 */
    4065              : 
    4066         1224 :     if (!result)
    4067            0 :         return result;          /* rejected: *extra is left untouched */
    4068              : 
    4069              :     /* Save the flags in *extra, for use by assign_debug_io_direct */
    4070         1224 :     *extra = guc_malloc(LOG, sizeof(int));
    4071         1224 :     if (!*extra)
    4072            0 :         return false;
    4073         1224 :     *((int *) *extra) = flags;
    4074              : 
    4075         1224 :     return result;
    4076              : }
    4077              : 
    4078              : void
    4079         1224 : assign_debug_io_direct(const char *newval, void *extra) /* install the flags saved by check_debug_io_direct */
    4080              : {
    4081         1224 :     int        *flags = (int *) extra;  /* extra is dereferenced unconditionally below */
    4082              : 
    4083         1224 :     io_direct_flags = *flags;
    4084         1224 : }
    4085              : 
    4086              : /* ResourceOwner callbacks */
    4087              : 
    4088              : static void
    4089            5 : ResOwnerReleaseFile(Datum res)  /* close the File whose number is carried in res */
    4090              : {
    4091            5 :     File        file = (File) DatumGetInt32(res);
    4092              :     Vfd        *vfdP;
    4093              : 
    4094              :     Assert(FileIsValid(file));
    4095              : 
    4096            5 :     vfdP = &VfdCache[file];
    4097            5 :     vfdP->resowner = NULL;      /* NOTE(review): presumably keeps FileClose() from touching the owner again -- confirm */
    4098              : 
    4099            5 :     FileClose(file);
    4100            5 : }
    4101              : 
    4102              : static char *
    4103            0 : ResOwnerPrintFile(Datum res)    /* describe the resource as a psprintf'd "File N" string */
    4104              : {
    4105            0 :     return psprintf("File %d", DatumGetInt32(res));
    4106              : }
        

Generated by: LCOV version 2.0-1