LCOV - code coverage report
Current view: top level - src/backend/storage/file - fd.c (source / functions) Hit Total Coverage
Test: PostgreSQL 15devel Lines: 671 897 74.8 %
Date: 2021-09-17 15:07:27 Functions: 85 90 94.4 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * fd.c
       4             :  *    Virtual file descriptor code.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/storage/file/fd.c
      11             :  *
      12             :  * NOTES:
      13             :  *
      14             :  * This code manages a cache of 'virtual' file descriptors (VFDs).
      15             :  * The server opens many file descriptors for a variety of reasons,
      16             :  * including base tables, scratch files (e.g., sort and hash spool
      17             :  * files), and random calls to C library routines like system(3); it
      18             :  * is quite easy to exceed system limits on the number of open files a
      19             :  * single process can have.  (This is around 1024 on many modern
      20             :  * operating systems, but may be lower on others.)
      21             :  *
      22             :  * VFDs are managed as an LRU pool, with actual OS file descriptors
      23             :  * being opened and closed as needed.  Obviously, if a file is
      24             :  * opened using these interfaces, all subsequent operations must also
      25             :  * be through these interfaces (the File type is not a real file
      26             :  * descriptor).
      27             :  *
      28             :  * For this scheme to work, most (if not all) routines throughout the
      29             :  * server should use these interfaces instead of calling the C library
      30             :  * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
      31             :  * may find ourselves short of real file descriptors anyway.
      32             :  *
      33             :  * INTERFACE ROUTINES
      34             :  *
      35             :  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
      36             :  * A File opened with OpenTemporaryFile is automatically deleted when the
      37             :  * File is closed, either explicitly or implicitly at end of transaction or
      38             :  * process exit. PathNameOpenFile is intended for files that are held open
      39             :  * for a long time, like relation files. It is the caller's responsibility
      40             :  * to close them; there is no automatic mechanism in fd.c for that.
      41             :  *
      42             :  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
      43             :  * temporary files that have names so that they can be shared between
      44             :  * backends.  Such files are automatically closed and count against the
      45             :  * temporary file limit of the backend that creates them, but unlike anonymous
      46             :  * files they are not automatically deleted.  See sharedfileset.c for a shared
      47             :  * ownership mechanism that provides automatic cleanup for shared files when
      48             :  * the last of a group of backends detaches.
      49             :  *
      50             :  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
      51             :  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
      52             :  * They behave like the corresponding native functions, except that the handle
      53             :  * is registered with the current subtransaction, and will be automatically
      54             :  * closed at abort. These are intended mainly for short operations like
      55             :  * reading a configuration file; there is a limit on the number of files that
      56             :  * can be opened using these functions at any one time.
      57             :  *
      58             :  * Finally, BasicOpenFile is just a thin wrapper around open() that can
      59             :  * release file descriptors in use by the virtual file descriptors if
      60             :  * necessary. There is no automatic cleanup of file descriptors returned by
      61             :  * BasicOpenFile; it is solely the caller's responsibility to close the file
      62             :  * descriptor by calling close(2).
      63             :  *
      64             :  * If a non-virtual file descriptor needs to be held open for any length of
      65             :  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
      66             :  * (and eventually ReleaseExternalFD), so that we can take it into account
      67             :  * while deciding how many VFDs can be open.  This applies to FDs obtained
      68             :  * with BasicOpenFile as well as those obtained without use of any fd.c API.
      69             :  *
      70             :  *-------------------------------------------------------------------------
      71             :  */
      72             : 
      73             : #include "postgres.h"
      74             : 
      75             : #include <dirent.h>
      76             : #include <sys/file.h>
      77             : #include <sys/param.h>
      78             : #include <sys/stat.h>
      79             : #include <sys/types.h>
      80             : #ifndef WIN32
      81             : #include <sys/mman.h>
      82             : #endif
      83             : #include <limits.h>
      84             : #include <unistd.h>
      85             : #include <fcntl.h>
      86             : #ifdef HAVE_SYS_RESOURCE_H
      87             : #include <sys/resource.h>     /* for getrlimit */
      88             : #endif
      89             : 
      90             : #include "access/xact.h"
      91             : #include "access/xlog.h"
      92             : #include "catalog/pg_tablespace.h"
      93             : #include "common/file_perm.h"
      94             : #include "common/file_utils.h"
      95             : #include "miscadmin.h"
      96             : #include "pgstat.h"
      97             : #include "port/pg_iovec.h"
      98             : #include "portability/mem.h"
      99             : #include "storage/fd.h"
     100             : #include "storage/ipc.h"
     101             : #include "utils/guc.h"
     102             : #include "utils/resowner_private.h"
     103             : 
     104             : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
     105             : #if defined(HAVE_SYNC_FILE_RANGE)
     106             : #define PG_FLUSH_DATA_WORKS 1
     107             : #elif !defined(WIN32) && defined(MS_ASYNC)
     108             : #define PG_FLUSH_DATA_WORKS 1
     109             : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
     110             : #define PG_FLUSH_DATA_WORKS 1
     111             : #endif
     112             : 
     113             : /*
     114             :  * We must leave some file descriptors free for system(), the dynamic loader,
     115             :  * and other code that tries to open files without consulting fd.c.  This
     116             :  * is the number left free.  (While we try fairly hard to prevent EMFILE
     117             :  * errors, there's never any guarantee that we won't get ENFILE due to
     118             :  * other processes chewing up FDs.  So it's a bad idea to try to open files
     119             :  * without consulting fd.c.  Nonetheless we cannot control all code.)
     120             :  *
     121             :  * Because this is just a fixed setting, we are effectively assuming that
     122             :  * no such code will leave FDs open over the long term; otherwise the slop
     123             :  * is likely to be insufficient.  Note in particular that we expect that
     124             :  * loading a shared library does not result in any permanent increase in
     125             :  * the number of open files.  (This appears to be true on most if not
     126             :  * all platforms as of Feb 2004.)
     127             :  */
     128             : #define NUM_RESERVED_FDS        10
     129             : 
     130             : /*
     131             :  * If we have fewer than this many usable FDs after allowing for the reserved
     132             :  * ones, choke.  (This value is chosen to work with "ulimit -n 64", but not
     133             :  * much less than that.  Note that this value ensures numExternalFDs can be
     134             :  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
     135             :  * will not pass unless that can grow to at least 14.)
     136             :  */
     137             : #define FD_MINFREE              48
     138             : 
     139             : /*
     140             :  * A number of platforms allow individual processes to open many more files
     141             :  * than they can really support when *many* processes do the same thing.
     142             :  * This GUC parameter lets the DBA limit max_safe_fds to something less than
     143             :  * what the postmaster's initial probe suggests will work.
     144             :  */
     145             : int         max_files_per_process = 1000;
     146             : 
     147             : /*
     148             :  * Maximum number of file descriptors to open for operations that fd.c knows
     149             :  * about (VFDs, AllocateFile etc, or "external" FDs).  This is initialized
     150             :  * to a conservative value, and remains that way indefinitely in bootstrap or
     151             :  * standalone-backend cases.  In normal postmaster operation, the postmaster
     152             :  * calls set_max_safe_fds() late in initialization to update the value, and
     153             :  * that value is then inherited by forked subprocesses.
     154             :  *
     155             :  * Note: the value of max_files_per_process is taken into account while
     156             :  * setting this variable, and so need not be tested separately.
     157             :  */
     158             : int         max_safe_fds = FD_MINFREE;  /* default if not changed */
     159             : 
     160             : /* Whether it is safe to continue running after fsync() fails. */
     161             : bool        data_sync_retry = false;
     162             : 
     163             : /* How SyncDataDirectory() should do its job. */
     164             : int         recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
     165             : 
     166             : /*
                     :  * Debugging support.
                     :  *
                     :  * When FDDEBUG is defined, DO_DB(A) executes statement A, saving errno
                     :  * beforehand and restoring it afterwards so that the debug statement cannot
                     :  * change the errno seen by the surrounding code.  Without FDDEBUG it expands
                     :  * to a no-op expression and A is not evaluated at all.
                     :  */
     167             : 
     168             : #ifdef FDDEBUG
     169             : #define DO_DB(A) \
     170             :     do { \
     171             :         int         _do_db_save_errno = errno; \
     172             :         A; \
     173             :         errno = _do_db_save_errno; \
     174             :     } while (0)
     175             : #else
     176             : #define DO_DB(A) \
     177             :     ((void) 0)
     178             : #endif
     179             : /* Value stored in a Vfd's fd field while it has no kernel FD assigned. */
     180             : #define VFD_CLOSED (-1)
     181             : /*
                     :  * FileIsValid: true if 'file' is a nonzero index inside VfdCache whose slot
                     :  * has a non-NULL fileName, i.e. is in use.  Index 0 is never valid.  Note
                     :  * that the macro evaluates its argument more than once.
                     :  */
     182             : #define FileIsValid(file) \
     183             :     ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
     184             : /* FileIsNotOpen: true if the VFD currently holds no kernel FD. */
     185             : #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
     186             : 
     187             : /* these are the assigned bits in fdstate below: */
     188             : #define FD_DELETE_AT_CLOSE  (1 << 0)  /* T = delete when closed */
     189             : #define FD_CLOSE_AT_EOXACT  (1 << 1)  /* T = close at eoXact */
     190             : #define FD_TEMP_FILE_LIMIT  (1 << 2)  /* T = respect temp_file_limit */
     191             : /*
                     :  * Vfd: state kept for one virtual file descriptor, i.e. one slot of the
                     :  * VfdCache array.  A slot is in use when fileName is non-NULL, and it
                     :  * additionally holds a kernel FD when fd != VFD_CLOSED.  The link fields
                     :  * are File values, that is, indexes into VfdCache.
                     :  */
     192             : typedef struct vfd
     193             : {
     194             :     int         fd;             /* current FD, or VFD_CLOSED if none */
     195             :     unsigned short fdstate;     /* bitflags for VFD's state (FD_* bits above) */
     196             :     ResourceOwner resowner;     /* owner, for automatic cleanup */
     197             :     File        nextFree;       /* link to next free VFD, if in freelist */
     198             :     File        lruMoreRecently;    /* doubly linked recency-of-use list */
     199             :     File        lruLessRecently;    /* other direction of the same list */
     200             :     off_t       fileSize;       /* current size of file (0 if not temporary) */
     201             :     char       *fileName;       /* name of file, or NULL for unused VFD */
     202             :     /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
     203             :     int         fileFlags;      /* open(2) flags for (re)opening the file */
     204             :     mode_t      fileMode;       /* mode to pass to open(2) */
     205             : } Vfd;
     206             : 
     207             : /*
     208             :  * Virtual File Descriptor array pointer and size.  This grows as
     209             :  * needed.  'File' values are indexes into this array.
     210             :  * Note that VfdCache[0] is not a usable VFD, just a list header.
     211             :  */
     212             : static Vfd *VfdCache;
     213             : static Size SizeVfdCache = 0;
     214             : 
     215             : /*
     216             :  * Number of file descriptors known to be in use by VFD entries.
     217             :  */
     218             : static int  nfile = 0;
     219             : 
     220             : /*
     221             :  * Flag to tell whether it's worth scanning VfdCache looking for temp files
     222             :  * to close
     223             :  */
     224             : static bool have_xact_temporary_files = false;
     225             : 
     226             : /*
     227             :  * Tracks the total size of all temporary files.  Note: when temp_file_limit
     228             :  * is being enforced, this cannot overflow since the limit cannot be more
     229             :  * than INT_MAX kilobytes.  When not enforcing, it could theoretically
     230             :  * overflow, but we don't care.
     231             :  */
     232             : static uint64 temporary_files_size = 0;
     233             : 
     234             : /* Temporary file access initialized and not yet shut down? */
     235             : #ifdef USE_ASSERT_CHECKING
     236             : static bool temporary_files_allowed = false;
     237             : #endif
     238             : 
     239             : /*
     240             :  * List of OS handles opened with AllocateFile, AllocateDir and
     241             :  * OpenTransientFile.
                     :  *
                     :  * Each entry records which kind of handle it holds, a subtransaction ID,
                     :  * and the handle itself in the 'desc' union.  The enum also has a pipe
                     :  * kind, presumably for handles from OpenPipeStream -- confirm.
     242             :  */
     243             : typedef enum
     244             : {
     245             :     AllocateDescFile,
     246             :     AllocateDescPipe,
     247             :     AllocateDescDir,
     248             :     AllocateDescRawFD
     249             : } AllocateDescKind;
     250             : 
     251             : typedef struct
     252             : {
     253             :     AllocateDescKind kind;      /* which kind of handle 'desc' holds */
     254             :     SubTransactionId create_subid;  /* presumably the subxact that opened it -- confirm */
     255             :     union
     256             :     {
     257             :         FILE       *file;       /* stdio stream */
     258             :         DIR        *dir;        /* directory stream */
     259             :         int         fd;         /* raw file descriptor */
     260             :     }           desc;
     261             : } AllocateDesc;
     262             : 
     263             : static int  numAllocatedDescs = 0;  /* presumably entries in use -- confirm */
     264             : static int  maxAllocatedDescs = 0;  /* presumably allocated array length -- confirm */
     265             : static AllocateDesc *allocatedDescs = NULL;
     266             : 
     267             : /*
     268             :  * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
     269             :  */
     270             : static int  numExternalFDs = 0;
     271             : 
     272             : /*
     273             :  * Number of temporary files opened during the current session;
     274             :  * this is used in generation of tempfile names.
     275             :  */
     276             : static long tempFileCounter = 0;
     277             : 
     278             : /*
     279             :  * Array of OIDs of temp tablespaces.  (Some entries may be InvalidOid,
     280             :  * indicating that the current database's default tablespace should be used.)
     281             :  * When numTempTableSpaces is -1, this has not been set in the current
     282             :  * transaction.
     283             :  */
     284             : static Oid *tempTableSpaces = NULL;
     285             : static int  numTempTableSpaces = -1;
     286             : static int  nextTempTableSpace = 0;
     287             : 
     288             : 
     289             : /*--------------------
     290             :  *
     291             :  * Private Routines
     292             :  *
     293             :  * Delete          - delete a file from the Lru ring
     294             :  * LruDelete       - remove a file from the Lru ring and close its FD
     295             :  * Insert          - put a file at the front of the Lru ring
     296             :  * LruInsert       - put a file at the front of the Lru ring and open it
     297             :  * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
     298             :  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
     299             :  * AllocateVfd     - grab a free (or new) file record (from VfdCache)
     300             :  * FreeVfd         - free a file record
     301             :  *
     302             :  * The Least Recently Used ring is a doubly linked list that begins and
     303             :  * ends on element zero.  Element zero is special -- it doesn't represent
     304             :  * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
     305             :  * anchor that shows us the beginning/end of the ring.
     306             :  * Only VFD elements that are currently really open (have an FD assigned) are
     307             :  * in the Lru ring.  Elements that are "virtually" open can be recognized
     308             :  * by having a non-null fileName field.
     309             :  *
     310             :  * example:
     311             :  *
     312             :  *     /--less----\                /---------\
     313             :  *     v           \              v           \
     314             :  *   #0 --more---> LeastRecentlyUsed --more-\ \
     315             :  *    ^\                                    | |
     316             :  *     \\less--> MostRecentlyUsedFile    <---/ |
     317             :  *      \more---/                    \--less--/
     318             :  *
     319             :  *--------------------
     320             :  */
     321             : static void Delete(File file);
     322             : static void LruDelete(File file);
     323             : static void Insert(File file);
     324             : static int  LruInsert(File file);
     325             : static bool ReleaseLruFile(void);
     326             : static void ReleaseLruFiles(void);
     327             : static File AllocateVfd(void);
     328             : static void FreeVfd(File file);
     329             : 
     330             : static int  FileAccess(File file);
     331             : static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
     332             : static bool reserveAllocatedDesc(void);
     333             : static int  FreeDesc(AllocateDesc *desc);
     334             : 
     335             : static void BeforeShmemExit_Files(int code, Datum arg);
     336             : static void CleanupTempFiles(bool isCommit, bool isProcExit);
     337             : static void RemovePgTempRelationFiles(const char *tsdirname);
     338             : static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
     339             : 
     340             : static void walkdir(const char *path,
     341             :                     void (*action) (const char *fname, bool isdir, int elevel),
     342             :                     bool process_symlinks,
     343             :                     int elevel);
     344             : #ifdef PG_FLUSH_DATA_WORKS
     345             : static void pre_sync_fname(const char *fname, bool isdir, int elevel);
     346             : #endif
     347             : static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
     348             : static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
     349             : 
     350             : static int  fsync_parent_path(const char *fname, int elevel);
     351             : 
     352             : 
     353             : /*
     354             :  * pg_fsync --- do fsync with or without writethrough
                     :  *
                     :  * Dispatches to pg_fsync_writethrough() when sync_method is
                     :  * SYNC_METHOD_FSYNC_WRITETHROUGH (that test is only compiled in when
                     :  * HAVE_FSYNC_WRITETHROUGH is defined and FSYNC_WRITETHROUGH_IS_FSYNC is
                     :  * not), and to pg_fsync_no_writethrough() otherwise.  The return value is
                     :  * whatever the chosen helper returns.
                     :  *
                     :  * In non-Windows builds with assertions enabled, the access mode of the
                     :  * descriptor is sanity-checked first and errno is then reset to 0.
     355             :  */
     356             : int
     357      162546 : pg_fsync(int fd)
     358             : {
     359             : #if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
     360             :     struct stat st;
     361             : 
     362             :     /*
     363             :      * Some operating system implementations of fsync() have requirements
     364             :      * about the file access modes that were used when their file descriptor
     365             :      * argument was opened, and these requirements differ depending on whether
     366             :      * the file descriptor is for a directory.
     367             :      *
     368             :      * For any file descriptor that may eventually be handed to fsync(), we
     369             :      * should have opened it with access modes that are compatible with
     370             :      * fsync() on all supported systems, otherwise the code may not be
     371             :      * portable, even if it runs ok on the current system.
     372             :      *
     373             :      * We assert here that a descriptor for a file was opened with write
     374             :      * permissions (either O_RDWR or O_WRONLY) and for a directory without
     375             :      * write permissions (O_RDONLY).
     376             :      *
     377             :      * Ignore any fstat errors and let the follow-up fsync() do its work.
     378             :      * Doing this sanity check here, before enableFsync is consulted by the
     379             :      * helpers called below, also covers the case where fsync() is disabled.
     380             :      */
     381             :     if (fstat(fd, &st) == 0)
     382             :     {
     383             :         int         desc_flags = fcntl(fd, F_GETFL);
     384             : 
     385             :         /*
     386             :          * O_RDONLY is historically 0, so just make sure that for directories
     387             :          * no write flags are used.
                     :          *
                     :          * NOTE(review): the fcntl() result is not checked; a -1 return would
                     :          * have all bits set and trip the directory assertion.  Presumably it
                     :          * cannot fail once fstat() has succeeded on the same fd -- confirm.
     388             :          */
     389             :         if (S_ISDIR(st.st_mode))
     390             :             Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
     391             :         else
     392             :             Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
     393             :     }
     394             :     errno = 0;                  /* forget any errno set by the checks above */
     395             : #endif
     396             : 
     397             :     /* #if is to skip the sync_method test if there's no need for it */
     398             : #if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
     399             :     if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
     400             :         return pg_fsync_writethrough(fd);
     401             :     else
     402             : #endif
     403      162546 :         return pg_fsync_no_writethrough(fd);
     404             : }
     405             : 
     406             : 
     407             : /*
     408             :  * pg_fsync_no_writethrough --- same as fsync except does nothing if
     409             :  *  enableFsync is off
                     :  *
                     :  * Returns the result of fsync(fd) when enableFsync is set.  When it is not
                     :  * set, returns 0 without making any system call on fd.
     410             :  */
     411             : int
     412      162546 : pg_fsync_no_writethrough(int fd)
     413             : {
     414      162546 :     if (enableFsync)
     415        2278 :         return fsync(fd);
     416             :     else
     417      160268 :         return 0;
     418             : }
     419             : 
     420             : /*
     421             :  * pg_fsync_writethrough
                     :  *
                     :  * Flush fd using a platform-specific writethrough call, or do nothing and
                     :  * return 0 if enableFsync is off.  With enableFsync on:
                     :  *  - WIN32: returns the result of _commit(fd);
                     :  *  - else if F_FULLFSYNC is defined: issues fcntl(fd, F_FULLFSYNC, 0) and
                     :  *    maps its result to -1 on failure or 0 otherwise;
                     :  *  - else: no implementation is available, so sets errno to ENOSYS and
                     :  *    returns -1.
     422             :  */
     423             : int
     424           0 : pg_fsync_writethrough(int fd)
     425             : {
     426           0 :     if (enableFsync)
     427             :     {
     428             : #ifdef WIN32
     429             :         return _commit(fd);
     430             : #elif defined(F_FULLFSYNC)
     431             :         return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
     432             : #else
     433           0 :         errno = ENOSYS;
     434           0 :         return -1;
     435             : #endif
     436             :     }
     437             :     else
     438           0 :         return 0;
     439             : }
     440             : 
     441             : /*
     442             :  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
     443             :  *
     444             :  * Not all platforms have fdatasync; treat as fsync if not available.
                     :  *
                     :  * Returns the result of fdatasync(fd) (or of fsync(fd) when HAVE_FDATASYNC
                     :  * is not defined) if enableFsync is set; otherwise returns 0 without making
                     :  * any system call on fd.
     445             :  */
     446             : int
     447          68 : pg_fdatasync(int fd)
     448             : {
     449          68 :     if (enableFsync)
     450             :     {
     451             : #ifdef HAVE_FDATASYNC
     452          68 :         return fdatasync(fd);
     453             : #else
     454             :         return fsync(fd);
     455             : #endif
     456             :     }
     457             :     else
     458           0 :         return 0;
     459             : }
     460             : 
     461             : /*
     462             :  * pg_flush_data --- advise OS that the described dirty data should be flushed
     463             :  *
     464             :  * offset of 0 with nbytes 0 means that the entire file should be flushed
     465             :  */
     466             : void
     467      444518 : pg_flush_data(int fd, off_t offset, off_t nbytes)
     468             : {
     469             :     /*
     470             :      * Right now file flushing is primarily used to make later
     471             :      * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
     472             :      * if fsyncs are disabled - that's a decision we might want to make
     473             :      * configurable at some point.
     474             :      */
     475      444518 :     if (!enableFsync)
     476      442568 :         return;
     477             : 
     478             :     /*
     479             :      * We compile all alternatives that are supported on the current platform,
     480             :      * to find portability problems more easily.
     481             :      */
     482             : #if defined(HAVE_SYNC_FILE_RANGE)
     483             :     {
     484             :         int         rc;
     485             :         static bool not_implemented_by_kernel = false;
     486             : 
     487        1950 :         if (not_implemented_by_kernel)
     488           0 :             return;
     489             : 
     490             :         /*
     491             :          * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
     492             :          * tells the OS that writeback for the specified blocks should be
     493             :          * started, but that we don't want to wait for completion.  Note that
     494             :          * this call might block if too much dirty data exists in the range.
     495             :          * This is the preferable method on OSs supporting it, as it works
     496             :          * reliably when available (contrast to msync()) and doesn't flush out
     497             :          * clean data (like FADV_DONTNEED).
     498             :          */
     499        1950 :         rc = sync_file_range(fd, offset, nbytes,
     500             :                              SYNC_FILE_RANGE_WRITE);
     501        1950 :         if (rc != 0)
     502             :         {
     503             :             int         elevel;
     504             : 
     505             :             /*
     506             :              * For systems that don't have an implementation of
     507             :              * sync_file_range() such as Windows WSL, generate only one
     508             :              * warning and then suppress all further attempts by this process.
     509             :              */
     510           0 :             if (errno == ENOSYS)
     511             :             {
     512           0 :                 elevel = WARNING;
     513           0 :                 not_implemented_by_kernel = true;
     514             :             }
     515             :             else
     516           0 :                 elevel = data_sync_elevel(WARNING);
     517             : 
     518           0 :             ereport(elevel,
     519             :                     (errcode_for_file_access(),
     520             :                      errmsg("could not flush dirty data: %m")));
     521             :         }
     522             : 
     523        1950 :         return;
     524             :     }
     525             : #endif
     526             : #if !defined(WIN32) && defined(MS_ASYNC)
     527             :     {
     528             :         void       *p;
     529             :         static int  pagesize = 0;
     530             : 
     531             :         /*
     532             :          * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
     533             :          * writeback. On linux it only does so if MS_SYNC is specified, but
     534             :          * then it does the writeback synchronously. Luckily all common linux
     535             :          * systems have sync_file_range().  This is preferable over
     536             :          * FADV_DONTNEED because it doesn't flush out clean data.
     537             :          *
     538             :          * We map the file (mmap()), tell the kernel to sync back the contents
     539             :          * (msync()), and then remove the mapping again (munmap()).
     540             :          */
     541             : 
     542             :         /* mmap() needs actual length if we want to map whole file */
     543             :         if (offset == 0 && nbytes == 0)
     544             :         {
     545             :             nbytes = lseek(fd, 0, SEEK_END);
     546             :             if (nbytes < 0)
     547             :             {
     548             :                 ereport(WARNING,
     549             :                         (errcode_for_file_access(),
     550             :                          errmsg("could not determine dirty data size: %m")));
     551             :                 return;
     552             :             }
     553             :         }
     554             : 
     555             :         /*
     556             :          * Some platforms reject partial-page mmap() attempts.  To deal with
     557             :          * that, just truncate the request to a page boundary.  If any extra
     558             :          * bytes don't get flushed, well, it's only a hint anyway.
     559             :          */
     560             : 
     561             :         /* fetch pagesize only once */
     562             :         if (pagesize == 0)
     563             :             pagesize = sysconf(_SC_PAGESIZE);
     564             : 
     565             :         /* align length to pagesize, dropping any fractional page */
     566             :         if (pagesize > 0)
     567             :             nbytes = (nbytes / pagesize) * pagesize;
     568             : 
     569             :         /* fractional-page request is a no-op */
     570             :         if (nbytes <= 0)
     571             :             return;
     572             : 
     573             :         /*
     574             :          * mmap could well fail, particularly on 32-bit platforms where there
     575             :          * may simply not be enough address space.  If so, silently fall
     576             :          * through to the next implementation.
     577             :          */
     578             :         if (nbytes <= (off_t) SSIZE_MAX)
     579             :             p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
     580             :         else
     581             :             p = MAP_FAILED;
     582             : 
     583             :         if (p != MAP_FAILED)
     584             :         {
     585             :             int         rc;
     586             : 
     587             :             rc = msync(p, (size_t) nbytes, MS_ASYNC);
     588             :             if (rc != 0)
     589             :             {
     590             :                 ereport(data_sync_elevel(WARNING),
     591             :                         (errcode_for_file_access(),
     592             :                          errmsg("could not flush dirty data: %m")));
     593             :                 /* NB: need to fall through to munmap()! */
     594             :             }
     595             : 
     596             :             rc = munmap(p, (size_t) nbytes);
     597             :             if (rc != 0)
     598             :             {
     599             :                 /* FATAL error because mapping would remain */
     600             :                 ereport(FATAL,
     601             :                         (errcode_for_file_access(),
     602             :                          errmsg("could not munmap() while flushing data: %m")));
     603             :             }
     604             : 
     605             :             return;
     606             :         }
     607             :     }
     608             : #endif
     609             : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
     610             :     {
     611             :         int         rc;
     612             : 
     613             :         /*
     614             :          * Signal the kernel that the passed in range should not be cached
     615             :          * anymore. This has the, desired, side effect of writing out dirty
     616             :          * data, and the, undesired, side effect of likely discarding useful
     617             :          * clean cached blocks.  For the latter reason this is the least
     618             :          * preferable method.
     619             :          */
     620             : 
     621             :         rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
     622             : 
     623             :         if (rc != 0)
     624             :         {
     625             :             /* don't error out, this is just a performance optimization */
     626             :             ereport(WARNING,
     627             :                     (errcode_for_file_access(),
     628             :                      errmsg("could not flush dirty data: %m")));
     629             :         }
     630             : 
     631             :         return;
     632             :     }
     633             : #endif
     634             : }
     635             : 
     636             : /*
     637             :  * Truncate a file to a given length by name.
                     :  *
                     :  * On WIN32 the file is opened with OpenTransientFile() and truncated with
                     :  * ftruncate(); elsewhere this is a direct call to truncate().  Both
                     :  * branches truncate to "length".
                     :  *
                     :  * Returns the result of the truncation call, or -1 if the WIN32 open
                     :  * fails.  On WIN32, errno as left by ftruncate() is restored after the
                     :  * descriptor has been closed.
     638             :  */
     639             : int
     640      211434 : pg_truncate(const char *path, off_t length)
     641             : {
     642             : #ifdef WIN32
     643             :     int         save_errno;
     644             :     int         ret;
     645             :     int         fd;
     646             : 
     647             :     fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
     648             :     if (fd >= 0)
     649             :     {
                     :         /* honor the requested length, as the non-WIN32 branch does */
     650             :         ret = ftruncate(fd, length);
                     :         /* keep ftruncate()'s errno across the close below */
     651             :         save_errno = errno;
     652             :         CloseTransientFile(fd);
     653             :         errno = save_errno;
     654             :     }
     655             :     else
     656             :         ret = -1;
     657             : 
     658             :     return ret;
     659             : #else
     660      211434 :     return truncate(path, length);
     661             : #endif
     662             : }
     663             : 
     664             : /*
     665             :  * fsync_fname -- fsync a file or directory, handling errors properly
     666             :  *
     667             :  * Try to fsync a file or directory. When doing the latter, ignore errors that
     668             :  * indicate the OS just doesn't allow/require fsyncing directories.
                     :  *
                     :  * This is a thin wrapper: it forwards fname and isdir to fsync_fname_ext(),
                     :  * passing false as the third argument and data_sync_elevel(ERROR) as the
                     :  * error level, and discards fsync_fname_ext()'s return value.
     669             :  */
     670             : void
     671       25840 : fsync_fname(const char *fname, bool isdir)
     672             : {
     673       25840 :     fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
     674       25840 : }
     675             : 
     676             : /*
     677             :  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
     678             :  *
     679             :  * This routine ensures that, after returning, the effect of renaming file
     680             :  * persists in case of a crash. A crash while this routine is running will
     681             :  * leave you with either the pre-existing or the moved file in place of the
     682             :  * new file; no mixed state or truncated files are possible.
     683             :  *
     684             :  * It does so by using fsync on the old filename and the possibly existing
     685             :  * target filename before the rename, and the target file and directory after.
     686             :  *
     687             :  * Note that rename() cannot be used across arbitrary directories, as they
     688             :  * might not be on the same filesystem. Therefore this routine does not
     689             :  * support renaming across directories.
     690             :  *
                     :  * A target file that does not exist yet (opening it fails with ENOENT) is
                     :  * not an error; the pre-rename fsync of the target is simply skipped.  Any
                     :  * other failure is reported and ends the operation at that step.
                     :  *
     691             :  * Log errors with the caller specified severity.
     692             :  *
     693             :  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
     694             :  * valid upon return.
     695             :  */
     696             : int
     697        4148 : durable_rename(const char *oldfile, const char *newfile, int elevel)
     698             : {
     699             :     int         fd;
     700             : 
     701             :     /*
     702             :      * First fsync the old and target path (if it exists), to ensure that they
     703             :      * are properly persistent on disk. Syncing the target file is not
     704             :      * strictly necessary, but it makes it easier to reason about crashes;
     705             :      * because it's then guaranteed that either source or target file exists
     706             :      * after a crash.
     707             :      */
     708        4148 :     if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
     709           0 :         return -1;
     710             : 
     711        4148 :     fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
     712        4148 :     if (fd < 0)
     713             :     {
     714         600 :         if (errno != ENOENT)
     715             :         {
     716           0 :             ereport(elevel,
     717             :                     (errcode_for_file_access(),
     718             :                      errmsg("could not open file \"%s\": %m", newfile)));
     719           0 :             return -1;
     720             :         }
     721             :     }
     722             :     else
     723             :     {
     724        3548 :         if (pg_fsync(fd) != 0)
     725             :         {
     726             :             int         save_errno;
     727             : 
     728             :             /* close file upon error, might not be in transaction context */
     729           0 :             save_errno = errno;
     730           0 :             CloseTransientFile(fd);
     731           0 :             errno = save_errno;
     732             : 
     733           0 :             ereport(elevel,
     734             :                     (errcode_for_file_access(),
     735             :                      errmsg("could not fsync file \"%s\": %m", newfile)));
     736           0 :             return -1;
     737             :         }
     738             : 
     739        3548 :         if (CloseTransientFile(fd) != 0)
     740             :         {
     741           0 :             ereport(elevel,
     742             :                     (errcode_for_file_access(),
     743             :                      errmsg("could not close file \"%s\": %m", newfile)));
     744           0 :             return -1;
     745             :         }
     746             :     }
     747             : 
     748             :     /* Time to do the real deal... */
     749        4148 :     if (rename(oldfile, newfile) < 0)
     750             :     {
     751           0 :         ereport(elevel,
     752             :                 (errcode_for_file_access(),
     753             :                  errmsg("could not rename file \"%s\" to \"%s\": %m",
     754             :                         oldfile, newfile)));
     755           0 :         return -1;
     756             :     }
     757             : 
     758             :     /*
     759             :      * To guarantee renaming the file is persistent, fsync the file with its
     760             :      * new name, and its containing directory.
     761             :      */
     762        4148 :     if (fsync_fname_ext(newfile, false, false, elevel) != 0)
     763           0 :         return -1;
     764             : 
     765        4148 :     if (fsync_parent_path(newfile, elevel) != 0)
     766           0 :         return -1;
     767             : 
     768        4148 :     return 0;
     769             : }
     770             : 
     771             : /*
     772             :  * durable_unlink -- remove a file in a durable manner
     773             :  *
     774             :  * This routine ensures that, after returning, the effect of removing file
     775             :  * persists in case of a crash. A crash while this routine is running will
     776             :  * not leave the system in a mixed state.
     777             :  *
     778             :  * It does so by using fsync on the parent directory of the file after the
     779             :  * actual removal is done.
     780             :  *
                     :  * If unlink() itself fails, the failure is reported and -1 is returned
                     :  * without attempting the parent-directory fsync.
                     :  *
     781             :  * Log errors with the severity specified by caller.
     782             :  *
     783             :  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
     784             :  * valid upon return.
     785             :  */
     786             : int
     787         188 : durable_unlink(const char *fname, int elevel)
     788             : {
     789         188 :     if (unlink(fname) < 0)
     790             :     {
     791          52 :         ereport(elevel,
     792             :                 (errcode_for_file_access(),
     793             :                  errmsg("could not remove file \"%s\": %m",
     794             :                         fname)));
     795          52 :         return -1;
     796             :     }
     797             : 
     798             :     /*
     799             :      * To guarantee that the removal of the file is persistent, fsync its
     800             :      * parent directory.
     801             :      */
     802         136 :     if (fsync_parent_path(fname, elevel) != 0)
     803           0 :         return -1;
     804             : 
     805         136 :     return 0;
     806             : }
     807             : 
     808             : /*
     809             :  * durable_rename_excl -- rename a file in a durable manner.
     810             :  *
     811             :  * Similar to durable_rename(), except that this routine tries (but does not
     812             :  * guarantee) not to overwrite the target file.
     813             :  *
                     :  * When HAVE_WORKING_LINK is defined, the move is done as link(oldfile,
                     :  * newfile) followed by unlink(oldfile); the result of that unlink() is not
                     :  * checked.  Otherwise a plain rename() is used.
                     :  *
     814             :  * Note that a crash in an unfortunate moment can leave you with two links to
     815             :  * the target file.
     816             :  *
     817             :  * Log errors with the caller specified severity.
     818             :  *
     819             :  * On Windows, using a hard link followed by unlink() causes concurrency
     820             :  * issues, while a simple rename() does not cause that, so be careful when
     821             :  * changing the logic of this routine.
     822             :  *
     823             :  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
     824             :  * valid upon return.
     825             :  */
     826             : int
     827        1838 : durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
     828             : {
     829             :     /*
     830             :      * Ensure that, if we crash directly after the rename/link, a file with
     831             :      * valid contents is moved into place.
     832             :      */
     833        1838 :     if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
     834           0 :         return -1;
     835             : 
     836             : #ifdef HAVE_WORKING_LINK
     837        1838 :     if (link(oldfile, newfile) < 0)
     838             :     {
     839           0 :         ereport(elevel,
     840             :                 (errcode_for_file_access(),
     841             :                  errmsg("could not link file \"%s\" to \"%s\": %m",
     842             :                         oldfile, newfile)));
     843           0 :         return -1;
     844             :     }
     845        1838 :     unlink(oldfile);
     846             : #else
     847             :     if (rename(oldfile, newfile) < 0)
     848             :     {
     849             :         ereport(elevel,
     850             :                 (errcode_for_file_access(),
     851             :                  errmsg("could not rename file \"%s\" to \"%s\": %m",
     852             :                         oldfile, newfile)));
     853             :         return -1;
     854             :     }
     855             : #endif
     856             : 
     857             :     /*
     858             :      * Make change persistent in case of an OS crash, both the new entry and
     859             :      * its parent directory need to be flushed.
     860             :      */
     861        1838 :     if (fsync_fname_ext(newfile, false, false, elevel) != 0)
     862           0 :         return -1;
     863             : 
     864             :     /* Same for parent directory */
     865        1838 :     if (fsync_parent_path(newfile, elevel) != 0)
     866           0 :         return -1;
     867             : 
     868        1838 :     return 0;
     869             : }
     870             : 
     871             : /*
     872             :  * InitFileAccess --- initialize this module during backend startup
     873             :  *
     874             :  * This is called during either normal or standalone backend start.
     875             :  * It is *not* called in the postmaster.
     876             :  *
                     :  * Allocates a one-element VfdCache array with malloc(), zeroes that
                     :  * element, marks its fd as VFD_CLOSED and sets SizeVfdCache to 1.  An
                     :  * allocation failure is reported as FATAL.
                     :  *
     877             :  * Note that this does not initialize temporary file access, that is
     878             :  * separately initialized via InitTemporaryFileAccess().
     879             :  */
     880             : void
     881       17278 : InitFileAccess(void)
     882             : {
     883             :     Assert(SizeVfdCache == 0);  /* call me only once */
     884             : 
     885             :     /* initialize cache header entry */
     886       17278 :     VfdCache = (Vfd *) malloc(sizeof(Vfd));
     887       17278 :     if (VfdCache == NULL)
     888           0 :         ereport(FATAL,
     889             :                 (errcode(ERRCODE_OUT_OF_MEMORY),
     890             :                  errmsg("out of memory")));
     891             : 
     892      138224 :     MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
     893       17278 :     VfdCache->fd = VFD_CLOSED;
     894             : 
     895       17278 :     SizeVfdCache = 1;
     896       17278 : }
     897             : 
     898             : /*
     899             :  * InitTemporaryFileAccess --- initialize temporary file access during startup
     900             :  *
     901             :  * This is called during either normal or standalone backend start.
     902             :  * It is *not* called in the postmaster.
     903             :  *
     904             :  * This is separate from InitFileAccess() because temporary file cleanup can
     905             :  * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
     906             :  * our reporting has to happen before that. Low level file access should be
     907             :  * available for longer, hence the separate initialization / shutdown of
     908             :  * temporary file handling.
                     :  *
                     :  * The only unconditional action is registering BeforeShmemExit_Files as a
                     :  * before_shmem_exit callback; the temporary_files_allowed flag is set only
                     :  * in builds with USE_ASSERT_CHECKING.
     909             :  */
     910             : void
     911       17278 : InitTemporaryFileAccess(void)
     912             : {
     913             :     Assert(SizeVfdCache != 0);  /* InitFileAccess() needs to have run */
     914             :     Assert(!temporary_files_allowed);   /* call me only once */
     915             : 
     916             :     /*
     917             :      * Register before-shmem-exit hook to ensure temp files are dropped while
     918             :      * we can still report stats.
     919             :      */
     920       17278 :     before_shmem_exit(BeforeShmemExit_Files, 0);
     921             : 
     922             : #ifdef USE_ASSERT_CHECKING
     923             :     temporary_files_allowed = true;
     924             : #endif
     925       17278 : }
     926             : 
     927             : /*
     928             :  * count_usable_fds --- count how many FDs the system will let us open,
     929             :  *      and estimate how many are already open.
     930             :  *
     931             :  * We stop counting if usable_fds reaches max_to_probe.  Note: a small
     932             :  * value of max_to_probe might result in an underestimate of already_open;
     933             :  * we must fill in any "gaps" in the set of used FDs before the calculation
     934             :  * of already_open will give the right answer.  In practice, max_to_probe
     935             :  * of a couple of dozen should be enough to ensure good results.
     936             :  *
     937             :  * We assume stderr (FD 2) is available for dup'ing.  While the calling
     938             :  * script could theoretically close that, it would be a really bad idea,
     939             :  * since then one risks loss of error messages from, e.g., libc.
                     :  *
                     :  * max_to_probe: stop after this many successful dup() calls.
                     :  * *usable_fds: receives the number of successful dup() calls.
                     :  * *already_open: receives highestfd + 1 minus that number.
                     :  *
                     :  * Probing also stops at the first dup() failure and, where getrlimit()
                     :  * succeeded, once the highest descriptor seen reaches rlim_cur - 1.  All
                     :  * descriptors obtained while probing are closed again before returning.
     940             :  */
     941             : static void
     942         902 : count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
     943             : {
     944             :     int        *fd;
     945             :     int         size;
     946         902 :     int         used = 0;
     947         902 :     int         highestfd = 0;
     948             :     int         j;
     949             : 
     950             : #ifdef HAVE_GETRLIMIT
     951             :     struct rlimit rlim;
     952             :     int         getrlimit_status;
     953             : #endif
     954             : 
     955         902 :     size = 1024;
     956         902 :     fd = (int *) palloc(size * sizeof(int));
     957             : 
     958             : #ifdef HAVE_GETRLIMIT
     959             : #ifdef RLIMIT_NOFILE            /* most platforms use RLIMIT_NOFILE */
     960         902 :     getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
     961             : #else                           /* but BSD doesn't ... */
     962             :     getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
     963             : #endif                          /* RLIMIT_NOFILE */
     964         902 :     if (getrlimit_status != 0)
     965           0 :         ereport(WARNING, (errmsg("getrlimit failed: %m")));
     966             : #endif                          /* HAVE_GETRLIMIT */
     967             : 
     968             :     /* dup until failure or probe limit reached */
     969             :     for (;;)
     970      901098 :     {
     971             :         int         thisfd;
     972             : 
     973             : #ifdef HAVE_GETRLIMIT
     974             : 
     975             :         /*
     976             :          * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
     977             :          * some platforms
     978             :          */
     979      902000 :         if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
     980           0 :             break;
     981             : #endif
     982             : 
     983      902000 :         thisfd = dup(2);
     984      902000 :         if (thisfd < 0)
     985             :         {
     986             :             /* Expect EMFILE or ENFILE, else it's fishy */
     987           0 :             if (errno != EMFILE && errno != ENFILE)
     988           0 :                 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
     989           0 :             break;
     990             :         }
     991             : 
     992      902000 :         if (used >= size)
     993             :         {
     994           0 :             size *= 2;
     995           0 :             fd = (int *) repalloc(fd, size * sizeof(int));
     996             :         }
     997      902000 :         fd[used++] = thisfd;
     998             : 
     999      902000 :         if (highestfd < thisfd)
    1000      902000 :             highestfd = thisfd;
    1001             : 
    1002      902000 :         if (used >= max_to_probe)
    1003         902 :             break;
    1004             :     }
    1005             : 
    1006             :     /* release the files we opened */
    1007      902902 :     for (j = 0; j < used; j++)
    1008      902000 :         close(fd[j]);
    1009             : 
    1010         902 :     pfree(fd);
    1011             : 
    1012             :     /*
    1013             :      * Return results.  usable_fds is just the number of successful dups. We
    1014             :      * assume that the system limit is highestfd+1 (remember 0 is a legal FD
    1015             :      * number) and so already_open is highestfd+1 - usable_fds.
    1016             :      */
    1017         902 :     *usable_fds = used;
    1018         902 :     *already_open = highestfd + 1 - used;
    1019         902 : }
    1020             : 
    1021             : /*
    1022             :  * set_max_safe_fds
    1023             :  *      Determine number of file descriptors that fd.c is allowed to use
                     :  *
                     :  * Probes with count_usable_fds() (at most max_files_per_process dups) and
                     :  * stores the outcome, less NUM_RESERVED_FDS, in max_safe_fds.  Reports
                     :  * FATAL if that leaves fewer than FD_MINFREE descriptors; otherwise the
                     :  * computed figures are logged at DEBUG2.
    1024             :  */
    1025             : void
    1026         902 : set_max_safe_fds(void)
    1027             : {
    1028             :     int         usable_fds;
    1029             :     int         already_open;
    1030             : 
    1031             :     /*----------
    1032             :      * We want to set max_safe_fds to
    1033             :      *          MIN(usable_fds, max_files_per_process - already_open)
    1034             :      * less the slop factor for files that are opened without consulting
    1035             :      * fd.c.  This ensures that we won't exceed either max_files_per_process
    1036             :      * or the experimentally-determined EMFILE limit.
    1037             :      *----------
    1038             :      */
    1039         902 :     count_usable_fds(max_files_per_process,
    1040             :                      &usable_fds, &already_open);
    1041             : 
    1042         902 :     max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
    1043             : 
    1044             :     /*
    1045             :      * Take off the FDs reserved for system() etc.
    1046             :      */
    1047         902 :     max_safe_fds -= NUM_RESERVED_FDS;
    1048             : 
    1049             :     /*
    1050             :      * Make sure we still have enough to get by.
    1051             :      */
    1052         902 :     if (max_safe_fds < FD_MINFREE)
    1053           0 :         ereport(FATAL,
    1054             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    1055             :                  errmsg("insufficient file descriptors available to start server process"),
    1056             :                  errdetail("System allows %d, we need at least %d.",
    1057             :                            max_safe_fds + NUM_RESERVED_FDS,
    1058             :                            FD_MINFREE + NUM_RESERVED_FDS)));
    1059             : 
    1060         902 :     elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
    1061             :          max_safe_fds, usable_fds, already_open);
    1062         902 : }
    1063             : 
    1064             : /*
    1065             :  * Open a file with BasicOpenFilePerm() and pass default file mode for the
    1066             :  * fileMode parameter.
                     :  *
                     :  * The mode passed is pg_file_create_mode; the return value is whatever
                     :  * BasicOpenFilePerm() returns for fileName and fileFlags.
    1067             :  */
    1068             : int
    1069       56654 : BasicOpenFile(const char *fileName, int fileFlags)
    1070             : {
    1071       56654 :     return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
    1072             : }
    1073             : 
    1074             : /*
    1075             :  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
    1076             :  *
    1077             :  * This is exported for use by places that really want a plain kernel FD,
    1078             :  * but need to be proof against running out of FDs.  Once an FD has been
    1079             :  * successfully returned, it is the caller's responsibility to ensure that
    1080             :  * it will not be leaked on ereport()!  Most users should *not* call this
    1081             :  * routine directly, but instead use the VFD abstraction level, which
    1082             :  * provides protection against descriptor leaks as well as management of
    1083             :  * files that need to be open for more than a short period of time.
    1084             :  *
    1085             :  * Ideally this should be the *only* direct call of open() in the backend.
    1086             :  * In practice, the postmaster calls open() directly, and there are some
    1087             :  * direct open() calls done early in backend startup.  Those are OK since
    1088             :  * this module wouldn't have any open files to close at that point anyway.
                     :  *
                     :  * Returns the new descriptor, or -1 on failure.  If open() fails with
                     :  * EMFILE or ENFILE, a LOG message is emitted and ReleaseLruFile() is
                     :  * called; if that returns true the open is retried, otherwise the errno
                     :  * from open() is restored before returning -1.
                     :  *
                     :  * When PG_O_DIRECT_USE_F_NOCACHE is defined, PG_O_DIRECT is masked out of
                     :  * the flags given to open() and requested afterwards via
                     :  * fcntl(fd, F_NOCACHE, 1); if that fcntl() fails, the descriptor is closed
                     :  * and -1 is returned with fcntl()'s errno preserved.
    1089             :  */
    1090             : int
    1091     3066346 : BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
    1092             : {
    1093             :     int         fd;
    1094             : 
    1095     3066346 : tryAgain:
    1096             : #ifdef PG_O_DIRECT_USE_F_NOCACHE
    1097             : 
    1098             :     /*
    1099             :      * The value we defined to stand in for O_DIRECT when simulating it with
    1100             :      * F_NOCACHE had better not collide with any of the standard flags.
    1101             :      */
    1102             :     StaticAssertStmt((PG_O_DIRECT &
    1103             :                       (O_APPEND |
    1104             :                        O_CREAT |
    1105             :                        O_EXCL |
    1106             :                        O_RDWR |
    1107             :                        O_RDONLY |
    1108             :                        O_SYNC |
    1109             :                        O_TRUNC |
    1110             :                        O_WRONLY)) == 0,
    1111             :                      "PG_O_DIRECT value collides with standard flag");
    1112             : #if defined(O_CLOEXEC)
    1113             :     StaticAssertStmt((PG_O_DIRECT & O_CLOEXEC) == 0,
    1114             :                      "PG_O_DIRECT value collides with O_CLOEXEC");
    1115             : #endif
    1116             : #if defined(O_DSYNC)
    1117             :     StaticAssertStmt((PG_O_DIRECT & O_DSYNC) == 0,
    1118             :                      "PG_O_DIRECT value collides with O_DSYNC");
    1119             : #endif
    1120             : 
    1121             :     fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
    1122             : #else
    1123     3066346 :     fd = open(fileName, fileFlags, fileMode);
    1124             : #endif
    1125             : 
    1126     3066346 :     if (fd >= 0)
    1127             :     {
    1128             : #ifdef PG_O_DIRECT_USE_F_NOCACHE
    1129             :         if (fileFlags & PG_O_DIRECT)
    1130             :         {
    1131             :             if (fcntl(fd, F_NOCACHE, 1) < 0)
    1132             :             {
    1133             :                 int         save_errno = errno;
    1134             : 
    1135             :                 close(fd);
    1136             :                 errno = save_errno;
    1137             :                 return -1;
    1138             :             }
    1139             :         }
    1140             : #endif
    1141             : 
    1142     2663488 :         return fd;              /* success! */
    1143             :     }
    1144             : 
    1145      402858 :     if (errno == EMFILE || errno == ENFILE)
    1146             :     {
    1147           0 :         int         save_errno = errno;
    1148             : 
    1149           0 :         ereport(LOG,
    1150             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    1151             :                  errmsg("out of file descriptors: %m; release and retry")));
    1152           0 :         errno = 0;
    1153           0 :         if (ReleaseLruFile())
    1154           0 :             goto tryAgain;
    1155           0 :         errno = save_errno;
    1156             :     }
    1157             : 
    1158      402858 :     return -1;                  /* failure */
    1159             : }
    1160             : 
    1161             : /*
    1162             :  * AcquireExternalFD - attempt to reserve an external file descriptor
    1163             :  *
    1164             :  * This should be used by callers that need to hold a file descriptor open
    1165             :  * over more than a short interval, but cannot use any of the other facilities
    1166             :  * provided by this module.
    1167             :  *
    1168             :  * The difference between this and the underlying ReserveExternalFD function
    1169             :  * is that this will report failure (by setting errno and returning false)
    1170             :  * if "too many" external FDs are already reserved.  This should be used in
    1171             :  * any code where the total number of FDs to be reserved is not predictable
    1172             :  * and small.
    1173             :  */
    1174             : bool
    1175      132126 : AcquireExternalFD(void)
    1176             : {
    1177             :     /*
    1178             :      * We don't want more than max_safe_fds / 3 FDs to be consumed for
    1179             :      * "external" FDs.
    1180             :      */
    1181      132126 :     if (numExternalFDs < max_safe_fds / 3)
    1182             :     {
    1183      132126 :         ReserveExternalFD();
    1184      132126 :         return true;
    1185             :     }
    1186           0 :     errno = EMFILE;
    1187           0 :     return false;
    1188             : }
    1189             : 
    1190             : /*
    1191             :  * ReserveExternalFD - report external consumption of a file descriptor
    1192             :  *
    1193             :  * This should be used by callers that need to hold a file descriptor open
    1194             :  * over more than a short interval, but cannot use any of the other facilities
    1195             :  * provided by this module.  This just tracks the use of the FD and closes
    1196             :  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
    1197             :  *
    1198             :  * Call this directly only in code where failure to reserve the FD would be
    1199             :  * fatal; for example, the WAL-writing code does so, since the alternative is
    1200             :  * session failure.  Also, it's very unwise to do so in code that could
    1201             :  * consume more than one FD per process.
    1202             :  *
    1203             :  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
    1204             :  * available, it doesn't matter too much whether this is called before or
    1205             :  * after actually opening the FD; but doing so beforehand reduces the risk of
    1206             :  * an EMFILE failure if not everybody played nice.  In any case, it's solely
    1207             :  * caller's responsibility to keep the external-FD count in sync with reality.
    1208             :  */
    1209             : void
    1210      181830 : ReserveExternalFD(void)
    1211             : {
    1212             :     /*
    1213             :      * Release VFDs if needed to stay safe.  Because we do this before
    1214             :      * incrementing numExternalFDs, the final state will be as desired, i.e.,
    1215             :      * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
    1216             :      */
    1217      181830 :     ReleaseLruFiles();
    1218             : 
    1219      181830 :     numExternalFDs++;
    1220      181830 : }
    1221             : 
    1222             : /*
    1223             :  * ReleaseExternalFD - report release of an external file descriptor
    1224             :  *
    1225             :  * This is guaranteed not to change errno, so it can be used in failure paths.
    1226             :  */
    1227             : void
    1228      128734 : ReleaseExternalFD(void)
    1229             : {
    1230             :     Assert(numExternalFDs > 0);
    1231      128734 :     numExternalFDs--;
    1232      128734 : }
    1233             : 
    1234             : 
    1235             : #if defined(FDDEBUG)
    1236             : 
    1237             : static void
    1238             : _dump_lru(void)
    1239             : {
    1240             :     int         mru = VfdCache[0].lruLessRecently;
    1241             :     Vfd        *vfdP = &VfdCache[mru];
    1242             :     char        buf[2048];
    1243             : 
    1244             :     snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
    1245             :     while (mru != 0)
    1246             :     {
    1247             :         mru = vfdP->lruLessRecently;
    1248             :         vfdP = &VfdCache[mru];
    1249             :         snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
    1250             :     }
    1251             :     snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
    1252             :     elog(LOG, "%s", buf);
    1253             : }
    1254             : #endif                          /* FDDEBUG */
    1255             : 
    1256             : static void
    1257     1915520 : Delete(File file)
    1258             : {
    1259             :     Vfd        *vfdP;
    1260             : 
    1261             :     Assert(file != 0);
    1262             : 
    1263             :     DO_DB(elog(LOG, "Delete %d (%s)",
    1264             :                file, VfdCache[file].fileName));
    1265             :     DO_DB(_dump_lru());
    1266             : 
    1267     1915520 :     vfdP = &VfdCache[file];
    1268             : 
    1269     1915520 :     VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
    1270     1915520 :     VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
    1271             : 
    1272             :     DO_DB(_dump_lru());
    1273     1915520 : }
    1274             : 
    1275             : static void
    1276      375410 : LruDelete(File file)
    1277             : {
    1278             :     Vfd        *vfdP;
    1279             : 
    1280             :     Assert(file != 0);
    1281             : 
    1282             :     DO_DB(elog(LOG, "LruDelete %d (%s)",
    1283             :                file, VfdCache[file].fileName));
    1284             : 
    1285      375410 :     vfdP = &VfdCache[file];
    1286             : 
    1287             :     /*
    1288             :      * Close the file.  We aren't expecting this to fail; if it does, better
    1289             :      * to leak the FD than to mess up our internal state.
    1290             :      */
    1291      375410 :     if (close(vfdP->fd) != 0)
    1292           0 :         elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
    1293             :              "could not close file \"%s\": %m", vfdP->fileName);
    1294      375410 :     vfdP->fd = VFD_CLOSED;
    1295      375410 :     --nfile;
    1296             : 
    1297             :     /* delete the vfd record from the LRU ring */
    1298      375410 :     Delete(file);
    1299      375410 : }
    1300             : 
    1301             : static void
    1302     2178412 : Insert(File file)
    1303             : {
    1304             :     Vfd        *vfdP;
    1305             : 
    1306             :     Assert(file != 0);
    1307             : 
    1308             :     DO_DB(elog(LOG, "Insert %d (%s)",
    1309             :                file, VfdCache[file].fileName));
    1310             :     DO_DB(_dump_lru());
    1311             : 
    1312     2178412 :     vfdP = &VfdCache[file];
    1313             : 
    1314     2178412 :     vfdP->lruMoreRecently = 0;
    1315     2178412 :     vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
    1316     2178412 :     VfdCache[0].lruLessRecently = file;
    1317     2178412 :     VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
    1318             : 
    1319             :     DO_DB(_dump_lru());
    1320     2178412 : }
    1321             : 
    1322             : /* returns 0 on success, -1 on re-open failure (with errno set) */
    1323             : static int
    1324      150230 : LruInsert(File file)
    1325             : {
    1326             :     Vfd        *vfdP;
    1327             : 
    1328             :     Assert(file != 0);
    1329             : 
    1330             :     DO_DB(elog(LOG, "LruInsert %d (%s)",
    1331             :                file, VfdCache[file].fileName));
    1332             : 
    1333      150230 :     vfdP = &VfdCache[file];
    1334             : 
    1335      150230 :     if (FileIsNotOpen(file))
    1336             :     {
    1337             :         /* Close excess kernel FDs. */
    1338      150230 :         ReleaseLruFiles();
    1339             : 
    1340             :         /*
    1341             :          * The open could still fail for lack of file descriptors, eg due to
    1342             :          * overall system file table being full.  So, be prepared to release
    1343             :          * another FD if necessary...
    1344             :          */
    1345      150230 :         vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
    1346             :                                      vfdP->fileMode);
    1347      150230 :         if (vfdP->fd < 0)
    1348             :         {
    1349             :             DO_DB(elog(LOG, "re-open failed: %m"));
    1350           0 :             return -1;
    1351             :         }
    1352             :         else
    1353             :         {
    1354      150230 :             ++nfile;
    1355             :         }
    1356             :     }
    1357             : 
    1358             :     /*
    1359             :      * put it at the head of the Lru ring
    1360             :      */
    1361             : 
    1362      150230 :     Insert(file);
    1363             : 
    1364      150230 :     return 0;
    1365             : }
    1366             : 
    1367             : /*
    1368             :  * Release one kernel FD by closing the least-recently-used VFD.
    1369             :  */
    1370             : static bool
    1371      375372 : ReleaseLruFile(void)
    1372             : {
    1373             :     DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
    1374             : 
    1375      375372 :     if (nfile > 0)
    1376             :     {
    1377             :         /*
    1378             :          * There are opened files and so there should be at least one used vfd
    1379             :          * in the ring.
    1380             :          */
    1381             :         Assert(VfdCache[0].lruMoreRecently != 0);
    1382      375372 :         LruDelete(VfdCache[0].lruMoreRecently);
    1383      375372 :         return true;            /* freed a file */
    1384             :     }
    1385           0 :     return false;               /* no files available to free */
    1386             : }
    1387             : 
    1388             : /*
    1389             :  * Release kernel FDs as needed to get under the max_safe_fds limit.
    1390             :  * After calling this, it's OK to try to open another file.
    1391             :  */
    1392             : static void
    1393     3363710 : ReleaseLruFiles(void)
    1394             : {
    1395     3739082 :     while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
    1396             :     {
    1397      375372 :         if (!ReleaseLruFile())
    1398           0 :             break;
    1399             :     }
    1400     3363710 : }
    1401             : 
    1402             : static File
    1403     1896282 : AllocateVfd(void)
    1404             : {
    1405             :     Index       i;
    1406             :     File        file;
    1407             : 
    1408             :     DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
    1409             : 
    1410             :     Assert(SizeVfdCache > 0);    /* InitFileAccess not called? */
    1411             : 
    1412     1896282 :     if (VfdCache[0].nextFree == 0)
    1413             :     {
    1414             :         /*
    1415             :          * The free list is empty so it is time to increase the size of the
    1416             :          * array.  We choose to double it each time this happens. However,
    1417             :          * there's not much point in starting *real* small.
    1418             :          */
    1419       21492 :         Size        newCacheSize = SizeVfdCache * 2;
    1420             :         Vfd        *newVfdCache;
    1421             : 
    1422       21492 :         if (newCacheSize < 32)
    1423       15134 :             newCacheSize = 32;
    1424             : 
    1425             :         /*
    1426             :          * Be careful not to clobber VfdCache ptr if realloc fails.
    1427             :          */
    1428       21492 :         newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
    1429       21492 :         if (newVfdCache == NULL)
    1430           0 :             ereport(ERROR,
    1431             :                     (errcode(ERRCODE_OUT_OF_MEMORY),
    1432             :                      errmsg("out of memory")));
    1433       21492 :         VfdCache = newVfdCache;
    1434             : 
    1435             :         /*
    1436             :          * Initialize the new entries and link them into the free list.
    1437             :          */
    1438      985494 :         for (i = SizeVfdCache; i < newCacheSize; i++)
    1439             :         {
    1440     7712016 :             MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
    1441      964002 :             VfdCache[i].nextFree = i + 1;
    1442      964002 :             VfdCache[i].fd = VFD_CLOSED;
    1443             :         }
    1444       21492 :         VfdCache[newCacheSize - 1].nextFree = 0;
    1445       21492 :         VfdCache[0].nextFree = SizeVfdCache;
    1446             : 
    1447             :         /*
    1448             :          * Record the new size
    1449             :          */
    1450       21492 :         SizeVfdCache = newCacheSize;
    1451             :     }
    1452             : 
    1453     1896282 :     file = VfdCache[0].nextFree;
    1454             : 
    1455     1896282 :     VfdCache[0].nextFree = VfdCache[file].nextFree;
    1456             : 
    1457     1896282 :     return file;
    1458             : }
    1459             : 
    1460             : static void
    1461     1545628 : FreeVfd(File file)
    1462             : {
    1463     1545628 :     Vfd        *vfdP = &VfdCache[file];
    1464             : 
    1465             :     DO_DB(elog(LOG, "FreeVfd: %d (%s)",
    1466             :                file, vfdP->fileName ? vfdP->fileName : ""));
    1467             : 
    1468     1545628 :     if (vfdP->fileName != NULL)
    1469             :     {
    1470     1145016 :         free(vfdP->fileName);
    1471     1145016 :         vfdP->fileName = NULL;
    1472             :     }
    1473     1545628 :     vfdP->fdstate = 0x0;
    1474             : 
    1475     1545628 :     vfdP->nextFree = VfdCache[0].nextFree;
    1476     1545628 :     VfdCache[0].nextFree = file;
    1477     1545628 : }
    1478             : 
    1479             : /* returns 0 on success, -1 on re-open failure (with errno set) */
    1480             : static int
    1481     3145412 : FileAccess(File file)
    1482             : {
    1483             :     int         returnValue;
    1484             : 
    1485             :     DO_DB(elog(LOG, "FileAccess %d (%s)",
    1486             :                file, VfdCache[file].fileName));
    1487             : 
    1488             :     /*
    1489             :      * Is the file open?  If not, open it and put it at the head of the LRU
    1490             :      * ring (possibly closing the least recently used file to get an FD).
    1491             :      */
    1492             : 
    1493     3145412 :     if (FileIsNotOpen(file))
    1494             :     {
    1495      150230 :         returnValue = LruInsert(file);
    1496      150230 :         if (returnValue != 0)
    1497           0 :             return returnValue;
    1498             :     }
    1499     2995182 :     else if (VfdCache[0].lruLessRecently != file)
    1500             :     {
    1501             :         /*
    1502             :          * We now know that the file is open and that it is not the last one
    1503             :          * accessed, so we need to move it to the head of the Lru ring.
    1504             :          */
    1505             : 
    1506      532512 :         Delete(file);
    1507      532512 :         Insert(file);
    1508             :     }
    1509             : 
    1510     3145412 :     return 0;
    1511             : }
    1512             : 
    1513             : /*
    1514             :  * Called whenever a temporary file is deleted to report its size.
    1515             :  */
    1516             : static void
    1517        3196 : ReportTemporaryFileUsage(const char *path, off_t size)
    1518             : {
    1519        3196 :     pgstat_report_tempfile(size);
    1520             : 
    1521        3196 :     if (log_temp_files >= 0)
    1522             :     {
    1523        1438 :         if ((size / 1024) >= log_temp_files)
    1524         250 :             ereport(LOG,
    1525             :                     (errmsg("temporary file: path \"%s\", size %lu",
    1526             :                             path, (unsigned long) size)));
    1527             :     }
    1528        3196 : }
    1529             : 
    1530             : /*
    1531             :  * Called to register a temporary file for automatic close.
    1532             :  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
    1533             :  * before the file was opened.
    1534             :  */
    1535             : static void
    1536        5272 : RegisterTemporaryFile(File file)
    1537             : {
    1538        5272 :     ResourceOwnerRememberFile(CurrentResourceOwner, file);
    1539        5272 :     VfdCache[file].resowner = CurrentResourceOwner;
    1540             : 
    1541             :     /* Backup mechanism for closing at end of xact. */
    1542        5272 :     VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
    1543        5272 :     have_xact_temporary_files = true;
    1544        5272 : }
    1545             : 
    1546             : /*
    1547             :  *  Called when we get a shared invalidation message on some relation.
    1548             :  */
    1549             : #ifdef NOT_USED
    1550             : void
    1551             : FileInvalidate(File file)
    1552             : {
    1553             :     Assert(FileIsValid(file));
    1554             :     if (!FileIsNotOpen(file))
    1555             :         LruDelete(file);
    1556             : }
    1557             : #endif
    1558             : 
    1559             : /*
    1560             :  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
    1561             :  * fileMode parameter.
    1562             :  */
    1563             : File
    1564     1896282 : PathNameOpenFile(const char *fileName, int fileFlags)
    1565             : {
    1566     1896282 :     return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
    1567             : }
    1568             : 
    1569             : /*
    1570             :  * open a file in an arbitrary directory
    1571             :  *
    1572             :  * NB: if the passed pathname is relative (which it usually is),
    1573             :  * it will be interpreted relative to the process' working directory
    1574             :  * (which should always be $PGDATA when this code is running).
    1575             :  */
    1576             : File
    1577     1896282 : PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
    1578             : {
    1579             :     char       *fnamecopy;
    1580             :     File        file;
    1581             :     Vfd        *vfdP;
    1582             : 
    1583             :     DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
    1584             :                fileName, fileFlags, fileMode));
    1585             : 
    1586             :     /*
    1587             :      * We need a malloc'd copy of the file name; fail cleanly if no room.
    1588             :      */
    1589     1896282 :     fnamecopy = strdup(fileName);
    1590     1896282 :     if (fnamecopy == NULL)
    1591           0 :         ereport(ERROR,
    1592             :                 (errcode(ERRCODE_OUT_OF_MEMORY),
    1593             :                  errmsg("out of memory")));
    1594             : 
    1595     1896282 :     file = AllocateVfd();
    1596     1896282 :     vfdP = &VfdCache[file];
    1597             : 
    1598             :     /* Close excess kernel FDs. */
    1599     1896282 :     ReleaseLruFiles();
    1600             : 
    1601     1896282 :     vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
    1602             : 
    1603     1896282 :     if (vfdP->fd < 0)
    1604             :     {
    1605      400612 :         int         save_errno = errno;
    1606             : 
    1607      400612 :         FreeVfd(file);
    1608      400612 :         free(fnamecopy);
    1609      400612 :         errno = save_errno;
    1610      400612 :         return -1;
    1611             :     }
    1612     1495670 :     ++nfile;
    1613             :     DO_DB(elog(LOG, "PathNameOpenFile: success %d",
    1614             :                vfdP->fd));
    1615             : 
    1616     1495670 :     vfdP->fileName = fnamecopy;
    1617             :     /* Saved flags are adjusted to be OK for re-opening file */
    1618     1495670 :     vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
    1619     1495670 :     vfdP->fileMode = fileMode;
    1620     1495670 :     vfdP->fileSize = 0;
    1621     1495670 :     vfdP->fdstate = 0x0;
    1622     1495670 :     vfdP->resowner = NULL;
    1623             : 
    1624     1495670 :     Insert(file);
    1625             : 
    1626     1495670 :     return file;
    1627             : }
    1628             : 
    1629             : /*
    1630             :  * Create directory 'directory'.  If necessary, create 'basedir', which must
    1631             :  * be the directory above it.  This is designed for creating the top-level
    1632             :  * temporary directory on demand before creating a directory underneath it.
    1633             :  * Do nothing if the directory already exists.
    1634             :  *
    1635             :  * Directories created within the top-level temporary directory should begin
    1636             :  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
    1637             :  * deleted at startup by RemovePgTempFiles().  Further subdirectories below
    1638             :  * that do not need any particular prefix.
    1639             : */
    1640             : void
    1641         218 : PathNameCreateTemporaryDir(const char *basedir, const char *directory)
    1642             : {
    1643         218 :     if (MakePGDirectory(directory) < 0)
    1644             :     {
    1645          20 :         if (errno == EEXIST)
    1646           4 :             return;
    1647             : 
    1648             :         /*
    1649             :          * Failed.  Try to create basedir first in case it's missing. Tolerate
    1650             :          * EEXIST to close a race against another process following the same
    1651             :          * algorithm.
    1652             :          */
    1653          16 :         if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
    1654           0 :             ereport(ERROR,
    1655             :                     (errcode_for_file_access(),
    1656             :                      errmsg("cannot create temporary directory \"%s\": %m",
    1657             :                             basedir)));
    1658             : 
    1659             :         /* Try again. */
    1660          16 :         if (MakePGDirectory(directory) < 0 && errno != EEXIST)
    1661           0 :             ereport(ERROR,
    1662             :                     (errcode_for_file_access(),
    1663             :                      errmsg("cannot create temporary subdirectory \"%s\": %m",
    1664             :                             directory)));
    1665             :     }
    1666             : }
    1667             : 
    1668             : /*
    1669             :  * Delete a directory and everything in it, if it exists.
    1670             :  */
    1671             : void
    1672         254 : PathNameDeleteTemporaryDir(const char *dirname)
    1673             : {
    1674             :     struct stat statbuf;
    1675             : 
    1676             :     /* Silently ignore missing directory. */
    1677         254 :     if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
    1678          40 :         return;
    1679             : 
    1680             :     /*
    1681             :      * Currently, walkdir doesn't offer a way for our passed in function to
    1682             :      * maintain state.  Perhaps it should, so that we could tell the caller
    1683             :      * whether this operation succeeded or failed.  Since this operation is
    1684             :      * used in a cleanup path, we wouldn't actually behave differently: we'll
    1685             :      * just log failures.
    1686             :      */
    1687         214 :     walkdir(dirname, unlink_if_exists_fname, false, LOG);
    1688             : }
    1689             : 
    1690             : /*
    1691             :  * Open a temporary file that will disappear when we close it.
    1692             :  *
    1693             :  * This routine takes care of generating an appropriate tempfile name.
    1694             :  * There's no need to pass in fileFlags or fileMode either, since only
    1695             :  * one setting makes any sense for a temp file.
    1696             :  *
    1697             :  * Unless interXact is true, the file is remembered by CurrentResourceOwner
    1698             :  * to ensure it's closed and deleted when it's no longer needed, typically at
    1699             :  * the end-of-transaction. In most cases, you don't want temporary files to
    1700             :  * outlive the transaction that created them, so this should be false -- but
    1701             :  * if you need "somewhat" temporary storage, this might be useful. In either
    1702             :  * case, the file is removed when the File is explicitly closed.
    1703             :  */
    1704             : File
    1705        1866 : OpenTemporaryFile(bool interXact)
    1706             : {
    1707        1866 :     File        file = 0;
    1708             : 
    1709             :     Assert(temporary_files_allowed);    /* check temp file access is up */
    1710             : 
    1711             :     /*
    1712             :      * Make sure the current resource owner has space for this File before we
    1713             :      * open it, if we'll be registering it below.
    1714             :      */
    1715        1866 :     if (!interXact)
    1716        1858 :         ResourceOwnerEnlargeFiles(CurrentResourceOwner);
    1717             : 
    1718             :     /*
    1719             :      * If some temp tablespace(s) have been given to us, try to use the next
    1720             :      * one.  If a given tablespace can't be found, we silently fall back to
    1721             :      * the database's default tablespace.
    1722             :      *
    1723             :      * BUT: if the temp file is slated to outlive the current transaction,
    1724             :      * force it into the database's default tablespace, so that it will not
    1725             :      * pose a threat to possible tablespace drop attempts.
    1726             :      */
    1727        1866 :     if (numTempTableSpaces > 0 && !interXact)
    1728             :     {
    1729           0 :         Oid         tblspcOid = GetNextTempTableSpace();
    1730             : 
    1731           0 :         if (OidIsValid(tblspcOid))
    1732           0 :             file = OpenTemporaryFileInTablespace(tblspcOid, false);
    1733             :     }
    1734             : 
    1735             :     /*
    1736             :      * If not, or if tablespace is bad, create in database's default
    1737             :      * tablespace.  MyDatabaseTableSpace should normally be set before we get
    1738             :      * here, but just in case it isn't, fall back to pg_default tablespace.
    1739             :      */
    1740        1866 :     if (file <= 0)
    1741        1866 :         file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
    1742             :                                              MyDatabaseTableSpace :
    1743             :                                              DEFAULTTABLESPACE_OID,
    1744             :                                              true);
    1745             : 
    1746             :     /* Mark it for deletion at close and temporary file size limit */
    1747        1866 :     VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
    1748             : 
    1749             :     /* Register it with the current resource owner */
    1750        1866 :     if (!interXact)
    1751        1858 :         RegisterTemporaryFile(file);
    1752             : 
    1753        1866 :     return file;
    1754             : }
    1755             : 
    1756             : /*
    1757             :  * Return the path of the temp directory in a given tablespace.
    1758             :  */
    1759             : void
    1760       10566 : TempTablespacePath(char *path, Oid tablespace)
    1761             : {
    1762             :     /*
    1763             :      * Identify the tempfile directory for this tablespace.
    1764             :      *
    1765             :      * If someone tries to specify pg_global, use pg_default instead.
    1766             :      */
    1767       10566 :     if (tablespace == InvalidOid ||
    1768           0 :         tablespace == DEFAULTTABLESPACE_OID ||
    1769             :         tablespace == GLOBALTABLESPACE_OID)
    1770       10566 :         snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
    1771             :     else
    1772             :     {
    1773             :         /* All other tablespaces are accessed via symlinks */
    1774           0 :         snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
    1775             :                  tablespace, TABLESPACE_VERSION_DIRECTORY,
    1776             :                  PG_TEMP_FILES_DIR);
    1777             :     }
    1778       10566 : }
    1779             : 
    1780             : /*
    1781             :  * Open a temporary file in a specific tablespace.  Subroutine for
    1782             :  * OpenTemporaryFile, which see for details.  If creation still fails after
    1783             :  * making the temp directory: ERROR if rejectError, else return file <= 0. */
    1784             : static File
    1785        1866 : OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
    1786             : {
    1787             :     char        tempdirpath[MAXPGPATH];
    1788             :     char        tempfilepath[MAXPGPATH];
    1789             :     File        file;
    1790             : 
    1791        1866 :     TempTablespacePath(tempdirpath, tblspcOid);
    1792             : 
    1793             :     /*
    1794             :      * Generate a tempfile name that should be unique within the current
    1795             :      * database instance: PG_TEMP_FILE_PREFIX, our PID, and a counter.
    1796             :      */
    1797        1866 :     snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
    1798             :              tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
    1799             : 
    1800             :     /*
    1801             :      * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
    1802             :      * temp file that can be reused.
    1803             :      */
    1804        1866 :     file = PathNameOpenFile(tempfilepath,
    1805             :                             O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
    1806        1866 :     if (file <= 0)
    1807             :     {
    1808             :         /*
    1809             :          * We might need to create the tablespace's tempfile directory, if no
    1810             :          * one has yet done so.
    1811             :          *
    1812             :          * Don't check for an error from MakePGDirectory; it could fail if
    1813             :          * someone else just did the same thing.  If it doesn't work then
    1814             :          * we'll bomb out on the second create attempt, instead.
    1815             :          */
    1816         102 :         (void) MakePGDirectory(tempdirpath);
    1817             : 
    1818         102 :         file = PathNameOpenFile(tempfilepath,
    1819             :                                 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
    1820         102 :         if (file <= 0 && rejectError)
    1821           0 :             elog(ERROR, "could not create temporary file \"%s\": %m",
    1822             :                  tempfilepath);  /* NOTE(review): plain elog, no errcode_for_file_access() as PathNameCreateTemporaryFile uses */
    1823             :     }
    1824             : 
    1825        1866 :     return file;
    1826             : }
    1827             : 
    1828             : 
    1829             : /*
    1830             :  * Create a new file.  The directory containing it must already exist.  Files
    1831             :  * created this way are subject to temp_file_limit and are automatically
    1832             :  * closed at end of transaction, but are not automatically deleted on close
    1833             :  * because they are intended to be shared between cooperating backends.
    1834             :  *
    1835             :  * If the file is inside the top-level temporary directory, its name should
    1836             :  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
    1837             :  * and deleted at startup by RemovePgTempFiles().  Alternatively, it can be
    1838             :  * inside a directory created with PathNameCreateTemporaryDir(), in which case
    1839             :  * the prefix isn't needed.
    1840             :  * On open failure: ERROR if error_on_failure, else return the file <= 0. */
    1841             : File
    1842        1548 : PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
    1843             : {
    1844             :     File        file;
    1845             : 
    1846             :     Assert(temporary_files_allowed);    /* check temp file access is up */
    1847             : 
    1848        1548 :     ResourceOwnerEnlargeFiles(CurrentResourceOwner);
    1849             : 
    1850             :     /*
    1851             :      * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
    1852             :      * temp file that can be reused.
    1853             :      */
    1854        1548 :     file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
    1855        1548 :     if (file <= 0)
    1856             :     {
    1857         218 :         if (error_on_failure)
    1858           0 :             ereport(ERROR,
    1859             :                     (errcode_for_file_access(),
    1860             :                      errmsg("could not create temporary file \"%s\": %m",
    1861             :                             path)));
    1862             :         else
    1863         218 :             return file;
    1864             :     }
    1865             : 
    1866             :     /* Mark it for temp_file_limit accounting (no FD_DELETE_AT_CLOSE here). */
    1867        1330 :     VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
    1868             : 
    1869             :     /* Register it for automatic close. */
    1870        1330 :     RegisterTemporaryFile(file);
    1871             : 
    1872        1330 :     return file;
    1873             : }
    1874             : 
    1875             : /*
    1876             :  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
    1877             :  * another backend.  Files opened this way don't count against the
    1878             :  * temp_file_limit of the caller, are automatically closed at the end of the
    1879             :  * transaction but are not deleted on close.  'mode' gives the open flags
    1880             :  * (PG_BINARY is added); a missing file returns file <= 0 without error. */
    1881             : File
    1882        4686 : PathNameOpenTemporaryFile(const char *path, int mode)
    1883             : {
    1884             :     File        file;
    1885             : 
    1886             :     Assert(temporary_files_allowed);    /* check temp file access is up */
    1887             : 
    1888        4686 :     ResourceOwnerEnlargeFiles(CurrentResourceOwner);
    1889             : 
    1890        4686 :     file = PathNameOpenFile(path, mode | PG_BINARY);
    1891             : 
    1892             :     /* Only ENOENT is tolerated; any other open failure raises ERROR. */
    1893        4686 :     if (file <= 0 && errno != ENOENT)
    1894           0 :         ereport(ERROR,
    1895             :                 (errcode_for_file_access(),
    1896             :                  errmsg("could not open temporary file \"%s\": %m",
    1897             :                         path)));
    1898             : 
    1899        4686 :     if (file > 0)
    1900             :     {
    1901             :         /* Register it for automatic close. */
    1902        2084 :         RegisterTemporaryFile(file);
    1903             :     }
    1904             : 
    1905        4686 :     return file;
    1906             : }
    1907             : 
    1908             : /*
    1909             :  * Delete a file by pathname.  Return true if the file was unlinked, false if
    1910             :  * it didn't exist or unlink failed (a failure other than ENOENT is reported
    1911             :  * at ERROR if error_on_failure, else at LOG). */
    1912             : bool
    1913        3268 : PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
    1914             : {
    1915             :     struct stat filestats;
    1916             :     int         stat_errno;
    1917             : 
    1918             :     /* Get the final size for pgstat reporting. */
    1919        3268 :     if (stat(path, &filestats) != 0)
    1920        1938 :         stat_errno = errno;
    1921             :     else
    1922        1330 :         stat_errno = 0;
    1923             : 
    1924             :     /*
    1925             :      * Unlike FileClose's automatic file deletion code, we tolerate
    1926             :      * non-existence to support BufFileDeleteFileSet which doesn't know how
    1927             :      * many segments it has to delete until it runs out.
    1928             :      */
    1929        3268 :     if (stat_errno == ENOENT)
    1930        1938 :         return false;
    1931             : 
    1932        1330 :     if (unlink(path) < 0)
    1933             :     {
    1934           0 :         if (errno != ENOENT)
    1935           0 :             ereport(error_on_failure ? ERROR : LOG,
    1936             :                     (errcode_for_file_access(),
    1937             :                      errmsg("could not unlink temporary file \"%s\": %m",
    1938             :                             path)));
    1939           0 :         return false;
    1940             :     }
    1941             : 
    1942        1330 :     if (stat_errno == 0)
    1943        1330 :         ReportTemporaryFileUsage(path, filestats.st_size);
    1944             :     else
    1945             :     {
    1946           0 :         errno = stat_errno;      /* restore so %m reports the stat() failure */
    1947           0 :         ereport(LOG,
    1948             :                 (errcode_for_file_access(),
    1949             :                  errmsg("could not stat file \"%s\": %m", path)));
    1950             :     }
    1951             : 
    1952        1330 :     return true;
    1953             : }
    1954             : 
    1955             : /*
    1956             :  * close a file when done with it: closes the kernel FD if open, unlinks
    1957             :  * FD_DELETE_AT_CLOSE files, forgets the resowner link, frees the VFD. */
    1958             : void
    1959     1145016 : FileClose(File file)
    1960             : {
    1961             :     Vfd        *vfdP;
    1962             : 
    1963             :     Assert(FileIsValid(file));
    1964             : 
    1965             :     DO_DB(elog(LOG, "FileClose: %d (%s)",
    1966             :                file, VfdCache[file].fileName));
    1967             : 
    1968     1145016 :     vfdP = &VfdCache[file];
    1969             : 
    1970     1145016 :     if (!FileIsNotOpen(file))
    1971             :     {
    1972             :         /* close the file */
    1973     1007598 :         if (close(vfdP->fd) != 0)
    1974             :         {
    1975             :             /*
    1976             :              * We may need to panic on failure to close non-temporary files;
    1977             :              * see LruDelete.
    1978             :              */
    1979           0 :             elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
    1980             :                  "could not close file \"%s\": %m", vfdP->fileName);
    1981             :         }
    1982             : 
    1983     1007598 :         --nfile;
    1984     1007598 :         vfdP->fd = VFD_CLOSED;
    1985             : 
    1986             :         /* remove the file from the lru ring */
    1987     1007598 :         Delete(file);
    1988             :     }
    1989             : 
    1990     1145016 :     if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
    1991             :     {
    1992             :         /* Subtract its size from current usage (do first in case of error) */
    1993        3196 :         temporary_files_size -= vfdP->fileSize;
    1994        3196 :         vfdP->fileSize = 0;
    1995             :     }
    1996             : 
    1997             :     /*
    1998             :      * Delete the file if it was temporary, and make a log entry if wanted
    1999             :      */
    2000     1145016 :     if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
    2001             :     {
    2002             :         struct stat filestats;
    2003             :         int         stat_errno;
    2004             : 
    2005             :         /*
    2006             :          * If we get an error, as could happen within the ereport/elog calls,
    2007             :          * we'll come right back here during transaction abort.  Reset the
    2008             :          * flag to ensure that we can't get into an infinite loop.  This code
    2009             :          * is arranged to ensure that the worst-case consequence is failing to
    2010             :          * emit log message(s), not failing to attempt the unlink.
    2011             :          */
    2012        1866 :         vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
    2013             : 
    2014             : 
    2015             :         /* first try the stat(), remembering its errno for the report below */
    2016        1866 :         if (stat(vfdP->fileName, &filestats))
    2017           0 :             stat_errno = errno;
    2018             :         else
    2019        1866 :             stat_errno = 0;
    2020             : 
    2021             :         /* in any case do the unlink */
    2022        1866 :         if (unlink(vfdP->fileName))
    2023           0 :             ereport(LOG,
    2024             :                     (errcode_for_file_access(),
    2025             :                      errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
    2026             : 
    2027             :         /* and last report the stat results */
    2028        1866 :         if (stat_errno == 0)
    2029        1866 :             ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
    2030             :         else
    2031             :         {
    2032           0 :             errno = stat_errno;
    2033           0 :             ereport(LOG,
    2034             :                     (errcode_for_file_access(),
    2035             :                      errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
    2036             :         }
    2037             :     }
    2038             : 
    2039             :     /* Unregister it from the resource owner */
    2040     1145016 :     if (vfdP->resowner)
    2041        5272 :         ResourceOwnerForgetFile(vfdP->resowner, file);
    2042             : 
    2043             :     /*
    2044             :      * Return the Vfd slot to the free list
    2045             :      */
    2046     1145016 :     FreeVfd(file);
    2047     1145016 : }
    2048             : 
    2049             : /*
    2050             :  * FilePrefetch - initiate asynchronous read of a given range of the file.
    2051             :  *
    2052             :  * Currently the only implementation of this function is using posix_fadvise
    2053             :  * which is the simplest standardized interface that accomplishes this.
    2054             :  * We could add an implementation using libaio in the future; but note that
    2055             :  * this API is inappropriate for libaio, which wants to have a buffer provided
    2056             :  * to read into.  Returns posix_fadvise's result, or FileAccess's negative
    2057             :  * result; builds lacking POSIX_FADV_WILLNEED support just return 0. */
    2058             : int
    2059       10384 : FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
    2060             : {
    2061             : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
    2062             :     int         returnCode;
    2063             : 
    2064             :     Assert(FileIsValid(file));
    2065             : 
    2066             :     DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
    2067             :                file, VfdCache[file].fileName,
    2068             :                (int64) offset, amount));
    2069             : 
    2070       10384 :     returnCode = FileAccess(file);
    2071       10384 :     if (returnCode < 0)
    2072           0 :         return returnCode;
    2073             : 
    2074       10384 :     pgstat_report_wait_start(wait_event_info);
    2075       10384 :     returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
    2076             :                                POSIX_FADV_WILLNEED);
    2077       10384 :     pgstat_report_wait_end();
    2078             : 
    2079       10384 :     return returnCode;
    2080             : #else
    2081             :     Assert(FileIsValid(file));
    2082             :     return 0;
    2083             : #endif
    2084             : }
    2085             : /* FileWriteback - run pg_flush_data() over a byte range of the file; does nothing if nbytes <= 0 or FileAccess() fails */
    2086             : void
    2087      151506 : FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
    2088             : {
    2089             :     int         returnCode;
    2090             : 
    2091             :     Assert(FileIsValid(file));
    2092             : 
    2093             :     DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
    2094             :                file, VfdCache[file].fileName,
    2095             :                (int64) offset, (int64) nbytes));
    2096             : 
    2097      151506 :     if (nbytes <= 0)
    2098           0 :         return;
    2099             : 
    2100      151506 :     returnCode = FileAccess(file);
    2101      151506 :     if (returnCode < 0)
    2102           0 :         return;                 /* failure is not reported to the caller */
    2103             : 
    2104      151506 :     pgstat_report_wait_start(wait_event_info);
    2105      151506 :     pg_flush_data(VfdCache[file].fd, offset, nbytes);
    2106      151506 :     pgstat_report_wait_end();
    2107             : }
    2108             : /* FileRead - pg_pread() 'amount' bytes at 'offset' into buffer, retrying on EINTR; returns its result, or FileAccess()'s negative result */
    2109             : int
    2110     1378500 : FileRead(File file, char *buffer, int amount, off_t offset,
    2111             :          uint32 wait_event_info)
    2112             : {
    2113             :     int         returnCode;
    2114             :     Vfd        *vfdP;
    2115             : 
    2116             :     Assert(FileIsValid(file));
    2117             : 
    2118             :     DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
    2119             :                file, VfdCache[file].fileName,
    2120             :                (int64) offset,
    2121             :                amount, buffer));
    2122             : 
    2123     1378500 :     returnCode = FileAccess(file);
    2124     1378500 :     if (returnCode < 0)
    2125           0 :         return returnCode;
    2126             : 
    2127     1378500 :     vfdP = &VfdCache[file];
    2128             : 
    2129     1378500 : retry:
    2130     1378500 :     pgstat_report_wait_start(wait_event_info);
    2131     1378500 :     returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
    2132     1378500 :     pgstat_report_wait_end();
    2133             : 
    2134     1378500 :     if (returnCode < 0)
    2135             :     {
    2136             :         /*
    2137             :          * Windows may run out of kernel buffers and return "Insufficient
    2138             :          * system resources" error.  Wait a bit and retry to solve it.
    2139             :          *
    2140             :          * It is rumored that EINTR is also possible on some Unix filesystems,
    2141             :          * in which case immediate retry is indicated.
    2142             :          */
    2143             : #ifdef WIN32
    2144             :         DWORD       error = GetLastError();
    2145             : 
    2146             :         switch (error)
    2147             :         {
    2148             :             case ERROR_NO_SYSTEM_RESOURCES:
    2149             :                 pg_usleep(1000L);
    2150             :                 errno = EINTR;  /* forces the retry below */
    2151             :                 break;
    2152             :             default:
    2153             :                 _dosmaperr(error);
    2154             :                 break;
    2155             :         }
    2156             : #endif
    2157             :         /* OK to retry if interrupted */
    2158           0 :         if (errno == EINTR)
    2159           0 :             goto retry;
    2160             :     }
    2161             : 
    2162     1378500 :     return returnCode;
    2163             : }
    2164             : /* FileWrite - pg_pwrite() 'amount' bytes at 'offset', retrying on EINTR; returns its result, or FileAccess()'s negative result */
    2165             : int
    2166     1402910 : FileWrite(File file, char *buffer, int amount, off_t offset,
    2167             :           uint32 wait_event_info)
    2168             : {
    2169             :     int         returnCode;
    2170             :     Vfd        *vfdP;
    2171             : 
    2172             :     Assert(FileIsValid(file));
    2173             : 
    2174             :     DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
    2175             :                file, VfdCache[file].fileName,
    2176             :                (int64) offset,
    2177             :                amount, buffer));
    2178             : 
    2179     1402910 :     returnCode = FileAccess(file);
    2180     1402910 :     if (returnCode < 0)
    2181           0 :         return returnCode;
    2182             : 
    2183     1402910 :     vfdP = &VfdCache[file];
    2184             : 
    2185             :     /*
    2186             :      * If enforcing temp_file_limit and it's a temp file, check to see if the
    2187             :      * write would overrun temp_file_limit, and throw error if so.  Note: it's
    2188             :      * really a modularity violation to throw error here; we should set errno
    2189             :      * and return -1.  However, there's no way to report a suitable error
    2190             :      * message if we do that.  All current callers would just throw error
    2191             :      * immediately anyway, so this is safe at present.
    2192             :      */
    2193     1402910 :     if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
    2194             :     {
    2195           0 :         off_t       past_write = offset + amount;
    2196             : 
    2197           0 :         if (past_write > vfdP->fileSize)
    2198             :         {
    2199           0 :             uint64      newTotal = temporary_files_size;
    2200             : 
    2201           0 :             newTotal += past_write - vfdP->fileSize;
    2202           0 :             if (newTotal > (uint64) temp_file_limit * (uint64) 1024)    /* limit is compared in bytes */
    2203           0 :                 ereport(ERROR,
    2204             :                         (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
    2205             :                          errmsg("temporary file size exceeds temp_file_limit (%dkB)",
    2206             :                                 temp_file_limit)));
    2207             :         }
    2208             :     }
    2209             : 
    2210     1402910 : retry:
    2211     1402910 :     errno = 0;
    2212     1402910 :     pgstat_report_wait_start(wait_event_info);
    2213     1402910 :     returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
    2214     1402910 :     pgstat_report_wait_end();
    2215             : 
    2216             :     /* if a short or failed write didn't set errno, assume no disk space */
    2217     1402910 :     if (returnCode != amount && errno == 0)
    2218           0 :         errno = ENOSPC;
    2219             : 
    2220     1402910 :     if (returnCode >= 0)
    2221             :     {
    2222             :         /*
    2223             :          * Maintain fileSize and temporary_files_size if it's a temp file.
    2224             :          */
    2225     1402910 :         if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
    2226             :         {
    2227       84080 :             off_t       past_write = offset + amount;
    2228             : 
    2229       84080 :             if (past_write > vfdP->fileSize)
    2230             :             {
    2231       59828 :                 temporary_files_size += past_write - vfdP->fileSize;
    2232       59828 :                 vfdP->fileSize = past_write;
    2233             :             }
    2234             :         }
    2235             :     }
    2236             :     else
    2237             :     {
    2238             :         /*
    2239             :          * See comments in FileRead()
    2240             :          */
    2241             : #ifdef WIN32
    2242             :         DWORD       error = GetLastError();
    2243             : 
    2244             :         switch (error)
    2245             :         {
    2246             :             case ERROR_NO_SYSTEM_RESOURCES:
    2247             :                 pg_usleep(1000L);
    2248             :                 errno = EINTR;
    2249             :                 break;
    2250             :             default:
    2251             :                 _dosmaperr(error);
    2252             :                 break;
    2253             :         }
    2254             : #endif
    2255             :         /* OK to retry if interrupted */
    2256           0 :         if (errno == EINTR)
    2257           0 :             goto retry;
    2258             :     }
    2259             : 
    2260     1402910 :     return returnCode;
    2261             : }
    2262             : /* FileSync - pg_fsync() the file's kernel FD; returns its result, or FileAccess()'s negative result */
    2263             : int
    2264       87932 : FileSync(File file, uint32 wait_event_info)
    2265             : {
    2266             :     int         returnCode;
    2267             : 
    2268             :     Assert(FileIsValid(file));
    2269             : 
    2270             :     DO_DB(elog(LOG, "FileSync: %d (%s)",
    2271             :                file, VfdCache[file].fileName));
    2272             : 
    2273       87932 :     returnCode = FileAccess(file);
    2274       87932 :     if (returnCode < 0)
    2275           0 :         return returnCode;
    2276             : 
    2277       87932 :     pgstat_report_wait_start(wait_event_info);
    2278       87932 :     returnCode = pg_fsync(VfdCache[file].fd);
    2279       87932 :     pgstat_report_wait_end();
    2280             : 
    2281       87932 :     return returnCode;
    2282             : }
    2283             : /* FileSize - return lseek(fd, 0, SEEK_END) for the file; (off_t) -1 if a not-open file fails FileAccess() */
    2284             : off_t
    2285     4216114 : FileSize(File file)
    2286             : {
    2287             :     Assert(FileIsValid(file));
    2288             : 
    2289             :     DO_DB(elog(LOG, "FileSize %d (%s)",
    2290             :                file, VfdCache[file].fileName));
    2291             : 
    2292     4216114 :     if (FileIsNotOpen(file))
    2293             :     {
    2294      113600 :         if (FileAccess(file) < 0)
    2295           0 :             return (off_t) -1;
    2296             :     }
    2297             : 
    2298     4216114 :     return lseek(VfdCache[file].fd, 0, SEEK_END);
    2299             : }
    2300             : /* FileTruncate - ftruncate() the file to 'offset' and return its result (or FileAccess()'s negative result) */
    2301             : int
    2302         580 : FileTruncate(File file, off_t offset, uint32 wait_event_info)
    2303             : {
    2304             :     int         returnCode;
    2305             : 
    2306             :     Assert(FileIsValid(file));
    2307             : 
    2308             :     DO_DB(elog(LOG, "FileTruncate %d (%s)",
    2309             :                file, VfdCache[file].fileName));
    2310             : 
    2311         580 :     returnCode = FileAccess(file);
    2312         580 :     if (returnCode < 0)
    2313           0 :         return returnCode;
    2314             : 
    2315         580 :     pgstat_report_wait_start(wait_event_info);
    2316         580 :     returnCode = ftruncate(VfdCache[file].fd, offset);
    2317         580 :     pgstat_report_wait_end();
    2318             : 
    2319         580 :     if (returnCode == 0 && VfdCache[file].fileSize > offset)
    2320             :     {
    2321             :         /* adjust our state for truncation of a temp file: give back the bytes cut off */
    2322             :         Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
    2323           0 :         temporary_files_size -= VfdCache[file].fileSize - offset;
    2324           0 :         VfdCache[file].fileSize = offset;
    2325             :     }
    2326             : 
    2327         580 :     return returnCode;
    2328             : }
    2329             : 
    2330             : /*
    2331             :  * Return the pathname associated with an open file (the VFD's fileName).
    2332             :  *
    2333             :  * The returned string points to an internal buffer, which is valid until
    2334             :  * the file is closed.
    2335             :  */
    2336             : char *
    2337          48 : FilePathName(File file)
    2338             : {
    2339             :     Assert(FileIsValid(file));
    2340             : 
    2341          48 :     return VfdCache[file].fileName;
    2342             : }
    2343             : 
    2344             : /*
    2345             :  * Return the raw file descriptor of an opened file.
    2346             :  *
    2347             :  * The returned file descriptor will be valid until the file is closed, but
    2348             :  * there are a lot of things that can make that happen.  So the caller should
    2349             :  * be careful not to do much of anything else before it finishes using the
    2350             :  * returned file descriptor.
    2351             :  * NOTE(review): no FileAccess() call here; the stored fd is returned as-is. */
    2352             : int
    2353           0 : FileGetRawDesc(File file)
    2354             : {
    2355             :     Assert(FileIsValid(file));
    2356           0 :     return VfdCache[file].fd;
    2357             : }
    2358             : 
    2359             : /*
    2360             :  * FileGetRawFlags - returns the VFD's stored fileFlags (file flags on open(2))
    2361             :  */
    2362             : int
    2363           0 : FileGetRawFlags(File file)
    2364             : {
    2365             :     Assert(FileIsValid(file));
    2366           0 :     return VfdCache[file].fileFlags;
    2367             : }
    2368             : 
    2369             : /*
    2370             :  * FileGetRawMode - returns the VFD's stored fileMode (mode bitmask passed to open(2))
    2371             :  */
    2372             : mode_t
    2373           0 : FileGetRawMode(File file)
    2374             : {
    2375             :     Assert(FileIsValid(file));
    2376           0 :     return VfdCache[file].fileMode;
    2377             : }
    2378             : 
    2379             : /*
    2380             :  * Make room for another allocatedDescs[] array entry if needed and possible.
    2381             :  * Returns true if an array element is available; false if the size cap is
    2382             :  * reached or realloc fails (failure of the initial malloc raises ERROR). */
    2383             : static bool
    2384     1135368 : reserveAllocatedDesc(void)
    2385             : {
    2386             :     AllocateDesc *newDescs;
    2387             :     int         newMax;
    2388             : 
    2389             :     /* Quick out if array already has a free slot. */
    2390     1135368 :     if (numAllocatedDescs < maxAllocatedDescs)
    2391     1132592 :         return true;
    2392             : 
    2393             :     /*
    2394             :      * If the array hasn't yet been created in the current process, initialize
    2395             :      * it with FD_MINFREE / 3 elements.  In many scenarios this is as many as
    2396             :      * we will ever need, anyway.  We don't want to look at max_safe_fds
    2397             :      * immediately because set_max_safe_fds() may not have run yet.
    2398             :      */
    2399        2776 :     if (allocatedDescs == NULL)
    2400             :     {
    2401        2776 :         newMax = FD_MINFREE / 3;
    2402        2776 :         newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
    2403             :         /* Out of memory already?  Treat as fatal error. */
    2404        2776 :         if (newDescs == NULL)
    2405           0 :             ereport(ERROR,
    2406             :                     (errcode(ERRCODE_OUT_OF_MEMORY),
    2407             :                      errmsg("out of memory")));
    2408        2776 :         allocatedDescs = newDescs;
    2409        2776 :         maxAllocatedDescs = newMax;
    2410        2776 :         return true;
    2411             :     }
    2412             : 
    2413             :     /*
    2414             :      * Consider enlarging the array beyond the initial allocation used above.
    2415             :      * By the time this happens, max_safe_fds should be known accurately.
    2416             :      *
    2417             :      * We mustn't let allocated descriptors hog all the available FDs, and in
    2418             :      * practice we'd better leave a reasonable number of FDs for VFD use.  So
    2419             :      * set the maximum to max_safe_fds / 3.  (This should certainly be at
    2420             :      * least as large as the initial size, FD_MINFREE / 3, so we aren't
    2421             :      * tightening the restriction here.)  Recall that "external" FDs are
    2422             :      * allowed to consume another third of max_safe_fds.
    2423             :      */
    2424           0 :     newMax = max_safe_fds / 3;
    2425           0 :     if (newMax > maxAllocatedDescs)
    2426             :     {
    2427           0 :         newDescs = (AllocateDesc *) realloc(allocatedDescs,
    2428             :                                             newMax * sizeof(AllocateDesc));
    2429             :         /* Treat out-of-memory as a non-fatal error; the old array is kept. */
    2430           0 :         if (newDescs == NULL)
    2431           0 :             return false;
    2432           0 :         allocatedDescs = newDescs;
    2433           0 :         maxAllocatedDescs = newMax;
    2434           0 :         return true;
    2435             :     }
    2436             : 
    2437             :     /* Can't enlarge allocatedDescs[] any more. */
    2438           0 :     return false;
    2439             : }
    2440             : 
    2441             : /*
    2442             :  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
    2443             :  * rather than plain fopen().  This lets fd.c deal with freeing FDs if
    2444             :  * necessary to open the file.  When done, call FreeFile rather than fclose.
    2445             :  *
    2446             :  * Note that files that will be open for any significant length of time
    2447             :  * should NOT be handled this way, since they cannot share kernel file
    2448             :  * descriptors with other files; there is grave risk of running out of FDs
    2449             :  * if anyone locks down too many FDs.  Most callers of this routine are
    2450             :  * simply reading a config file that they will read and close immediately.
    2451             :  *
    2452             :  * fd.c will automatically close all files opened with AllocateFile at
    2453             :  * transaction commit or abort; this prevents FD leakage if a routine
    2454             :  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
    2455             :  *
    2456             :  * Ideally this should be the *only* direct call of fopen() in the backend.
    2457             :  */
    2458             : FILE *
    2459      108888 : AllocateFile(const char *name, const char *mode)
    2460             : {
    2461             :     FILE       *file;
    2462             : 
    2463             :     DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
    2464             :                numAllocatedDescs, name));
    2465             : 
    2466             :     /* Can we allocate another non-virtual FD? */
    2467      108888 :     if (!reserveAllocatedDesc())
    2468           0 :         ereport(ERROR,
    2469             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2470             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
    2471             :                         maxAllocatedDescs, name)));
    2472             : 
    2473             :     /* Close excess kernel FDs. */
    2474      108888 :     ReleaseLruFiles();
    2475             : 
    2476      108888 : TryAgain:
    2477      108888 :     if ((file = fopen(name, mode)) != NULL)
    2478             :     {
    2479       97318 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2480             : 
    2481       97318 :         desc->kind = AllocateDescFile;
    2482       97318 :         desc->desc.file = file;
    2483       97318 :         desc->create_subid = GetCurrentSubTransactionId();
    2484       97318 :         numAllocatedDescs++;
    2485       97318 :         return desc->desc.file;
    2486             :     }
    2487             : 
    2488       11570 :     if (errno == EMFILE || errno == ENFILE)
    2489             :     {
    2490           0 :         int         save_errno = errno;
    2491             : 
    2492           0 :         ereport(LOG,
    2493             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2494             :                  errmsg("out of file descriptors: %m; release and retry")));
    2495           0 :         errno = 0;
    2496           0 :         if (ReleaseLruFile())
    2497           0 :             goto TryAgain;
    2498           0 :         errno = save_errno;
    2499             :     }
    2500             : 
    2501       11570 :     return NULL;
    2502             : }
    2503             : 
    2504             : /*
    2505             :  * Open a file with OpenTransientFilePerm() and pass default file mode for
    2506             :  * the fileMode parameter.
    2507             :  */
    2508             : int
    2509      963062 : OpenTransientFile(const char *fileName, int fileFlags)
    2510             : {
    2511      963062 :     return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
    2512             : }
    2513             : 
    2514             : /*
    2515             :  * Like AllocateFile, but returns an unbuffered fd like open(2)
    2516             :  */
    2517             : int
    2518      963066 : OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
    2519             : {
    2520             :     int         fd;
    2521             : 
    2522             :     DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
    2523             :                numAllocatedDescs, fileName));
    2524             : 
    2525             :     /* Can we allocate another non-virtual FD? */
    2526      963066 :     if (!reserveAllocatedDesc())
    2527           0 :         ereport(ERROR,
    2528             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2529             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
    2530             :                         maxAllocatedDescs, fileName)));
    2531             : 
    2532             :     /* Close excess kernel FDs. */
    2533      963066 :     ReleaseLruFiles();
    2534             : 
    2535      963066 :     fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
    2536             : 
    2537      963066 :     if (fd >= 0)
    2538             :     {
    2539      961906 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2540             : 
    2541      961906 :         desc->kind = AllocateDescRawFD;
    2542      961906 :         desc->desc.fd = fd;
    2543      961906 :         desc->create_subid = GetCurrentSubTransactionId();
    2544      961906 :         numAllocatedDescs++;
    2545             : 
    2546      961906 :         return fd;
    2547             :     }
    2548             : 
    2549        1160 :     return -1;                  /* failure */
    2550             : }
    2551             : 
    2552             : /*
    2553             :  * Routines that want to initiate a pipe stream should use OpenPipeStream
    2554             :  * rather than plain popen().  This lets fd.c deal with freeing FDs if
    2555             :  * necessary.  When done, call ClosePipeStream rather than pclose.
    2556             :  *
    2557             :  * This function also ensures that the popen'd program is run with default
    2558             :  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
    2559             :  * uses.  This ensures desirable response to, eg, closing a read pipe early.
    2560             :  */
    2561             : FILE *
    2562         466 : OpenPipeStream(const char *command, const char *mode)
    2563             : {
    2564             :     FILE       *file;
    2565             :     int         save_errno;
    2566             : 
    2567             :     DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
    2568             :                numAllocatedDescs, command));
    2569             : 
    2570             :     /* Can we allocate another non-virtual FD? */
    2571         466 :     if (!reserveAllocatedDesc())
    2572           0 :         ereport(ERROR,
    2573             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2574             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
    2575             :                         maxAllocatedDescs, command)));
    2576             : 
    2577             :     /* Close excess kernel FDs. */
    2578         466 :     ReleaseLruFiles();
    2579             : 
    2580         466 : TryAgain:
    2581         466 :     fflush(stdout);
    2582         466 :     fflush(stderr);
    2583         466 :     pqsignal(SIGPIPE, SIG_DFL);
    2584         466 :     errno = 0;
    2585         466 :     file = popen(command, mode);
    2586         466 :     save_errno = errno;
    2587         466 :     pqsignal(SIGPIPE, SIG_IGN);
    2588         466 :     errno = save_errno;
    2589         466 :     if (file != NULL)
    2590             :     {
    2591         466 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2592             : 
    2593         466 :         desc->kind = AllocateDescPipe;
    2594         466 :         desc->desc.file = file;
    2595         466 :         desc->create_subid = GetCurrentSubTransactionId();
    2596         466 :         numAllocatedDescs++;
    2597         466 :         return desc->desc.file;
    2598             :     }
    2599             : 
    2600           0 :     if (errno == EMFILE || errno == ENFILE)
    2601             :     {
    2602           0 :         ereport(LOG,
    2603             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2604             :                  errmsg("out of file descriptors: %m; release and retry")));
    2605           0 :         if (ReleaseLruFile())
    2606           0 :             goto TryAgain;
    2607           0 :         errno = save_errno;
    2608             :     }
    2609             : 
    2610           0 :     return NULL;
    2611             : }
    2612             : 
    2613             : /*
    2614             :  * Free an AllocateDesc of any type.
    2615             :  *
    2616             :  * The argument *must* point into the allocatedDescs[] array.
    2617             :  */
    2618             : static int
    2619     1121724 : FreeDesc(AllocateDesc *desc)
    2620             : {
    2621             :     int         result;
    2622             : 
    2623             :     /* Close the underlying object */
    2624     1121724 :     switch (desc->kind)
    2625             :     {
    2626       97318 :         case AllocateDescFile:
    2627       97318 :             result = fclose(desc->desc.file);
    2628       97318 :             break;
    2629         466 :         case AllocateDescPipe:
    2630         466 :             result = pclose(desc->desc.file);
    2631         466 :             break;
    2632       62034 :         case AllocateDescDir:
    2633       62034 :             result = closedir(desc->desc.dir);
    2634       62034 :             break;
    2635      961906 :         case AllocateDescRawFD:
    2636      961906 :             result = close(desc->desc.fd);
    2637      961906 :             break;
    2638           0 :         default:
    2639           0 :             elog(ERROR, "AllocateDesc kind not recognized");
    2640             :             result = 0;         /* keep compiler quiet */
    2641             :             break;
    2642             :     }
    2643             : 
    2644             :     /* Compact storage in the allocatedDescs array */
    2645     1121724 :     numAllocatedDescs--;
    2646     1121724 :     *desc = allocatedDescs[numAllocatedDescs];
    2647             : 
    2648     1121724 :     return result;
    2649             : }
    2650             : 
    2651             : /*
    2652             :  * Close a file returned by AllocateFile.
    2653             :  *
    2654             :  * Note we do not check fclose's return value --- it is up to the caller
    2655             :  * to handle close errors.
    2656             :  */
    2657             : int
    2658       97306 : FreeFile(FILE *file)
    2659             : {
    2660             :     int         i;
    2661             : 
    2662             :     DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
    2663             : 
    2664             :     /* Remove file from list of allocated files, if it's present */
    2665       97308 :     for (i = numAllocatedDescs; --i >= 0;)
    2666             :     {
    2667       97308 :         AllocateDesc *desc = &allocatedDescs[i];
    2668             : 
    2669       97308 :         if (desc->kind == AllocateDescFile && desc->desc.file == file)
    2670       97306 :             return FreeDesc(desc);
    2671             :     }
    2672             : 
    2673             :     /* Only get here if someone passes us a file not in allocatedDescs */
    2674           0 :     elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
    2675             : 
    2676           0 :     return fclose(file);
    2677             : }
    2678             : 
    2679             : /*
    2680             :  * Close a file returned by OpenTransientFile.
    2681             :  *
    2682             :  * Note we do not check close's return value --- it is up to the caller
    2683             :  * to handle close errors.
    2684             :  */
    2685             : int
    2686      961900 : CloseTransientFile(int fd)
    2687             : {
    2688             :     int         i;
    2689             : 
    2690             :     DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
    2691             : 
    2692             :     /* Remove fd from list of allocated files, if it's present */
    2693      961900 :     for (i = numAllocatedDescs; --i >= 0;)
    2694             :     {
    2695      961900 :         AllocateDesc *desc = &allocatedDescs[i];
    2696             : 
    2697      961900 :         if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
    2698      961900 :             return FreeDesc(desc);
    2699             :     }
    2700             : 
    2701             :     /* Only get here if someone passes us a file not in allocatedDescs */
    2702           0 :     elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
    2703             : 
    2704           0 :     return close(fd);
    2705             : }
    2706             : 
    2707             : /*
    2708             :  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
    2709             :  * rather than plain opendir().  This lets fd.c deal with freeing FDs if
    2710             :  * necessary to open the directory, and with closing it after an elog.
    2711             :  * When done, call FreeDir rather than closedir.
    2712             :  *
    2713             :  * Returns NULL, with errno set, on failure.  Note that failure detection
    2714             :  * is commonly left to the following call of ReadDir or ReadDirExtended;
    2715             :  * see the comments for ReadDir.
    2716             :  *
    2717             :  * Ideally this should be the *only* direct call of opendir() in the backend.
    2718             :  */
    2719             : DIR *
    2720       62948 : AllocateDir(const char *dirname)
    2721             : {
    2722             :     DIR        *dir;
    2723             : 
    2724             :     DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
    2725             :                numAllocatedDescs, dirname));
    2726             : 
    2727             :     /* Can we allocate another non-virtual FD? */
    2728       62948 :     if (!reserveAllocatedDesc())
    2729           0 :         ereport(ERROR,
    2730             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2731             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
    2732             :                         maxAllocatedDescs, dirname)));
    2733             : 
    2734             :     /* Close excess kernel FDs. */
    2735       62948 :     ReleaseLruFiles();
    2736             : 
    2737       62948 : TryAgain:
    2738       62948 :     if ((dir = opendir(dirname)) != NULL)
    2739             :     {
    2740       62034 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2741             : 
    2742       62034 :         desc->kind = AllocateDescDir;
    2743       62034 :         desc->desc.dir = dir;
    2744       62034 :         desc->create_subid = GetCurrentSubTransactionId();
    2745       62034 :         numAllocatedDescs++;
    2746       62034 :         return desc->desc.dir;
    2747             :     }
    2748             : 
    2749         914 :     if (errno == EMFILE || errno == ENFILE)
    2750             :     {
    2751           0 :         int         save_errno = errno;
    2752             : 
    2753           0 :         ereport(LOG,
    2754             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2755             :                  errmsg("out of file descriptors: %m; release and retry")));
    2756           0 :         errno = 0;
    2757           0 :         if (ReleaseLruFile())
    2758           0 :             goto TryAgain;
    2759           0 :         errno = save_errno;
    2760             :     }
    2761             : 
    2762         914 :     return NULL;
    2763             : }
    2764             : 
    2765             : /*
    2766             :  * Read a directory opened with AllocateDir, ereport'ing any error.
    2767             :  *
    2768             :  * This is easier to use than raw readdir() since it takes care of some
    2769             :  * otherwise rather tedious and error-prone manipulation of errno.  Also,
    2770             :  * if you are happy with a generic error message for AllocateDir failure,
    2771             :  * you can just do
    2772             :  *
    2773             :  *      dir = AllocateDir(path);
    2774             :  *      while ((dirent = ReadDir(dir, path)) != NULL)
    2775             :  *          process dirent;
    2776             :  *      FreeDir(dir);
    2777             :  *
    2778             :  * since a NULL dir parameter is taken as indicating AllocateDir failed.
    2779             :  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
    2780             :  * use this shortcut.)
    2781             :  *
    2782             :  * The pathname passed to AllocateDir must be passed to this routine too,
    2783             :  * but it is only used for error reporting.
    2784             :  */
    2785             : struct dirent *
    2786     1580314 : ReadDir(DIR *dir, const char *dirname)
    2787             : {
    2788     1580314 :     return ReadDirExtended(dir, dirname, ERROR);
    2789             : }
    2790             : 
    2791             : /*
    2792             :  * Alternate version of ReadDir that allows caller to specify the elevel
    2793             :  * for any error report (whether it's reporting an initial failure of
    2794             :  * AllocateDir or a subsequent directory read failure).
    2795             :  *
    2796             :  * If elevel < ERROR, returns NULL after any error.  With the normal coding
    2797             :  * pattern, this will result in falling out of the loop immediately as
    2798             :  * though the directory contained no (more) entries.
    2799             :  */
    2800             : struct dirent *
    2801     2872386 : ReadDirExtended(DIR *dir, const char *dirname, int elevel)
    2802             : {
    2803             :     struct dirent *dent;
    2804             : 
    2805             :     /* Give a generic message for AllocateDir failure, if caller didn't */
    2806     2872386 :     if (dir == NULL)
    2807             :     {
    2808           0 :         ereport(elevel,
    2809             :                 (errcode_for_file_access(),
    2810             :                  errmsg("could not open directory \"%s\": %m",
    2811             :                         dirname)));
    2812           0 :         return NULL;
    2813             :     }
    2814             : 
    2815     2872386 :     errno = 0;
    2816     2872386 :     if ((dent = readdir(dir)) != NULL)
    2817     2821836 :         return dent;
    2818             : 
    2819       50550 :     if (errno)
    2820           0 :         ereport(elevel,
    2821             :                 (errcode_for_file_access(),
    2822             :                  errmsg("could not read directory \"%s\": %m",
    2823             :                         dirname)));
    2824       50550 :     return NULL;
    2825             : }
    2826             : 
    2827             : /*
    2828             :  * Close a directory opened with AllocateDir.
    2829             :  *
    2830             :  * Returns closedir's return value (with errno set if it's not 0).
    2831             :  * Note we do not check the return value --- it is up to the caller
    2832             :  * to handle close errors if wanted.
    2833             :  *
    2834             :  * Does nothing if dir == NULL; we assume that directory open failure was
    2835             :  * already reported if desired.
    2836             :  */
    2837             : int
    2838       62022 : FreeDir(DIR *dir)
    2839             : {
    2840             :     int         i;
    2841             : 
    2842             :     /* Nothing to do if AllocateDir failed */
    2843       62022 :     if (dir == NULL)
    2844           0 :         return 0;
    2845             : 
    2846             :     DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
    2847             : 
    2848             :     /* Remove dir from list of allocated dirs, if it's present */
    2849       62022 :     for (i = numAllocatedDescs; --i >= 0;)
    2850             :     {
    2851       62022 :         AllocateDesc *desc = &allocatedDescs[i];
    2852             : 
    2853       62022 :         if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
    2854       62022 :             return FreeDesc(desc);
    2855             :     }
    2856             : 
    2857             :     /* Only get here if someone passes us a dir not in allocatedDescs */
    2858           0 :     elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
    2859             : 
    2860           0 :     return closedir(dir);
    2861             : }
    2862             : 
    2863             : 
    2864             : /*
    2865             :  * Close a pipe stream returned by OpenPipeStream.
    2866             :  */
    2867             : int
    2868         466 : ClosePipeStream(FILE *file)
    2869             : {
    2870             :     int         i;
    2871             : 
    2872             :     DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
    2873             : 
    2874             :     /* Remove file from list of allocated files, if it's present */
    2875         466 :     for (i = numAllocatedDescs; --i >= 0;)
    2876             :     {
    2877         466 :         AllocateDesc *desc = &allocatedDescs[i];
    2878             : 
    2879         466 :         if (desc->kind == AllocateDescPipe && desc->desc.file == file)
    2880         466 :             return FreeDesc(desc);
    2881             :     }
    2882             : 
    2883             :     /* Only get here if someone passes us a file not in allocatedDescs */
    2884           0 :     elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
    2885             : 
    2886           0 :     return pclose(file);
    2887             : }
    2888             : 
    2889             : /*
    2890             :  * closeAllVfds
    2891             :  *
    2892             :  * Force all VFDs into the physically-closed state, so that the fewest
    2893             :  * possible number of kernel file descriptors are in use.  There is no
    2894             :  * change in the logical state of the VFDs.
    2895             :  */
    2896             : void
    2897          34 : closeAllVfds(void)
    2898             : {
    2899             :     Index       i;
    2900             : 
    2901          34 :     if (SizeVfdCache > 0)
    2902             :     {
    2903             :         Assert(FileIsNotOpen(0));   /* Make sure ring not corrupted */
    2904        1088 :         for (i = 1; i < SizeVfdCache; i++)
    2905             :         {
    2906        1054 :             if (!FileIsNotOpen(i))
    2907          38 :                 LruDelete(i);
    2908             :         }
    2909             :     }
    2910          34 : }
    2911             : 
    2912             : 
    2913             : /*
    2914             :  * SetTempTablespaces
    2915             :  *
    2916             :  * Define a list (actually an array) of OIDs of tablespaces to use for
    2917             :  * temporary files.  This list will be used until end of transaction,
    2918             :  * unless this function is called again before then.  It is caller's
    2919             :  * responsibility that the passed-in array has adequate lifespan (typically
    2920             :  * it'd be allocated in TopTransactionContext).
    2921             :  *
    2922             :  * Some entries of the array may be InvalidOid, indicating that the current
    2923             :  * database's default tablespace should be used.
    2924             :  */
    2925             : void
    2926        5030 : SetTempTablespaces(Oid *tableSpaces, int numSpaces)
    2927             : {
    2928             :     Assert(numSpaces >= 0);
    2929        5030 :     tempTableSpaces = tableSpaces;
    2930        5030 :     numTempTableSpaces = numSpaces;
    2931             : 
    2932             :     /*
    2933             :      * Select a random starting point in the list.  This is to minimize
    2934             :      * conflicts between backends that are most likely sharing the same list
    2935             :      * of temp tablespaces.  Note that if we create multiple temp files in the
    2936             :      * same transaction, we'll advance circularly through the list --- this
    2937             :      * ensures that large temporary sort files are nicely spread across all
    2938             :      * available tablespaces.
    2939             :      */
    2940        5030 :     if (numSpaces > 1)
    2941           0 :         nextTempTableSpace = random() % numSpaces;
    2942             :     else
    2943        5030 :         nextTempTableSpace = 0;
    2944        5030 : }
    2945             : 
    2946             : /*
    2947             :  * TempTablespacesAreSet
    2948             :  *
    2949             :  * Returns true if SetTempTablespaces has been called in current transaction.
    2950             :  * (This is just so that tablespaces.c doesn't need its own per-transaction
    2951             :  * state.)
    2952             :  */
    2953             : bool
    2954        5100 : TempTablespacesAreSet(void)
    2955             : {
    2956        5100 :     return (numTempTableSpaces >= 0);
    2957             : }
    2958             : 
    2959             : /*
    2960             :  * GetTempTablespaces
    2961             :  *
    2962             :  * Populate an array with the OIDs of the tablespaces that should be used for
    2963             :  * temporary files.  (Some entries may be InvalidOid, indicating that the
    2964             :  * current database's default tablespace should be used.)  At most numSpaces
    2965             :  * entries will be filled.
    2966             :  * Returns the number of OIDs that were copied into the output array.
    2967             :  */
    2968             : int
    2969         222 : GetTempTablespaces(Oid *tableSpaces, int numSpaces)
    2970             : {
    2971             :     int         i;
    2972             : 
    2973             :     Assert(TempTablespacesAreSet());
    2974         222 :     for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
    2975           0 :         tableSpaces[i] = tempTableSpaces[i];
    2976             : 
    2977         222 :     return i;
    2978             : }
    2979             : 
    2980             : /*
    2981             :  * GetNextTempTableSpace
    2982             :  *
    2983             :  * Select the next temp tablespace to use.  A result of InvalidOid means
    2984             :  * to use the current database's default tablespace.
    2985             :  */
    2986             : Oid
    2987        2374 : GetNextTempTableSpace(void)
    2988             : {
    2989        2374 :     if (numTempTableSpaces > 0)
    2990             :     {
    2991             :         /* Advance nextTempTableSpace counter with wraparound */
    2992           0 :         if (++nextTempTableSpace >= numTempTableSpaces)
    2993           0 :             nextTempTableSpace = 0;
    2994           0 :         return tempTableSpaces[nextTempTableSpace];
    2995             :     }
    2996        2374 :     return InvalidOid;
    2997             : }
    2998             : 
    2999             : 
    3000             : /*
    3001             :  * AtEOSubXact_Files
    3002             :  *
    3003             :  * Take care of subtransaction commit/abort.  At abort, we close temp files
    3004             :  * that the subtransaction may have opened.  At commit, we reassign the
    3005             :  * files that were opened to the parent subtransaction.
    3006             :  */
    3007             : void
    3008       11494 : AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
    3009             :                   SubTransactionId parentSubid)
    3010             : {
    3011             :     Index       i;
    3012             : 
    3013       11494 :     for (i = 0; i < numAllocatedDescs; i++)
    3014             :     {
    3015           0 :         if (allocatedDescs[i].create_subid == mySubid)
    3016             :         {
    3017           0 :             if (isCommit)
    3018           0 :                 allocatedDescs[i].create_subid = parentSubid;
    3019             :             else
    3020             :             {
    3021             :                 /* have to recheck the item after FreeDesc (ugly) */
    3022           0 :                 FreeDesc(&allocatedDescs[i--]);
    3023             :             }
    3024             :         }
    3025             :     }
    3026       11494 : }
    3027             : 
    3028             : /*
    3029             :  * AtEOXact_Files
    3030             :  *
    3031             :  * This routine is called during transaction commit or abort.  All still-open
    3032             :  * per-transaction temporary file VFDs are closed, which also causes the
    3033             :  * underlying files to be deleted (although they should've been closed already
    3034             :  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
    3035             :  * closed. We also forget any transaction-local temp tablespace list.
    3036             :  *
    3037             :  * The isCommit flag is used only to decide whether to emit warnings about
    3038             :  * unclosed files.
    3039             :  */
    3040             : void
    3041      691904 : AtEOXact_Files(bool isCommit)
    3042             : {
    3043      691904 :     CleanupTempFiles(isCommit, false);
    3044      691904 :     tempTableSpaces = NULL;
    3045      691904 :     numTempTableSpaces = -1;
    3046      691904 : }
    3047             : 
    3048             : /*
    3049             :  * BeforeShmemExit_Files
    3050             :  *
    3051             :  * before_shmem_exit hook to clean up temp files during backend shutdown.
    3052             :  * Here, we want to clean up *all* temp files including interXact ones.
    3053             :  */
    3054             : static void
    3055       17278 : BeforeShmemExit_Files(int code, Datum arg)
    3056             : {
    3057       17278 :     CleanupTempFiles(false, true);
    3058             : 
    3059             :     /* prevent further temp files from being created */
    3060             : #ifdef USE_ASSERT_CHECKING
    3061             :     temporary_files_allowed = false;
    3062             : #endif
    3063       17278 : }
    3064             : 
    3065             : /*
    3066             :  * Close temporary files and delete their underlying files.
    3067             :  *
    3068             :  * isCommit: if true, this is normal transaction commit, and we don't
    3069             :  * expect any remaining files; warn if there are some.
    3070             :  *
    3071             :  * isProcExit: if true, this is being called as the backend process is
    3072             :  * exiting. If that's the case, we should remove all temporary files; if
    3073             :  * that's not the case, we are being called for transaction commit/abort
    3074             :  * and should only remove transaction-local temp files.  In either case,
    3075             :  * also clean up "allocated" stdio files, dirs and fds.
    3076             :  */
    3077             : static void
    3078      709182 : CleanupTempFiles(bool isCommit, bool isProcExit)
    3079             : {
    3080             :     Index       i;
    3081             : 
    3082             :     /*
    3083             :      * Careful here: at proc_exit we need extra cleanup, not just
    3084             :      * xact_temporary files.
    3085             :      */
    3086      709182 :     if (isProcExit || have_xact_temporary_files)
    3087             :     {
    3088             :         Assert(FileIsNotOpen(0));   /* Make sure ring not corrupted */
    3089     1026912 :         for (i = 1; i < SizeVfdCache; i++)
    3090             :         {
    3091     1008540 :             unsigned short fdstate = VfdCache[i].fdstate;
    3092             : 
    3093     1008540 :             if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
    3094           0 :                 VfdCache[i].fileName != NULL)
    3095             :             {
    3096             :                 /*
    3097             :                  * If we're in the process of exiting a backend process, close
    3098             :                  * all temporary files. Otherwise, only close temporary files
    3099             :                  * local to the current transaction. They should be closed by
    3100             :                  * the ResourceOwner mechanism already, so this is just a
    3101             :                  * debugging cross-check.
    3102             :                  */
    3103           0 :                 if (isProcExit)
    3104           0 :                     FileClose(i);
    3105           0 :                 else if (fdstate & FD_CLOSE_AT_EOXACT)
    3106             :                 {
    3107           0 :                     elog(WARNING,
    3108             :                          "temporary file %s not closed at end-of-transaction",
    3109             :                          VfdCache[i].fileName);
    3110           0 :                     FileClose(i);
    3111             :                 }
    3112             :             }
    3113             :         }
    3114             : 
    3115       18372 :         have_xact_temporary_files = false;
    3116             :     }
    3117             : 
    3118             :     /* Complain if any allocated files remain open at commit. */
    3119      709182 :     if (isCommit && numAllocatedDescs > 0)
    3120           0 :         elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
    3121             :              numAllocatedDescs);
    3122             : 
    3123             :     /* Clean up "allocated" stdio files, dirs and fds. */
    3124      709212 :     while (numAllocatedDescs > 0)
    3125          30 :         FreeDesc(&allocatedDescs[0]);
    3126      709182 : }
    3127             : 
    3128             : 
    3129             : /*
    3130             :  * Remove temporary and temporary relation files left over from a prior
    3131             :  * postmaster session
    3132             :  *
    3133             :  * This should be called during postmaster startup.  It will forcibly
    3134             :  * remove any leftover files created by OpenTemporaryFile and any leftover
    3135             :  * temporary relation files created by mdcreate.
    3136             :  *
    3137             :  * During post-backend-crash restart cycle, this routine is called when
    3138             :  * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
    3139             :  * queries are using temp files could result in useless storage usage that can
    3140             :  * only be reclaimed by a service restart. The argument against enabling it is
    3141             :  * that someone might want to examine the temporary files for debugging
    3142             :  * purposes. This does however mean that OpenTemporaryFile had better allow for
    3143             :  * collision with an existing temp file name.
    3144             :  *
    3145             :  * NOTE: this function and its subroutines generally report syscall failures
    3146             :  * with ereport(LOG) and keep going.  Removing temp files is not so critical
    3147             :  * that we should fail to start the database when we can't do it.
    3148             :  */
    3149             : void
    3150         908 : RemovePgTempFiles(void)
    3151             : {
    3152             :     char        temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
    3153             :     DIR        *spc_dir;
    3154             :     struct dirent *spc_de;
    3155             : 
    3156             :     /*
    3157             :      * First process temp files in pg_default ($PGDATA/base)
    3158             :      */
    3159         908 :     snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
    3160         908 :     RemovePgTempFilesInDir(temp_path, true, false);
    3161         908 :     RemovePgTempRelationFiles("base");
    3162             : 
    3163             :     /*
    3164             :      * Cycle through temp directories for all non-default tablespaces.
    3165             :      */
    3166         908 :     spc_dir = AllocateDir("pg_tblspc");
    3167             : 
    3168        2798 :     while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
    3169             :     {
    3170        1890 :         if (strcmp(spc_de->d_name, ".") == 0 ||
    3171         982 :             strcmp(spc_de->d_name, "..") == 0)
    3172        1816 :             continue;
    3173             : 
    3174          74 :         snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
    3175          74 :                  spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
    3176          74 :         RemovePgTempFilesInDir(temp_path, true, false);
    3177             : 
    3178          74 :         snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
    3179          74 :                  spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
    3180          74 :         RemovePgTempRelationFiles(temp_path);
    3181             :     }
    3182             : 
    3183         908 :     FreeDir(spc_dir);
    3184             : 
    3185             :     /*
    3186             :      * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
    3187             :      * DataDir as well.  However, that is *not* cleaned here because doing so
    3188             :      * would create a race condition.  It's done separately, earlier in
    3189             :      * postmaster startup.
    3190             :      */
    3191         908 : }
    3192             : 
    3193             : /*
    3194             :  * Process one pgsql_tmp directory for RemovePgTempFiles.
    3195             :  *
    3196             :  * If missing_ok is true, it's all right for the named directory to not exist.
    3197             :  * Any other problem results in a LOG message.  (missing_ok should be true at
    3198             :  * the top level, since pgsql_tmp directories are not created until needed.)
    3199             :  *
    3200             :  * At the top level, this should be called with unlink_all = false, so that
    3201             :  * only files matching the temporary name prefix will be unlinked.  When
    3202             :  * recursing it will be called with unlink_all = true to unlink everything
    3203             :  * under a top-level temporary directory.
    3204             :  *
    3205             :  * (These two flags could be replaced by one, but it seems clearer to keep
    3206             :  * them separate.)
    3207             :  */
    3208             : void
    3209         984 : RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
    3210             : {
    3211             :     DIR        *temp_dir;
    3212             :     struct dirent *temp_de;
    3213             :     char        rm_path[MAXPGPATH * 2];
    3214             : 
    3215         984 :     temp_dir = AllocateDir(tmpdirname);
    3216             : 
    3217         984 :     if (temp_dir == NULL && errno == ENOENT && missing_ok)
    3218         914 :         return;
    3219             : 
    3220         218 :     while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
    3221             :     {
    3222         148 :         if (strcmp(temp_de->d_name, ".") == 0 ||
    3223          78 :             strcmp(temp_de->d_name, "..") == 0)
    3224         140 :             continue;
    3225             : 
    3226           8 :         snprintf(rm_path, sizeof(rm_path), "%s/%s",
    3227           8 :                  tmpdirname, temp_de->d_name);
    3228             : 
    3229           8 :         if (unlink_all ||
    3230           6 :             strncmp(temp_de->d_name,
    3231             :                     PG_TEMP_FILE_PREFIX,
    3232             :                     strlen(PG_TEMP_FILE_PREFIX)) == 0)
    3233           8 :         {
    3234             :             struct stat statbuf;
    3235             : 
    3236           8 :             if (lstat(rm_path, &statbuf) < 0)
    3237             :             {
    3238           0 :                 ereport(LOG,
    3239             :                         (errcode_for_file_access(),
    3240             :                          errmsg("could not stat file \"%s\": %m", rm_path)));
    3241           0 :                 continue;
    3242             :             }
    3243             : 
    3244           8 :             if (S_ISDIR(statbuf.st_mode))
    3245             :             {
    3246             :                 /* recursively remove contents, then directory itself */
    3247           2 :                 RemovePgTempFilesInDir(rm_path, false, true);
    3248             : 
    3249           2 :                 if (rmdir(rm_path) < 0)
    3250           0 :                     ereport(LOG,
    3251             :                             (errcode_for_file_access(),
    3252             :                              errmsg("could not remove directory \"%s\": %m",
    3253             :                                     rm_path)));
    3254             :             }
    3255             :             else
    3256             :             {
    3257           6 :                 if (unlink(rm_path) < 0)
    3258           0 :                     ereport(LOG,
    3259             :                             (errcode_for_file_access(),
    3260             :                              errmsg("could not remove file \"%s\": %m",
    3261             :                                     rm_path)));
    3262             :             }
    3263             :         }
    3264             :         else
    3265           0 :             ereport(LOG,
    3266             :                     (errmsg("unexpected file found in temporary-files directory: \"%s\"",
    3267             :                             rm_path)));
    3268             :     }
    3269             : 
    3270          70 :     FreeDir(temp_dir);
    3271             : }
    3272             : 
    3273             : /* Process one tablespace directory, look for per-DB subdirectories */
    3274             : static void
    3275         982 : RemovePgTempRelationFiles(const char *tsdirname)
    3276             : {
    3277             :     DIR        *ts_dir;
    3278             :     struct dirent *de;
    3279             :     char        dbspace_path[MAXPGPATH * 2];
    3280             : 
    3281         982 :     ts_dir = AllocateDir(tsdirname);
    3282             : 
    3283        6016 :     while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
    3284             :     {
    3285             :         /*
    3286             :          * We're only interested in the per-database directories, which have
    3287             :          * numeric names.  Note that this code will also (properly) ignore "."
    3288             :          * and "..".
    3289             :          */
    3290        5034 :         if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
    3291        2032 :             continue;
    3292             : 
    3293        3002 :         snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
    3294        3002 :                  tsdirname, de->d_name);
    3295        3002 :         RemovePgTempRelationFilesInDbspace(dbspace_path);
    3296             :     }
    3297             : 
    3298         982 :     FreeDir(ts_dir);
    3299         982 : }
    3300             : 
    3301             : /* Process one per-dbspace directory for RemovePgTempRelationFiles */
    3302             : static void
    3303        3002 : RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
    3304             : {
    3305             :     DIR        *dbspace_dir;
    3306             :     struct dirent *de;
    3307             :     char        rm_path[MAXPGPATH * 2];
    3308             : 
    3309        3002 :     dbspace_dir = AllocateDir(dbspacedirname);
    3310             : 
    3311      877860 :     while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
    3312             :     {
    3313      874858 :         if (!looks_like_temp_rel_name(de->d_name))
    3314      874850 :             continue;
    3315             : 
    3316           8 :         snprintf(rm_path, sizeof(rm_path), "%s/%s",
    3317           8 :                  dbspacedirname, de->d_name);
    3318             : 
    3319           8 :         if (unlink(rm_path) < 0)
    3320           0 :             ereport(LOG,
    3321             :                     (errcode_for_file_access(),
    3322             :                      errmsg("could not remove file \"%s\": %m",
    3323             :                             rm_path)));
    3324             :     }
    3325             : 
    3326        3002 :     FreeDir(dbspace_dir);
    3327        3002 : }
    3328             : 
    3329             : /* t<digits>_<digits> or t<digits>_<digits>_<forkname>, each optionally followed by .<segment> */
    3330             : bool
    3331     1163316 : looks_like_temp_rel_name(const char *name)
    3332             : {
    3333             :     int         pos;
    3334             :     int         savepos;
    3335             : 
    3336             :     /* Must start with "t". */
    3337     1163316 :     if (name[0] != 't')
    3338     1163236 :         return false;
    3339             : 
    3340             :     /* Followed by a non-empty string of digits and then an underscore. */
    3341         392 :     for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
    3342             :         ;
    3343          80 :     if (pos == 1 || name[pos] != '_')
    3344           0 :         return false;
    3345             : 
    3346             :     /* Followed by another nonempty string of digits. */
    3347         392 :     for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
    3348             :         ;
    3349          80 :     if (savepos == pos)
    3350           0 :         return false;
    3351             : 
    3352             :     /* We might have _forkname or .segment or both. */
    3353          80 :     if (name[pos] == '_')
    3354             :     {
    3355          40 :         int         forkchar = forkname_chars(&name[pos + 1], NULL);
    3356             : 
    3357          40 :         if (forkchar <= 0)
    3358           0 :             return false;
    3359          40 :         pos += forkchar + 1;
    3360             :     }
    3361          80 :     if (name[pos] == '.')
    3362             :     {
    3363             :         int         segchar;
    3364             : 
    3365          80 :         for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
    3366             :             ;
    3367          40 :         if (segchar <= 1)
    3368           0 :             return false;
    3369          40 :         pos += segchar;
    3370             :     }
    3371             : 
    3372             :     /* Now we should be at the end. */
    3373          80 :     if (name[pos] != '\0')
    3374           0 :         return false;
    3375          80 :     return true;
    3376             : }
    3377             : 
    3378             : #ifdef HAVE_SYNCFS
    3379             : static void
    3380           0 : do_syncfs(const char *path)
    3381             : {
    3382             :     int         fd;
    3383             : 
    3384           0 :     fd = OpenTransientFile(path, O_RDONLY);
    3385           0 :     if (fd < 0)
    3386             :     {
    3387           0 :         ereport(LOG,
    3388             :                 (errcode_for_file_access(),
    3389             :                  errmsg("could not open file \"%s\": %m", path)));
    3390           0 :         return;
    3391             :     }
    3392           0 :     if (syncfs(fd) < 0)
    3393           0 :         ereport(LOG,
    3394             :                 (errcode_for_file_access(),
    3395             :                  errmsg("could not synchronize file system for file \"%s\": %m", path)));
    3396           0 :     CloseTransientFile(fd);
    3397             : }
    3398             : #endif
    3399             : 
    3400             : /*
    3401             :  * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
    3402             :  * all potential filesystems, depending on the recovery_init_sync_method setting.
    3403             :  *
    3404             :  * We fsync regular files and directories wherever they are, but we
    3405             :  * follow symlinks only for pg_wal and immediately under pg_tblspc.
    3406             :  * Other symlinks are presumed to point at files we're not responsible
    3407             :  * for fsyncing, and might not have privileges to write at all.
    3408             :  *
    3409             :  * Errors are logged but not considered fatal; that's because this is used
    3410             :  * only during database startup, to deal with the possibility that there are
    3411             :  * issued-but-unsynced writes pending against the data directory.  We want to
    3412             :  * ensure that such writes reach disk before anything that's done in the new
    3413             :  * run.  However, aborting on error would result in failure to start for
    3414             :  * harmless cases such as read-only files in the data directory, and that's
    3415             :  * not good either.
    3416             :  *
    3417             :  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
    3418             :  * rewriting all changes again during recovery.
    3419             :  *
    3420             :  * Note we assume we're chdir'd into PGDATA to begin with.
    3421             :  */
    3422             : void
    3423         228 : SyncDataDirectory(void)
    3424             : {
    3425             :     bool        xlog_is_symlink;
    3426             : 
    3427             :     /* We can skip this whole thing if fsync is disabled. */
    3428         228 :     if (!enableFsync)
    3429         226 :         return;
    3430             : 
    3431             :     /*
    3432             :      * If pg_wal is a symlink, we'll need to recurse into it separately,
    3433             :      * because the first walkdir below will ignore it.
    3434             :      */
    3435           2 :     xlog_is_symlink = false;
    3436             : 
    3437             : #ifndef WIN32
    3438             :     {
    3439             :         struct stat st;
    3440             : 
    3441           2 :         if (lstat("pg_wal", &st) < 0)
    3442           0 :             ereport(LOG,
    3443             :                     (errcode_for_file_access(),
    3444             :                      errmsg("could not stat file \"%s\": %m",
    3445             :                             "pg_wal")));
    3446           2 :         else if (S_ISLNK(st.st_mode))
    3447           0 :             xlog_is_symlink = true;
    3448             :     }
    3449             : #else
    3450             :     if (pgwin32_is_junction("pg_wal"))
    3451             :         xlog_is_symlink = true;
    3452             : #endif
    3453             : 
    3454             : #ifdef HAVE_SYNCFS
    3455           2 :     if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS)
    3456             :     {
    3457             :         DIR        *dir;
    3458             :         struct dirent *de;
    3459             : 
    3460             :         /*
    3461             :          * On Linux, we don't have to open every single file one by one.  We
    3462             :          * can use syncfs() to sync whole filesystems.  We only expect
    3463             :          * filesystem boundaries to exist where we tolerate symlinks, namely
    3464             :          * pg_wal and the tablespaces, so we call syncfs() for each of those
    3465             :          * directories.
    3466             :          */
    3467             : 
    3468             :         /* Sync the top level pgdata directory. */
    3469           0 :         do_syncfs(".");
    3470             :         /* If any tablespaces are configured, sync each of those. */
    3471           0 :         dir = AllocateDir("pg_tblspc");
    3472           0 :         while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
    3473             :         {
    3474             :             char        path[MAXPGPATH];
    3475             : 
    3476           0 :             if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
    3477           0 :                 continue;
    3478             : 
    3479           0 :             snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
    3480           0 :             do_syncfs(path);
    3481             :         }
    3482           0 :         FreeDir(dir);
    3483             :         /* If pg_wal is a symlink, process that too. */
    3484           0 :         if (xlog_is_symlink)
    3485           0 :             do_syncfs("pg_wal");
    3486           0 :         return;
    3487             :     }
    3488             : #endif                          /* HAVE_SYNCFS */
    3489             : 
    3490             :     /*
    3491             :      * If possible, hint to the kernel that we're soon going to fsync the data
    3492             :      * directory and its contents.  Errors in this step are even less
    3493             :      * interesting than normal, so log them only at DEBUG1.
    3494             :      */
    3495             : #ifdef PG_FLUSH_DATA_WORKS
    3496           2 :     walkdir(".", pre_sync_fname, false, DEBUG1);
    3497           2 :     if (xlog_is_symlink)
    3498           0 :         walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
    3499           2 :     walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
    3500             : #endif
    3501             : 
    3502             :     /*
    3503             :      * Now we do the fsync()s in the same order.
    3504             :      *
    3505             :      * The main call ignores symlinks, so in addition to specially processing
    3506             :      * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
    3507             :      * process_symlinks = true.  Note that if there are any plain directories
    3508             :      * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
    3509             :      * so we don't worry about optimizing it.
    3510             :      */
    3511           2 :     walkdir(".", datadir_fsync_fname, false, LOG);
    3512           2 :     if (xlog_is_symlink)
    3513           0 :         walkdir("pg_wal", datadir_fsync_fname, false, LOG);
    3514           2 :     walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
    3515             : }
    3516             : 
    3517             : /*
    3518             :  * walkdir: recursively walk a directory, applying the action to each
    3519             :  * regular file and directory (including the named directory itself).
    3520             :  *
    3521             :  * If process_symlinks is true, the action and recursion are also applied
    3522             :  * to regular files and directories that are pointed to by symlinks in the
    3523             :  * given directory; otherwise symlinks are ignored.  Symlinks are always
    3524             :  * ignored in subdirectories, ie we intentionally don't pass down the
    3525             :  * process_symlinks flag to recursive calls.
    3526             :  *
    3527             :  * Errors are reported at level elevel, which might be ERROR or less.
    3528             :  *
    3529             :  * See also walkdir in file_utils.c, which is a frontend version of this
    3530             :  * logic.
    3531             :  */
    3532             : static void
    3533         322 : walkdir(const char *path,
    3534             :         void (*action) (const char *fname, bool isdir, int elevel),
    3535             :         bool process_symlinks,
    3536             :         int elevel)
    3537             : {
    3538             :     DIR        *dir;
    3539             :     struct dirent *de;
    3540             : 
    3541         322 :     dir = AllocateDir(path);
    3542             : 
    3543        6124 :     while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
    3544             :     {
    3545             :         char        subpath[MAXPGPATH * 2];
    3546             : 
    3547        5802 :         CHECK_FOR_INTERRUPTS();
    3548             : 
    3549        5802 :         if (strcmp(de->d_name, ".") == 0 ||
    3550        5480 :             strcmp(de->d_name, "..") == 0)
    3551         644 :             continue;
    3552             : 
    3553        5158 :         snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
    3554             : 
    3555        5158 :         switch (get_dirent_type(subpath, de, process_symlinks, elevel))
    3556             :         {
    3557        5058 :             case PGFILETYPE_REG:
    3558        5058 :                 (*action) (subpath, false, elevel);
    3559        5058 :                 break;
    3560         100 :             case PGFILETYPE_DIR:
    3561         100 :                 walkdir(subpath, action, false, elevel);
    3562         100 :                 break;
    3563           0 :             default:
    3564             : 
    3565             :                 /*
    3566             :                  * Errors are already reported directly by get_dirent_type(),
    3567             :                  * and any remaining symlinks and unknown file types are
    3568             :                  * ignored.
    3569             :                  */
    3570           0 :                 break;
    3571             :         }
    3572             :     }
    3573             : 
    3574         322 :     FreeDir(dir);               /* we ignore any error here */
    3575             : 
    3576             :     /*
    3577             :      * It's important to fsync the destination directory itself as individual
    3578             :      * file fsyncs don't guarantee that the directory entry for the file is
    3579             :      * synced.  However, skip this if AllocateDir failed; the action function
    3580             :      * might not be robust against that.
    3581             :      */
    3582         322 :     if (dir)
    3583         322 :         (*action) (path, true, elevel);
    3584         322 : }
    3585             : 
    3586             : 
    /*
     * pre_sync_fname -- ask the OS to start writing back one file's dirty data
     *
     * This is purely advisory and is meant to precede a real fsync() of the
     * same file.  Directories are skipped outright.  A file that cannot be
     * opened because of EACCES is skipped silently; any other failure to open
     * the file, and any failure to close it, is reported at the caller-supplied
     * "elevel".
     */
    #ifdef PG_FLUSH_DATA_WORKS

    static void
    pre_sync_fname(const char *fname, bool isdir, int elevel)
    {
        int         fd;

        /* Flushing a directory would most likely just fail, so don't try. */
        if (isdir)
            return;

        fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

        if (fd >= 0)
        {
            /*
             * This is only a hint, so it is fine that pg_flush_data() ignores
             * errors.
             */
            pg_flush_data(fd, 0, 0);

            if (CloseTransientFile(fd) != 0)
                ereport(elevel,
                        (errcode_for_file_access(),
                         errmsg("could not close file \"%s\": %m", fname)));
            return;
        }

        /* Open failed: stay quiet about unreadable files, report the rest. */
        if (errno != EACCES)
            ereport(elevel,
                    (errcode_for_file_access(),
                     errmsg("could not open file \"%s\": %m", fname)));
    }

    #endif                          /* PG_FLUSH_DATA_WORKS */
    3629             : 
    /*
     * datadir_fsync_fname -- fsync one file or directory, tolerating unreadable
     * files
     *
     * Thin adapter around fsync_fname_ext() that always passes ignore_perm =
     * true, so that errors about unreadable files are silently ignored.  Other
     * errors are logged at "elevel" by fsync_fname_ext(); its return value is
     * discarded here.
     */
    static void
    datadir_fsync_fname(const char *fname, bool isdir, int elevel)
    {
        (void) fsync_fname_ext(fname, isdir, true, elevel);
    }
    3639             : 
    /*
     * unlink_if_exists_fname -- remove a file or directory, if it exists
     *
     * Directories are removed with rmdir(); a failure other than ENOENT is
     * reported at "elevel".  Plain files are handed to
     * PathNameDeleteTemporaryFile(), and "elevel" is not used for them.
     */
    static void
    unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
    {
        if (!isdir)
        {
            /* Go through PathNameDeleteTemporaryFile so the file size is reported */
            PathNameDeleteTemporaryFile(fname, false);
            return;
        }

        /* A directory that is already gone is not an error. */
        if (rmdir(fname) == 0 || errno == ENOENT)
            return;

        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not remove directory \"%s\": %m", fname)));
    }
    3656             : 
    3657             : /*
    3658             :  * fsync_fname_ext -- Try to fsync a file or directory
    3659             :  *
    3660             :  * If ignore_perm is true, ignore errors upon trying to open unreadable
    3661             :  * files. Logs other errors at a caller-specified level.
    3662             :  *
    3663             :  * Returns 0 if the operation succeeded, -1 otherwise.
    3664             :  */
    3665             : int
    3666       45884 : fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
    3667             : {
    3668             :     int         fd;
    3669             :     int         flags;
    3670             :     int         returncode;
    3671             : 
    3672             :     /*
    3673             :      * Some OSs require directories to be opened read-only whereas other
    3674             :      * systems don't allow us to fsync files opened read-only; so we need both
    3675             :      * cases here.  Using O_RDWR will cause us to fail to fsync files that are
    3676             :      * not writable by our userid, but we assume that's OK.
    3677             :      */
    3678       45884 :     flags = PG_BINARY;
    3679       45884 :     if (!isdir)
    3680       15854 :         flags |= O_RDWR;
    3681             :     else
    3682       30030 :         flags |= O_RDONLY;
    3683             : 
    3684       45884 :     fd = OpenTransientFile(fname, flags);
    3685             : 
    3686             :     /*
    3687             :      * Some OSs don't allow us to open directories at all (Windows returns
    3688             :      * EACCES), just ignore the error in that case.  If desired also silently
    3689             :      * ignoring errors about unreadable files. Log others.
    3690             :      */
    3691       45884 :     if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
    3692           0 :         return 0;
    3693       45884 :     else if (fd < 0 && ignore_perm && errno == EACCES)
    3694           0 :         return 0;
    3695       45884 :     else if (fd < 0)
    3696             :     {
    3697           0 :         ereport(elevel,
    3698             :                 (errcode_for_file_access(),
    3699             :                  errmsg("could not open file \"%s\": %m", fname)));
    3700           0 :         return -1;
    3701             :     }
    3702             : 
    3703       45884 :     returncode = pg_fsync(fd);
    3704             : 
    3705             :     /*
    3706             :      * Some OSes don't allow us to fsync directories at all, so we can ignore
    3707             :      * those errors. Anything else needs to be logged.
    3708             :      */
    3709       45884 :     if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
    3710             :     {
    3711             :         int         save_errno;
    3712             : 
    3713             :         /* close file upon error, might not be in transaction context */
    3714           0 :         save_errno = errno;
    3715           0 :         (void) CloseTransientFile(fd);
    3716           0 :         errno = save_errno;
    3717             : 
    3718           0 :         ereport(elevel,
    3719             :                 (errcode_for_file_access(),
    3720             :                  errmsg("could not fsync file \"%s\": %m", fname)));
    3721           0 :         return -1;
    3722             :     }
    3723             : 
    3724       45884 :     if (CloseTransientFile(fd) != 0)
    3725             :     {
    3726           0 :         ereport(elevel,
    3727             :                 (errcode_for_file_access(),
    3728             :                  errmsg("could not close file \"%s\": %m", fname)));
    3729           0 :         return -1;
    3730             :     }
    3731             : 
    3732       45884 :     return 0;
    3733             : }
    3734             : 
    3735             : /*
    3736             :  * fsync_parent_path -- fsync the parent path of a file or directory
    3737             :  *
    3738             :  * This is aimed at making file operations persistent on disk in case of
    3739             :  * an OS crash or power failure.
    3740             :  */
    3741             : static int
    3742        6122 : fsync_parent_path(const char *fname, int elevel)
    3743             : {
    3744             :     char        parentpath[MAXPGPATH];
    3745             : 
    3746        6122 :     strlcpy(parentpath, fname, MAXPGPATH);
    3747        6122 :     get_parent_directory(parentpath);
    3748             : 
    3749             :     /*
    3750             :      * get_parent_directory() returns an empty string if the input argument is
    3751             :      * just a file name (see comments in path.c), so handle that as being the
    3752             :      * current directory.
    3753             :      */
    3754        6122 :     if (strlen(parentpath) == 0)
    3755         214 :         strlcpy(parentpath, ".", MAXPGPATH);
    3756             : 
    3757        6122 :     if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
    3758           0 :         return -1;
    3759             : 
    3760        6122 :     return 0;
    3761             : }
    3762             : 
    3763             : /*
    3764             :  * Create a PostgreSQL data sub-directory
    3765             :  *
    3766             :  * The data directory itself, and most of its sub-directories, are created at
    3767             :  * initdb time, but we do have some occasions when we create directories in
    3768             :  * the backend (CREATE TABLESPACE, for example).  In those cases, we want to
    3769             :  * make sure that those directories are created consistently.  Today, that means
    3770             :  * making sure that the created directory has the correct permissions, which is
    3771             :  * what pg_dir_create_mode tracks for us.
    3772             :  *
    3773             :  * Note that we also set the umask() based on what we understand the correct
    3774             :  * permissions to be (see file_perm.c).
    3775             :  *
    3776             :  * For permissions other than the default, mkdir() can be used directly, but
    3777             :  * be sure to consider carefully such cases -- a sub-directory with incorrect
    3778             :  * permissions in a PostgreSQL data directory could cause backups and other
    3779             :  * processes to fail.
    3780             :  */
    3781             : int
    3782        2192 : MakePGDirectory(const char *directoryName)
    3783             : {
    3784        2192 :     return mkdir(directoryName, pg_dir_create_mode);
    3785             : }
    3786             : 
    3787             : /*
    3788             :  * Return the passed-in error level, or PANIC if data_sync_retry is off.
    3789             :  *
    3790             :  * Failure to fsync any data file is cause for immediate panic, unless
    3791             :  * data_sync_retry is enabled.  Data may have been written to the operating
    3792             :  * system and removed from our buffer pool already, and if we are running on
    3793             :  * an operating system that forgets dirty data on write-back failure, there
    3794             :  * may be only one copy of the data remaining: in the WAL.  A later attempt to
    3795             :  * fsync again might falsely report success.  Therefore we must not allow any
    3796             :  * further checkpoints to be attempted.  data_sync_retry can in theory be
    3797             :  * enabled on systems known not to drop dirty buffered data on write-back
    3798             :  * failure (with the likely outcome that checkpoints will continue to fail
    3799             :  * until the underlying problem is fixed).
    3800             :  *
    3801             :  * Any code that reports a failure from fsync() or related functions should
    3802             :  * filter the error level with this function.
    3803             :  */
    3804             : int
    3805       25840 : data_sync_elevel(int elevel)
    3806             : {
    3807       25840 :     return data_sync_retry ? elevel : PANIC;
    3808             : }
    3809             : 
    3810             : /*
    3811             :  * A convenience wrapper for pg_pwritev() that retries on partial write.  If an
    3812             :  * error is returned, it is unspecified how much has been written.
    3813             :  */
    3814             : ssize_t
    3815       46416 : pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
    3816             : {
    3817             :     struct iovec iov_copy[PG_IOV_MAX];
    3818       46416 :     ssize_t     sum = 0;
    3819             :     ssize_t     part;
    3820             : 
    3821             :     /* We'd better have space to make a copy, in case we need to retry. */
    3822       46416 :     if (iovcnt > PG_IOV_MAX)
    3823             :     {
    3824           0 :         errno = EINVAL;
    3825           0 :         return -1;
    3826             :     }
    3827             : 
    3828             :     for (;;)
    3829             :     {
    3830             :         /* Write as much as we can. */
    3831       46416 :         part = pg_pwritev(fd, iov, iovcnt, offset);
    3832       46416 :         if (part < 0)
    3833           0 :             return -1;
    3834             : 
    3835             : #ifdef SIMULATE_SHORT_WRITE
    3836             :         part = Min(part, 4096);
    3837             : #endif
    3838             : 
    3839             :         /* Count our progress. */
    3840       46416 :         sum += part;
    3841       46416 :         offset += part;
    3842             : 
    3843             :         /* Step over iovecs that are done. */
    3844     1531728 :         while (iovcnt > 0 && iov->iov_len <= part)
    3845             :         {
    3846     1485312 :             part -= iov->iov_len;
    3847     1485312 :             ++iov;
    3848     1485312 :             --iovcnt;
    3849             :         }
    3850             : 
    3851             :         /* Are they all done? */
    3852       46416 :         if (iovcnt == 0)
    3853             :         {
    3854             :             /* We don't expect the kernel to write more than requested. */
    3855             :             Assert(part == 0);
    3856       46416 :             break;
    3857             :         }
    3858             : 
    3859             :         /*
    3860             :          * Move whatever's left to the front of our mutable copy and adjust
    3861             :          * the leading iovec.
    3862             :          */
    3863             :         Assert(iovcnt > 0);
    3864           0 :         memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
    3865             :         Assert(iov->iov_len > part);
    3866           0 :         iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
    3867           0 :         iov_copy[0].iov_len -= part;
    3868           0 :         iov = iov_copy;
    3869             :     }
    3870             : 
    3871       46416 :     return sum;
    3872             : }

Generated by: LCOV version 1.13