LCOV - code coverage report
Current view: top level - src/backend/storage/file - fd.c (source / functions) Hit Total Coverage
Test: PostgreSQL 12beta2 Lines: 619 829 74.7 %
Date: 2019-06-18 07:06:57 Functions: 78 83 94.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * fd.c
       4             :  *    Virtual file descriptor code.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/storage/file/fd.c
      11             :  *
      12             :  * NOTES:
      13             :  *
      14             :  * This code manages a cache of 'virtual' file descriptors (VFDs).
      15             :  * The server opens many file descriptors for a variety of reasons,
      16             :  * including base tables, scratch files (e.g., sort and hash spool
      17             :  * files), and random calls to C library routines like system(3); it
      18             :  * is quite easy to exceed system limits on the number of open files a
      19             :  * single process can have.  (This is around 1024 on many modern
      20             :  * operating systems, but may be lower on others.)
      21             :  *
      22             :  * VFDs are managed as an LRU pool, with actual OS file descriptors
      23             :  * being opened and closed as needed.  Obviously, if a file is
      24             :  * opened using these interfaces, all subsequent operations must also
      25             :  * be through these interfaces (the File type is not a real file
      26             :  * descriptor).
      27             :  *
      28             :  * For this scheme to work, most (if not all) routines throughout the
      29             :  * server should use these interfaces instead of calling the C library
      30             :  * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
      31             :  * may find ourselves short of real file descriptors anyway.
      32             :  *
      33             :  * INTERFACE ROUTINES
      34             :  *
      35             :  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
      36             :  * A File opened with OpenTemporaryFile is automatically deleted when the
      37             :  * File is closed, either explicitly or implicitly at end of transaction or
      38             :  * process exit. PathNameOpenFile is intended for files that are held open
      39             :  * for a long time, like relation files. It is the caller's responsibility
      40             :  * to close them; there is no automatic mechanism in fd.c for that.
      41             :  *
      42             :  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
      43             :  * temporary files that have names so that they can be shared between
      44             :  * backends.  Such files are automatically closed and count against the
      45             :  * temporary file limit of the backend that creates them, but unlike anonymous
      46             :  * files they are not automatically deleted.  See sharedfileset.c for a shared
      47             :  * ownership mechanism that provides automatic cleanup for shared files when
      48             :  * the last of a group of backends detaches.
      49             :  *
      50             :  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
      51             :  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
      52             :  * They behave like the corresponding native functions, except that the handle
      53             :  * is registered with the current subtransaction, and will be automatically
      54             :  * closed at abort. These are intended mainly for short operations like
      55             :  * reading a configuration file; there is a limit on the number of files that
      56             :  * can be opened using these functions at any one time.
      57             :  *
      58             :  * Finally, BasicOpenFile is just a thin wrapper around open() that can
      59             :  * release file descriptors in use by the virtual file descriptors if
      60             :  * necessary. There is no automatic cleanup of file descriptors returned by
      61             :  * BasicOpenFile; it is solely the caller's responsibility to close the file
      62             :  * descriptor by calling close(2).
      63             :  *
      64             :  *-------------------------------------------------------------------------
      65             :  */
      66             : 
      67             : #include "postgres.h"
      68             : 
      69             : #include <sys/file.h>
      70             : #include <sys/param.h>
      71             : #include <sys/stat.h>
      72             : #ifndef WIN32
      73             : #include <sys/mman.h>
      74             : #endif
      75             : #include <limits.h>
      76             : #include <unistd.h>
      77             : #include <fcntl.h>
      78             : #ifdef HAVE_SYS_RESOURCE_H
      79             : #include <sys/resource.h>     /* for getrlimit */
      80             : #endif
      81             : 
      82             : #include "miscadmin.h"
      83             : #include "access/xact.h"
      84             : #include "access/xlog.h"
      85             : #include "catalog/pg_tablespace.h"
      86             : #include "common/file_perm.h"
      87             : #include "pgstat.h"
      88             : #include "portability/mem.h"
      89             : #include "storage/fd.h"
      90             : #include "storage/ipc.h"
      91             : #include "utils/guc.h"
      92             : #include "utils/resowner_private.h"
      93             : 
      94             : 
      95             : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
      96             : #if defined(HAVE_SYNC_FILE_RANGE)
      97             : #define PG_FLUSH_DATA_WORKS 1
      98             : #elif !defined(WIN32) && defined(MS_ASYNC)
      99             : #define PG_FLUSH_DATA_WORKS 1
     100             : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
     101             : #define PG_FLUSH_DATA_WORKS 1
     102             : #endif
     103             : 
     104             : /*
     105             :  * We must leave some file descriptors free for system(), the dynamic loader,
     106             :  * and other code that tries to open files without consulting fd.c.  This
     107             :  * is the number left free.  (While we can be pretty sure we won't get
     108             :  * EMFILE, there's never any guarantee that we won't get ENFILE due to
     109             :  * other processes chewing up FDs.  So it's a bad idea to try to open files
     110             :  * without consulting fd.c.  Nonetheless we cannot control all code.)
     111             :  *
     112             :  * Because this is just a fixed setting, we are effectively assuming that
     113             :  * no such code will leave FDs open over the long term; otherwise the slop
     114             :  * is likely to be insufficient.  Note in particular that we expect that
     115             :  * loading a shared library does not result in any permanent increase in
     116             :  * the number of open files.  (This appears to be true on most if not
     117             :  * all platforms as of Feb 2004.)
     118             :  */
     119             : #define NUM_RESERVED_FDS        10
     120             : 
     121             : /*
     122             :  * If we have fewer than this many usable FDs after allowing for the reserved
     123             :  * ones, choke.
     124             :  */
     125             : #define FD_MINFREE              10
     126             : 
     127             : /*
     128             :  * A number of platforms allow individual processes to open many more files
     129             :  * than they can really support when *many* processes do the same thing.
     130             :  * This GUC parameter lets the DBA limit max_safe_fds to something less than
     131             :  * what the postmaster's initial probe suggests will work.
     132             :  */
     133             : int         max_files_per_process = 1000;
     134             : 
     135             : /*
     136             :  * Maximum number of file descriptors to open for either VFD entries or
     137             :  * AllocateFile/AllocateDir/OpenTransientFile operations.  This is initialized
     138             :  * to a conservative value, and remains that way indefinitely in bootstrap or
     139             :  * standalone-backend cases.  In normal postmaster operation, the postmaster
     140             :  * calls set_max_safe_fds() late in initialization to update the value, and
     141             :  * that value is then inherited by forked subprocesses.
     142             :  *
     143             :  * Note: the value of max_files_per_process is taken into account while
     144             :  * setting this variable, and so need not be tested separately.
     145             :  */
     146             : int         max_safe_fds = 32;  /* default if not changed */
     147             : 
     148             : /* Whether it is safe to continue running after fsync() fails. */
     149             : bool        data_sync_retry = false;
     150             : 
     151             : /* Debugging.... */
     152             : 
     153             : #ifdef FDDEBUG
     154             : #define DO_DB(A) \
     155             :     do { \
     156             :         int         _do_db_save_errno = errno; \
     157             :         A; \
     158             :         errno = _do_db_save_errno; \
     159             :     } while (0)
     160             : #else
     161             : #define DO_DB(A) \
     162             :     ((void) 0)
     163             : #endif
     164             : 
     165             : #define VFD_CLOSED (-1)
     166             : 
     167             : #define FileIsValid(file) \
     168             :     ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
     169             : 
     170             : #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
     171             : 
     172             : /* these are the assigned bits in fdstate below: */
     173             : #define FD_DELETE_AT_CLOSE  (1 << 0)  /* T = delete when closed */
     174             : #define FD_CLOSE_AT_EOXACT  (1 << 1)  /* T = close at eoXact */
     175             : #define FD_TEMP_FILE_LIMIT  (1 << 2)  /* T = respect temp_file_limit */
     176             : 
     177             : typedef struct vfd
     178             : {
     179             :     int         fd;             /* current FD, or VFD_CLOSED if none */
     180             :     unsigned short fdstate;     /* bitflags for VFD's state */
     181             :     ResourceOwner resowner;     /* owner, for automatic cleanup */
     182             :     File        nextFree;       /* link to next free VFD, if in freelist */
     183             :     File        lruMoreRecently;    /* doubly linked recency-of-use list */
     184             :     File        lruLessRecently;
     185             :     off_t       fileSize;       /* current size of file (0 if not temporary) */
     186             :     char       *fileName;       /* name of file, or NULL for unused VFD */
     187             :     /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
     188             :     int         fileFlags;      /* open(2) flags for (re)opening the file */
     189             :     mode_t      fileMode;       /* mode to pass to open(2) */
     190             : } Vfd;
     191             : 
     192             : /*
     193             :  * Virtual File Descriptor array pointer and size.  This grows as
     194             :  * needed.  'File' values are indexes into this array.
     195             :  * Note that VfdCache[0] is not a usable VFD, just a list header.
     196             :  */
     197             : static Vfd *VfdCache;
     198             : static Size SizeVfdCache = 0;
     199             : 
     200             : /*
     201             :  * Number of file descriptors known to be in use by VFD entries.
     202             :  */
     203             : static int  nfile = 0;
     204             : 
     205             : /*
     206             :  * Flag to tell whether it's worth scanning VfdCache looking for temp files
     207             :  * to close
     208             :  */
     209             : static bool have_xact_temporary_files = false;
     210             : 
     211             : /*
     212             :  * Tracks the total size of all temporary files.  Note: when temp_file_limit
     213             :  * is being enforced, this cannot overflow since the limit cannot be more
     214             :  * than INT_MAX kilobytes.  When not enforcing, it could theoretically
     215             :  * overflow, but we don't care.
     216             :  */
     217             : static uint64 temporary_files_size = 0;
     218             : 
     219             : /*
     220             :  * List of OS handles opened with AllocateFile, AllocateDir and
     221             :  * OpenTransientFile.
     222             :  */
     223             : typedef enum
     224             : {
     225             :     AllocateDescFile,
     226             :     AllocateDescPipe,
     227             :     AllocateDescDir,
     228             :     AllocateDescRawFD
     229             : } AllocateDescKind;
     230             : 
     231             : typedef struct
     232             : {
     233             :     AllocateDescKind kind;
     234             :     SubTransactionId create_subid;
     235             :     union
     236             :     {
     237             :         FILE       *file;
     238             :         DIR        *dir;
     239             :         int         fd;
     240             :     }           desc;
     241             : } AllocateDesc;
     242             : 
     243             : static int  numAllocatedDescs = 0;
     244             : static int  maxAllocatedDescs = 0;
     245             : static AllocateDesc *allocatedDescs = NULL;
     246             : 
     247             : /*
     248             :  * Number of temporary files opened during the current session;
     249             :  * this is used in generation of tempfile names.
     250             :  */
     251             : static long tempFileCounter = 0;
     252             : 
     253             : /*
     254             :  * Array of OIDs of temp tablespaces.  When numTempTableSpaces is -1,
     255             :  * this has not been set in the current transaction.
     256             :  */
     257             : static Oid *tempTableSpaces = NULL;
     258             : static int  numTempTableSpaces = -1;
     259             : static int  nextTempTableSpace = 0;
     260             : 
     261             : 
     262             : /*--------------------
     263             :  *
     264             :  * Private Routines
     265             :  *
     266             :  * Delete          - delete a file from the Lru ring
     267             :  * LruDelete       - remove a file from the Lru ring and close its FD
     268             :  * Insert          - put a file at the front of the Lru ring
     269             :  * LruInsert       - put a file at the front of the Lru ring and open it
     270             :  * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
     271             :  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
     272             :  * AllocateVfd     - grab a free (or new) file record (from VfdCache)
     273             :  * FreeVfd         - free a file record
     274             :  *
     275             :  * The Least Recently Used ring is a doubly linked list that begins and
     276             :  * ends on element zero.  Element zero is special -- it doesn't represent
     277             :  * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
     278             :  * anchor that shows us the beginning/end of the ring.
     279             :  * Only VFD elements that are currently really open (have an FD assigned) are
     280             :  * in the Lru ring.  Elements that are "virtually" open can be recognized
     281             :  * by having a non-null fileName field.
     282             :  *
     283             :  * example:
     284             :  *
     285             :  *     /--less----\                /---------\
     286             :  *     v           \              v           \
     287             :  *   #0 --more---> LeastRecentlyUsed --more-\ \
     288             :  *    ^\                                    | |
     289             :  *     \\less--> MostRecentlyUsedFile    <---/ |
     290             :  *      \more---/                    \--less--/
     291             :  *
     292             :  *--------------------
     293             :  */
     294             : static void Delete(File file);
     295             : static void LruDelete(File file);
     296             : static void Insert(File file);
     297             : static int  LruInsert(File file);
     298             : static bool ReleaseLruFile(void);
     299             : static void ReleaseLruFiles(void);
     300             : static File AllocateVfd(void);
     301             : static void FreeVfd(File file);
     302             : 
     303             : static int  FileAccess(File file);
     304             : static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
     305             : static bool reserveAllocatedDesc(void);
     306             : static int  FreeDesc(AllocateDesc *desc);
     307             : 
     308             : static void AtProcExit_Files(int code, Datum arg);
     309             : static void CleanupTempFiles(bool isCommit, bool isProcExit);
     310             : static void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok,
     311             :                                    bool unlink_all);
     312             : static void RemovePgTempRelationFiles(const char *tsdirname);
     313             : static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
     314             : 
     315             : static void walkdir(const char *path,
     316             :                     void (*action) (const char *fname, bool isdir, int elevel),
     317             :                     bool process_symlinks,
     318             :                     int elevel);
     319             : #ifdef PG_FLUSH_DATA_WORKS
     320             : static void pre_sync_fname(const char *fname, bool isdir, int elevel);
     321             : #endif
     322             : static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
     323             : static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
     324             : 
     325             : static int  fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
     326             : static int  fsync_parent_path(const char *fname, int elevel);
     327             : 
     328             : 
     329             : /*
     330             :  * pg_fsync --- do fsync with or without writethrough
     331             :  */
     332             : int
     333      121528 : pg_fsync(int fd)
     334             : {
     335             :     /* #if is to skip the sync_method test if there's no need for it */
     336             : #if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
     337             :     if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
     338             :         return pg_fsync_writethrough(fd);
     339             :     else
     340             : #endif
     341      121528 :         return pg_fsync_no_writethrough(fd);
     342             : }
     343             : 
     344             : 
     345             : /*
     346             :  * pg_fsync_no_writethrough --- same as fsync except does nothing if
     347             :  *  enableFsync is off
     348             :  */
     349             : int
     350      121528 : pg_fsync_no_writethrough(int fd)
     351             : {
     352      121528 :     if (enableFsync)
     353        2350 :         return fsync(fd);
     354             :     else
     355      119178 :         return 0;
     356             : }
     357             : 
     358             : /*
     359             :  * pg_fsync_writethrough
     360             :  */
     361             : int
     362           0 : pg_fsync_writethrough(int fd)
     363             : {
     364           0 :     if (enableFsync)
     365             :     {
     366             : #ifdef WIN32
     367             :         return _commit(fd);
     368             : #elif defined(F_FULLFSYNC)
     369             :         return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
     370             : #else
     371           0 :         errno = ENOSYS;
     372           0 :         return -1;
     373             : #endif
     374             :     }
     375             :     else
     376           0 :         return 0;
     377             : }
     378             : 
     379             : /*
     380             :  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
     381             :  *
     382             :  * Not all platforms have fdatasync; treat as fsync if not available.
     383             :  */
     384             : int
     385      238736 : pg_fdatasync(int fd)
     386             : {
     387      238736 :     if (enableFsync)
     388             :     {
     389             : #ifdef HAVE_FDATASYNC
     390          76 :         return fdatasync(fd);
     391             : #else
     392             :         return fsync(fd);
     393             : #endif
     394             :     }
     395             :     else
     396      238660 :         return 0;
     397             : }
     398             : 
     399             : /*
     400             :  * pg_flush_data --- advise OS that the described dirty data should be flushed
     401             :  *
     402             :  * offset of 0 with nbytes 0 means that the entire file should be flushed
     403             :  */
     404             : void
     405      368742 : pg_flush_data(int fd, off_t offset, off_t nbytes)
     406             : {
     407             :     /*
     408             :      * Right now file flushing is primarily used to make later
     409             :      * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
     410             :      * if fsyncs are disabled - that's a decision we might want to make
     411             :      * configurable at some point.
     412             :      */
     413      368742 :     if (!enableFsync)
     414      366690 :         return;
     415             : 
     416             :     /*
     417             :      * We compile all alternatives that are supported on the current platform,
     418             :      * to find portability problems more easily.
     419             :      */
     420             : #if defined(HAVE_SYNC_FILE_RANGE)
     421             :     {
     422             :         int         rc;
     423             :         static bool not_implemented_by_kernel = false;
     424             : 
     425        2052 :         if (not_implemented_by_kernel)
     426           0 :             return;
     427             : 
     428             :         /*
     429             :          * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
     430             :          * tells the OS that writeback for the specified blocks should be
     431             :          * started, but that we don't want to wait for completion.  Note that
     432             :          * this call might block if too much dirty data exists in the range.
     433             :          * This is the preferable method on OSs supporting it, as it works
     434             :          * reliably when available (contrast to msync()) and doesn't flush out
     435             :          * clean data (like FADV_DONTNEED).
     436             :          */
     437        2052 :         rc = sync_file_range(fd, offset, nbytes,
     438             :                              SYNC_FILE_RANGE_WRITE);
     439        2052 :         if (rc != 0)
     440             :         {
     441             :             int         elevel;
     442             : 
     443             :             /*
     444             :              * For systems that don't have an implementation of
     445             :              * sync_file_range() such as Windows WSL, generate only one
     446             :              * warning and then suppress all further attempts by this process.
     447             :              */
     448           0 :             if (errno == ENOSYS)
     449             :             {
     450           0 :                 elevel = WARNING;
     451           0 :                 not_implemented_by_kernel = true;
     452             :             }
     453             :             else
     454           0 :                 elevel = data_sync_elevel(WARNING);
     455             : 
     456           0 :             ereport(elevel,
     457             :                     (errcode_for_file_access(),
     458             :                      errmsg("could not flush dirty data: %m")));
     459             :         }
     460             : 
     461        2052 :         return;
     462             :     }
     463             : #endif
     464             : #if !defined(WIN32) && defined(MS_ASYNC)
     465             :     {
     466             :         void       *p;
     467             :         static int  pagesize = 0;
     468             : 
     469             :         /*
     470             :          * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
     471             :          * writeback. On linux it only does so if MS_SYNC is specified, but
     472             :          * then it does the writeback synchronously. Luckily all common linux
     473             :          * systems have sync_file_range().  This is preferable over
     474             :          * FADV_DONTNEED because it doesn't flush out clean data.
     475             :          *
     476             :          * We map the file (mmap()), tell the kernel to sync back the contents
     477             :          * (msync()), and then remove the mapping again (munmap()).
     478             :          */
     479             : 
     480             :         /* mmap() needs actual length if we want to map whole file */
     481             :         if (offset == 0 && nbytes == 0)
     482             :         {
     483             :             nbytes = lseek(fd, 0, SEEK_END);
     484             :             if (nbytes < 0)
     485             :             {
     486             :                 ereport(WARNING,
     487             :                         (errcode_for_file_access(),
     488             :                          errmsg("could not determine dirty data size: %m")));
     489             :                 return;
     490             :             }
     491             :         }
     492             : 
     493             :         /*
     494             :          * Some platforms reject partial-page mmap() attempts.  To deal with
     495             :          * that, just truncate the request to a page boundary.  If any extra
     496             :          * bytes don't get flushed, well, it's only a hint anyway.
     497             :          */
     498             : 
     499             :         /* fetch pagesize only once */
     500             :         if (pagesize == 0)
     501             :             pagesize = sysconf(_SC_PAGESIZE);
     502             : 
     503             :         /* align length to pagesize, dropping any fractional page */
     504             :         if (pagesize > 0)
     505             :             nbytes = (nbytes / pagesize) * pagesize;
     506             : 
     507             :         /* fractional-page request is a no-op */
     508             :         if (nbytes <= 0)
     509             :             return;
     510             : 
     511             :         /*
     512             :          * mmap could well fail, particularly on 32-bit platforms where there
     513             :          * may simply not be enough address space.  If so, silently fall
     514             :          * through to the next implementation.
     515             :          */
     516             :         if (nbytes <= (off_t) SSIZE_MAX)
     517             :             p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
     518             :         else
     519             :             p = MAP_FAILED;
     520             : 
     521             :         if (p != MAP_FAILED)
     522             :         {
     523             :             int         rc;
     524             : 
     525             :             rc = msync(p, (size_t) nbytes, MS_ASYNC);
     526             :             if (rc != 0)
     527             :             {
     528             :                 ereport(data_sync_elevel(WARNING),
     529             :                         (errcode_for_file_access(),
     530             :                          errmsg("could not flush dirty data: %m")));
     531             :                 /* NB: need to fall through to munmap()! */
     532             :             }
     533             : 
     534             :             rc = munmap(p, (size_t) nbytes);
     535             :             if (rc != 0)
     536             :             {
     537             :                 /* FATAL error because mapping would remain */
     538             :                 ereport(FATAL,
     539             :                         (errcode_for_file_access(),
     540             :                          errmsg("could not munmap() while flushing data: %m")));
     541             :             }
     542             : 
     543             :             return;
     544             :         }
     545             :     }
     546             : #endif
     547             : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
     548             :     {
     549             :         int         rc;
     550             : 
     551             :         /*
     552             :          * Signal the kernel that the passed in range should not be cached
     553             :          * anymore. This has the desired side effect of writing out dirty
     554             :          * data, and the undesired side effect of likely discarding useful
     555             :          * clean cached blocks.  For the latter reason this is the least
     556             :          * preferable method.
     557             :          */
     558             : 
     559             :         rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
     560             : 
     561             :         if (rc != 0)
     562             :         {
     563             :             /* don't error out, this is just a performance optimization */
     564             :             ereport(WARNING,
     565             :                     (errcode_for_file_access(),
     566             :                      errmsg("could not flush dirty data: %m")));
     567             :         }
     568             : 
     569             :         return;
     570             :     }
     571             : #endif
     572             : }
     573             : 
     574             : 
     575             : /*
     576             :  * fsync_fname -- fsync a file or directory, handling errors properly
     577             :  *
     578             :  * Try to fsync a file or directory. When doing the latter, ignore errors that
     579             :  * indicate the OS just doesn't allow/require fsyncing directories.
                     :  *
                     :  * fname is the path to sync; isdir is forwarded unchanged.
                     :  *
                     :  * This is a thin wrapper: it calls fsync_fname_ext() with its third
                     :  * argument false and a severity of data_sync_elevel(ERROR), and discards
                     :  * that call's return value, so this function reports nothing to its caller.
     580             :  */
     581             : void
     582       13340 : fsync_fname(const char *fname, bool isdir)
     583             : {
     584       13340 :     fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
     585       13340 : }
     586             : 
     587             : /*
     588             :  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
     589             :  *
     590             :  * This routine ensures that, after returning, the effect of renaming file
     591             :  * persists in case of a crash. A crash while this routine is running will
     592             :  * leave you with either the pre-existing or the moved file in place of the
     593             :  * new file; no mixed state or truncated files are possible.
     594             :  *
     595             :  * It does so by using fsync on the old filename and the possibly existing
     596             :  * target filename before the rename, and the target file and directory after.
     597             :  *
     598             :  * Note that rename() cannot be used across arbitrary directories, as they
     599             :  * might not be on the same filesystem. Therefore this routine does not
     600             :  * support renaming across directories.
     601             :  *
     602             :  * Log errors with the caller specified severity.
     603             :  *
     604             :  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
     605             :  * valid upon return.
                     :  *
                     :  * oldfile and newfile are the two paths handed to rename(2).  elevel is
                     :  * used for every ereport() issued here and is also passed on to
                     :  * fsync_fname_ext() and fsync_parent_path().
                     :  *
                     :  * An open() failure on newfile with errno ENOENT is not an error: the
                     :  * pre-rename fsync of the target is simply skipped in that case.
                     :  *
                     :  * NOTE(review): the "return -1" statements after each ereport() are only
                     :  * reached if ereport(elevel, ...) returns; presumably that depends on
                     :  * elevel being below ERROR -- confirm against the ereport() contract.
     606             :  */
     607             : int
     608        2936 : durable_rename(const char *oldfile, const char *newfile, int elevel)
     609             : {
     610             :     int         fd;
     611             : 
     612             :     /*
     613             :      * First fsync the old and target path (if it exists), to ensure that they
     614             :      * are properly persistent on disk. Syncing the target file is not
     615             :      * strictly necessary, but it makes it easier to reason about crashes;
     616             :      * because it's then guaranteed that either source or target file exists
     617             :      * after a crash.
     618             :      */
     619        2936 :     if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
     620           0 :         return -1;
     621             : 
     622        2936 :     fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
     623        2936 :     if (fd < 0)
     624             :     {
     625         404 :         if (errno != ENOENT)
     626             :         {
     627           0 :             ereport(elevel,
     628             :                     (errcode_for_file_access(),
     629             :                      errmsg("could not open file \"%s\": %m", newfile)));
     630           0 :             return -1;
     631             :         }
     632             :     }
     633             :     else
     634             :     {
     635        2532 :         if (pg_fsync(fd) != 0)
     636             :         {
     637             :             int         save_errno;
     638             : 
     639             :             /*
                     :              * close file upon error, might not be in transaction context;
                     :              * errno is saved and restored around the close so that the %m
                     :              * below still describes the pg_fsync() failure
                     :              */
     640           0 :             save_errno = errno;
     641           0 :             CloseTransientFile(fd);
     642           0 :             errno = save_errno;
     643             : 
     644           0 :             ereport(elevel,
     645             :                     (errcode_for_file_access(),
     646             :                      errmsg("could not fsync file \"%s\": %m", newfile)));
     647           0 :             return -1;
     648             :         }
     649             : 
     650        2532 :         if (CloseTransientFile(fd))
     651             :         {
     652           0 :             ereport(elevel,
     653             :                     (errcode_for_file_access(),
     654             :                      errmsg("could not close file \"%s\": %m", newfile)));
     655           0 :             return -1;
     656             :         }
     657             :     }
     658             : 
     659             :     /* Time to do the real deal... */
     660        2936 :     if (rename(oldfile, newfile) < 0)
     661             :     {
     662           0 :         ereport(elevel,
     663             :                 (errcode_for_file_access(),
     664             :                  errmsg("could not rename file \"%s\" to \"%s\": %m",
     665             :                         oldfile, newfile)));
     666           0 :         return -1;
     667             :     }
     668             : 
     669             :     /*
     670             :      * To guarantee renaming the file is persistent, fsync the file with its
     671             :      * new name, and its containing directory.
     672             :      */
     673        2936 :     if (fsync_fname_ext(newfile, false, false, elevel) != 0)
     674           0 :         return -1;
     675             : 
     676        2936 :     if (fsync_parent_path(newfile, elevel) != 0)
     677           0 :         return -1;
     678             : 
     679        2936 :     return 0;
     680             : }
     681             : 
     682             : /*
     683             :  * durable_unlink -- remove a file in a durable manner
     684             :  *
     685             :  * This routine ensures that, after returning, the effect of removing file
     686             :  * persists in case of a crash. A crash while this routine is running will
     687             :  * leave the system in no mixed state.
     688             :  *
     689             :  * It does so by using fsync on the parent directory of the file after the
     690             :  * actual removal is done.
     691             :  *
     692             :  * Log errors with the severity specified by caller.
     693             :  *
     694             :  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
     695             :  * valid upon return.
                     :  *
                     :  * fname is the path handed to unlink(2); elevel is used for the ereport()
                     :  * here and is passed on to fsync_parent_path().  No errno value is
                     :  * special-cased: any unlink() failure is reported at elevel and, if
                     :  * ereport() returns, yields -1 without attempting the parent-directory
                     :  * fsync.
     696             :  */
     697             : int
     698         404 : durable_unlink(const char *fname, int elevel)
     699             : {
     700         404 :     if (unlink(fname) < 0)
     701             :     {
     702         354 :         ereport(elevel,
     703             :                 (errcode_for_file_access(),
     704             :                  errmsg("could not remove file \"%s\": %m",
     705             :                         fname)));
     706         354 :         return -1;
     707             :     }
     708             : 
     709             :     /*
     710             :      * To guarantee that the removal of the file is persistent, fsync its
     711             :      * parent directory.
     712             :      */
     713          50 :     if (fsync_parent_path(fname, elevel) != 0)
     714           0 :         return -1;
     715             : 
     716          50 :     return 0;
     717             : }
     718             : 
     719             : /*
     720             :  * durable_link_or_rename -- rename a file in a durable manner.
     721             :  *
     722             :  * Similar to durable_rename(), except that this routine tries (but does not
     723             :  * guarantee) not to overwrite the target file.
     724             :  *
     725             :  * Note that a crash in an unfortunate moment can leave you with two links to
     726             :  * the target file.
     727             :  *
     728             :  * Log errors with the caller specified severity.
     729             :  *
     730             :  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
     731             :  * valid upon return.
                     :  *
                     :  * When HAVE_WORKING_LINK is true the new name is created with link(2) and
                     :  * the old name is then removed with unlink(2); otherwise rename(2) is used
                     :  * directly.  elevel is used for every ereport() here and is passed on to
                     :  * fsync_fname_ext() and fsync_parent_path().
                     :  *
                     :  * NOTE(review): the return value of unlink(oldfile) is not checked, so a
                     :  * failed unlink can still end in a 0 return with both names present.
     732             :  */
     733             : int
     734        1376 : durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
     735             : {
     736             :     /*
     737             :      * Ensure that, if we crash directly after the rename/link, a file with
     738             :      * valid contents is moved into place.
     739             :      */
     740        1376 :     if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
     741           0 :         return -1;
     742             : 
     743             : #if HAVE_WORKING_LINK
     744        1376 :     if (link(oldfile, newfile) < 0)
     745             :     {
     746           0 :         ereport(elevel,
     747             :                 (errcode_for_file_access(),
     748             :                  errmsg("could not link file \"%s\" to \"%s\": %m",
     749             :                         oldfile, newfile)));
     750           0 :         return -1;
     751             :     }
     752        1376 :     unlink(oldfile);
     753             : #else
     754             :     /* XXX: Add racy file existence check? */
     755             :     if (rename(oldfile, newfile) < 0)
     756             :     {
     757             :         ereport(elevel,
     758             :                 (errcode_for_file_access(),
     759             :                  errmsg("could not rename file \"%s\" to \"%s\": %m",
     760             :                         oldfile, newfile)));
     761             :         return -1;
     762             :     }
     763             : #endif
     764             : 
     765             :     /*
     766             :      * Make change persistent in case of an OS crash, both the new entry and
     767             :      * its parent directory need to be flushed.
     768             :      */
     769        1376 :     if (fsync_fname_ext(newfile, false, false, elevel) != 0)
     770           0 :         return -1;
     771             : 
     772             :     /* Same for parent directory */
     773        1376 :     if (fsync_parent_path(newfile, elevel) != 0)
     774           0 :         return -1;
     775             : 
     776        1376 :     return 0;
     777             : }
     778             : 
     779             : /*
     780             :  * InitFileAccess --- initialize this module during backend startup
     781             :  *
     782             :  * This is called during either normal or standalone backend start.
     783             :  * It is *not* called in the postmaster.
                     :  *
                     :  * Allocates a single Vfd with malloc() as entry 0 of VfdCache, zeroes it,
                     :  * marks its fd as VFD_CLOSED and sets SizeVfdCache to 1.  Failure of that
                     :  * malloc() is reported with ereport(FATAL).  Finally AtProcExit_Files is
                     :  * registered through on_proc_exit().
     784             :  */
     785             : void
     786       12256 : InitFileAccess(void)
     787             : {
     788             :     Assert(SizeVfdCache == 0);  /* call me only once */
     789             : 
     790             :     /* initialize cache header entry */
     791       12256 :     VfdCache = (Vfd *) malloc(sizeof(Vfd));
     792       12256 :     if (VfdCache == NULL)
     793           0 :         ereport(FATAL,
     794             :                 (errcode(ERRCODE_OUT_OF_MEMORY),
     795             :                  errmsg("out of memory")));
     796             : 
     797       12256 :     MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
     798       12256 :     VfdCache->fd = VFD_CLOSED;
     799             : 
     800       12256 :     SizeVfdCache = 1;
     801             : 
     802             :     /* register proc-exit hook to ensure temp files are dropped at exit */
     803       12256 :     on_proc_exit(AtProcExit_Files, 0);
     804       12256 : }
     805             : 
     806             : /*
     807             :  * count_usable_fds --- count how many FDs the system will let us open,
     808             :  *      and estimate how many are already open.
     809             :  *
     810             :  * We stop counting if usable_fds reaches max_to_probe.  Note: a small
     811             :  * value of max_to_probe might result in an underestimate of already_open;
     812             :  * we must fill in any "gaps" in the set of used FDs before the calculation
     813             :  * of already_open will give the right answer.  In practice, max_to_probe
     814             :  * of a couple of dozen should be enough to ensure good results.
     815             :  *
     816             :  * We assume stdin (FD 0) is available for dup'ing
                     :  *
                     :  * max_to_probe: stop after this many successful dup(0) calls.
                     :  * *usable_fds: set to the number of successful dups.
                     :  * *already_open: set to highestfd + 1 - (number of successful dups).
                     :  *
                     :  * All descriptors obtained while probing are closed again before returning.
                     :  * If getrlimit() fails, a WARNING is emitted and the rlimit-based early exit
                     :  * in the loop is simply never taken.
                     :  *
                     :  * NOTE(review): "highestfd >= rlim.rlim_cur - 1" mixes an int with rlim_t;
                     :  * presumably rlim_t is unsigned, in which case rlim_cur == 0 would wrap --
                     :  * confirm whether that case can occur.
     817             :  */
     818             : static void
     819         572 : count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
     820             : {
     821             :     int        *fd;
     822             :     int         size;
     823         572 :     int         used = 0;
     824         572 :     int         highestfd = 0;
     825             :     int         j;
     826             : 
     827             : #ifdef HAVE_GETRLIMIT
     828             :     struct rlimit rlim;
     829             :     int         getrlimit_status;
     830             : #endif
     831             : 
     832         572 :     size = 1024;
     833         572 :     fd = (int *) palloc(size * sizeof(int));
     834             : 
     835             : #ifdef HAVE_GETRLIMIT
     836             : #ifdef RLIMIT_NOFILE            /* most platforms use RLIMIT_NOFILE */
     837         572 :     getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
     838             : #else                           /* but BSD doesn't ... */
     839             :     getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
     840             : #endif                          /* RLIMIT_NOFILE */
     841         572 :     if (getrlimit_status != 0)
     842           0 :         ereport(WARNING, (errmsg("getrlimit failed: %m")));
     843             : #endif                          /* HAVE_GETRLIMIT */
     844             : 
     845             :     /* dup until failure or probe limit reached */
     846             :     for (;;)
     847      571428 :     {
     848             :         int         thisfd;
     849             : 
     850             : #ifdef HAVE_GETRLIMIT
     851             : 
     852             :         /*
     853             :          * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
     854             :          * some platforms
     855             :          */
     856      572000 :         if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
     857           0 :             break;
     858             : #endif
     859             : 
     860      572000 :         thisfd = dup(0);
     861      572000 :         if (thisfd < 0)
     862             :         {
     863             :             /* Expect EMFILE or ENFILE, else it's fishy */
     864           0 :             if (errno != EMFILE && errno != ENFILE)
     865           0 :                 elog(WARNING, "dup(0) failed after %d successes: %m", used);
     866           0 :             break;
     867             :         }
     868             : 
     869      572000 :         if (used >= size)
     870             :         {
     871           0 :             size *= 2;
     872           0 :             fd = (int *) repalloc(fd, size * sizeof(int));
     873             :         }
     874      572000 :         fd[used++] = thisfd;
     875             : 
     876      572000 :         if (highestfd < thisfd)
     877      572000 :             highestfd = thisfd;
     878             : 
     879      572000 :         if (used >= max_to_probe)
     880         572 :             break;
     881             :     }
     882             : 
     883             :     /* release the files we opened */
     884      572572 :     for (j = 0; j < used; j++)
     885      572000 :         close(fd[j]);
     886             : 
     887         572 :     pfree(fd);
     888             : 
     889             :     /*
     890             :      * Return results.  usable_fds is just the number of successful dups. We
     891             :      * assume that the system limit is highestfd+1 (remember 0 is a legal FD
     892             :      * number) and so already_open is highestfd+1 - usable_fds.
     893             :      */
     894         572 :     *usable_fds = used;
     895         572 :     *already_open = highestfd + 1 - used;
     896         572 : }
     897             : 
     898             : /*
     899             :  * set_max_safe_fds
     900             :  *      Determine number of file descriptors that fd.c is allowed to use
                     :  *
                     :  * Takes no arguments; reads max_files_per_process and stores the result in
                     :  * max_safe_fds.  Reports FATAL if the result is below FD_MINFREE.
     901             :  */
     902             : void
     903         572 : set_max_safe_fds(void)
     904             : {
     905             :     int         usable_fds;
     906             :     int         already_open;
     907             : 
     908             :     /*----------
     909             :      * We want to set max_safe_fds to
     910             :      *          MIN(usable_fds, max_files_per_process - already_open)
     911             :      * less the slop factor for files that are opened without consulting
     912             :      * fd.c.  This ensures that we won't exceed either max_files_per_process
     913             :      * or the experimentally-determined EMFILE limit.
     914             :      *----------
     915             :      */
     916         572 :     count_usable_fds(max_files_per_process,
     917             :                      &usable_fds, &already_open);
     918             : 
     919         572 :     max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
     920             : 
     921             :     /*
     922             :      * Take off the FDs reserved for system() etc.
     923             :      */
     924         572 :     max_safe_fds -= NUM_RESERVED_FDS;
     925             : 
     926             :     /*
     927             :      * Make sure we still have enough to get by.
                     :      * The errdetail adds NUM_RESERVED_FDS back onto both numbers, so the
                     :      * message reports them before the reservation was subtracted.
     928             :      */
     929         572 :     if (max_safe_fds < FD_MINFREE)
     930           0 :         ereport(FATAL,
     931             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
     932             :                  errmsg("insufficient file descriptors available to start server process"),
     933             :                  errdetail("System allows %d, we need at least %d.",
     934             :                            max_safe_fds + NUM_RESERVED_FDS,
     935             :                            FD_MINFREE + NUM_RESERVED_FDS)));
     936             : 
     937         572 :     elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
     938             :          max_safe_fds, usable_fds, already_open);
     939         572 : }
     940             : 
     941             : /*
     942             :  * Open a file with BasicOpenFilePerm() and pass default file mode for the
     943             :  * fileMode parameter.
                     :  *
                     :  * The mode passed is pg_file_create_mode; fileName and fileFlags are
                     :  * forwarded unchanged, and BasicOpenFilePerm()'s result is returned as is.
     944             :  */
     945             : int
     946       17992 : BasicOpenFile(const char *fileName, int fileFlags)
     947             : {
     948       17992 :     return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
     949             : }
     950             : 
     951             : /*
     952             :  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
     953             :  *
     954             :  * This is exported for use by places that really want a plain kernel FD,
     955             :  * but need to be proof against running out of FDs.  Once an FD has been
     956             :  * successfully returned, it is the caller's responsibility to ensure that
     957             :  * it will not be leaked on ereport()!  Most users should *not* call this
     958             :  * routine directly, but instead use the VFD abstraction level, which
     959             :  * provides protection against descriptor leaks as well as management of
     960             :  * files that need to be open for more than a short period of time.
     961             :  *
     962             :  * Ideally this should be the *only* direct call of open() in the backend.
     963             :  * In practice, the postmaster calls open() directly, and there are some
     964             :  * direct open() calls done early in backend startup.  Those are OK since
     965             :  * this module wouldn't have any open files to close at that point anyway.
                     :  *
                     :  * Returns the descriptor from open(2) on success, or -1 on failure.  When
                     :  * open() fails with EMFILE or ENFILE, a LOG message is emitted and
                     :  * ReleaseLruFile() is called; if it reports that a file was freed, the
                     :  * open() is retried, otherwise errno is restored to the value open() left
                     :  * and -1 is returned.  Any other open() failure returns -1 immediately.
     966             :  */
     967             : int
     968     2192152 : BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
     969             : {
     970             :     int         fd;
     971             : 
     972             : tryAgain:
     973     2192152 :     fd = open(fileName, fileFlags, fileMode);
     974             : 
     975     2192152 :     if (fd >= 0)
     976     1922364 :         return fd;              /* success! */
     977             : 
     978      269788 :     if (errno == EMFILE || errno == ENFILE)
     979             :     {
     980           0 :         int         save_errno = errno;
     981             : 
     982           0 :         ereport(LOG,
     983             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
     984             :                  errmsg("out of file descriptors: %m; release and retry")));
     985           0 :         errno = 0;
     986           0 :         if (ReleaseLruFile())
     987           0 :             goto tryAgain;
     988           0 :         errno = save_errno;
     989             :     }
     990             : 
     991      269788 :     return -1;                  /* failure */
     992             : }
     993             : 
     994             : #if defined(FDDEBUG)
     995             : /*
                     :  * _dump_lru --- debugging aid, compiled only when FDDEBUG is defined.
                     :  *
                     :  * Starting at VfdCache[0].lruLessRecently, follows the lruLessRecently
                     :  * links and appends each index to a local 2048-byte buffer until index 0
                     :  * is reached (that final 0 is printed too), then emits the buffer with
                     :  * elog(LOG).
                     :  */
     996             : static void
     997             : _dump_lru(void)
     998             : {
     999             :     int         mru = VfdCache[0].lruLessRecently;
    1000             :     Vfd        *vfdP = &VfdCache[mru];
    1001             :     char        buf[2048];
    1002             : 
    1003             :     snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
    1004             :     while (mru != 0)
    1005             :     {
    1006             :         mru = vfdP->lruLessRecently;
    1007             :         vfdP = &VfdCache[mru];
    1008             :         snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
    1009             :     }
    1010             :     snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
    1011             :     elog(LOG, "%s", buf);
    1012             : }
    1013             : #endif                          /* FDDEBUG */
    1014             : /*
                     :  * Delete --- unlink VFD "file" from the LRU ring.
                     :  *
                     :  * The two ring neighbours of the entry are pointed at each other.  The
                     :  * entry's own lruMoreRecently/lruLessRecently fields are left as they were,
                     :  * and nothing is closed here.  "file" must not be 0 (asserted).
                     :  */
    1015             : static void
    1016     1523548 : Delete(File file)
    1017             : {
    1018             :     Vfd        *vfdP;
    1019             : 
    1020             :     Assert(file != 0);
    1021             : 
    1022             :     DO_DB(elog(LOG, "Delete %d (%s)",
    1023             :                file, VfdCache[file].fileName));
    1024             :     DO_DB(_dump_lru());
    1025             : 
    1026     1523548 :     vfdP = &VfdCache[file];
    1027             : 
    1028     1523548 :     VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
    1029     1523548 :     VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
    1030             : 
    1031             :     DO_DB(_dump_lru());
    1032     1523548 : }
    1033             : /*
                     :  * LruDelete --- close the kernel FD behind VFD "file" and take the VFD
                     :  * out of the LRU ring.
                     :  *
                     :  * The VFD is marked VFD_CLOSED and nfile is decremented whether or not
                     :  * close() succeeded; a close() failure is only reported through elog().
                     :  * "file" must not be 0 (asserted).
                     :  */
    1034             : static void
    1035      378378 : LruDelete(File file)
    1036             : {
    1037             :     Vfd        *vfdP;
    1038             : 
    1039             :     Assert(file != 0);
    1040             : 
    1041             :     DO_DB(elog(LOG, "LruDelete %d (%s)",
    1042             :                file, VfdCache[file].fileName));
    1043             : 
    1044      378378 :     vfdP = &VfdCache[file];
    1045             : 
    1046             :     /*
    1047             :      * Close the file.  We aren't expecting this to fail; if it does, better
    1048             :      * to leak the FD than to mess up our internal state.
                     :      *
                     :      * The message level is plain LOG when FD_TEMP_FILE_LIMIT is set in
                     :      * fdstate, and data_sync_elevel(LOG) otherwise.
    1049             :      */
    1050      378378 :     if (close(vfdP->fd))
    1051           0 :         elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
    1052             :              "could not close file \"%s\": %m", vfdP->fileName);
    1053      378378 :     vfdP->fd = VFD_CLOSED;
    1054      378378 :     --nfile;
    1055             : 
    1056             :     /* delete the vfd record from the LRU ring */
    1057      378378 :     Delete(file);
    1058      378378 : }
    1059             : /*
                     :  * Insert --- link VFD "file" into the LRU ring directly next to the header
                     :  * entry VfdCache[0], as the new VfdCache[0].lruLessRecently.
                     :  *
                     :  * The previous occupant of that position becomes the new entry's
                     :  * lruLessRecently neighbour.  "file" must not be 0 (asserted).
                     :  */
    1060             : static void
    1061     1702686 : Insert(File file)
    1062             : {
    1063             :     Vfd        *vfdP;
    1064             : 
    1065             :     Assert(file != 0);
    1066             : 
    1067             :     DO_DB(elog(LOG, "Insert %d (%s)",
    1068             :                file, VfdCache[file].fileName));
    1069             :     DO_DB(_dump_lru());
    1070             : 
    1071     1702686 :     vfdP = &VfdCache[file];
    1072             : 
    1073     1702686 :     vfdP->lruMoreRecently = 0;
    1074     1702686 :     vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
    1075     1702686 :     VfdCache[0].lruLessRecently = file;
    1076     1702686 :     VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
    1077             : 
    1078             :     DO_DB(_dump_lru());
    1079     1702686 : }
    1080             : 
    1081             : /*
                     :  * LruInsert --- make sure VFD "file" has an open kernel FD, then put it
                     :  * into the LRU ring with Insert().
                     :  *
                     :  * If FileIsNotOpen(file), ReleaseLruFiles() is called first and the file is
                     :  * reopened through BasicOpenFilePerm() with the fileName, fileFlags and
                     :  * fileMode stored in the VFD; nfile is incremented on success.  If the
                     :  * reopen fails, the VFD is not inserted.
                     :  *
                     :  * returns 0 on success, -1 on re-open failure (with errno set)
                     :  */
    1082             : static int
    1083      204476 : LruInsert(File file)
    1084             : {
    1085             :     Vfd        *vfdP;
    1086             : 
    1087             :     Assert(file != 0);
    1088             : 
    1089             :     DO_DB(elog(LOG, "LruInsert %d (%s)",
    1090             :                file, VfdCache[file].fileName));
    1091             : 
    1092      204476 :     vfdP = &VfdCache[file];
    1093             : 
    1094      204476 :     if (FileIsNotOpen(file))
    1095             :     {
    1096             :         /* Close excess kernel FDs. */
    1097      204476 :         ReleaseLruFiles();
    1098             : 
    1099             :         /*
    1100             :          * The open could still fail for lack of file descriptors, eg due to
    1101             :          * overall system file table being full.  So, be prepared to release
    1102             :          * another FD if necessary...
    1103             :          */
    1104      204476 :         vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
    1105             :                                      vfdP->fileMode);
    1106      204476 :         if (vfdP->fd < 0)
    1107             :         {
    1108             :             DO_DB(elog(LOG, "re-open failed: %m"));
    1109           0 :             return -1;
    1110             :         }
    1111             :         else
    1112             :         {
    1113      204476 :             ++nfile;
    1114             :         }
    1115             :     }
    1116             : 
    1117             :     /*
    1118             :      * put it at the head of the Lru ring
    1119             :      */
    1120             : 
    1121      204476 :     Insert(file);
    1122             : 
    1123      204476 :     return 0;
    1124             : }
    1125             : 
    1126             : /*
    1127             :  * Release one kernel FD by closing the least-recently-used VFD.
                     :  *
                     :  * The victim is VfdCache[0].lruMoreRecently, which is handed to LruDelete().
                     :  * Returns true if a file was closed, false when nfile is not positive and
                     :  * there is therefore nothing to close.
    1128             :  */
    1129             : static bool
    1130      378352 : ReleaseLruFile(void)
    1131             : {
    1132             :     DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
    1133             : 
    1134      378352 :     if (nfile > 0)
    1135             :     {
    1136             :         /*
    1137             :          * There are opened files and so there should be at least one used vfd
    1138             :          * in the ring.
    1139             :          */
    1140             :         Assert(VfdCache[0].lruMoreRecently != 0);
    1141      378352 :         LruDelete(VfdCache[0].lruMoreRecently);
    1142      378352 :         return true;            /* freed a file */
    1143             :     }
    1144           0 :     return false;               /* no files available to free */
    1145             : }
    1146             : 
    1147             : /*
    1148             :  * Release kernel FDs as needed to get under the max_safe_fds limit.
    1149             :  * After calling this, it's OK to try to open another file.
                     :  *
                     :  * Keeps calling ReleaseLruFile() while nfile + numAllocatedDescs is at or
                     :  * above max_safe_fds, and gives up as soon as that call returns false.
    1150             :  */
    1151             : static void
    1152     2384028 : ReleaseLruFiles(void)
    1153             : {
    1154     5146408 :     while (nfile + numAllocatedDescs >= max_safe_fds)
    1155             :     {
    1156      378352 :         if (!ReleaseLruFile())
    1157           0 :             break;
    1158             :     }
    1159     2384028 : }
    1160             : /*
                     :  * AllocateVfd --- hand out the index of an unused VfdCache entry.
                     :  *
                     :  * Entries come from the free list headed by VfdCache[0].nextFree.  When
                     :  * that list is empty the array is first enlarged with realloc() and the
                     :  * new entries are chained into the free list.  Out of memory is reported
                     :  * with ereport(ERROR).
                     :  *
                     :  * NOTE(review): since the array may be realloc()'d here, any Vfd pointer
                     :  * obtained before this call should presumably be re-fetched -- confirm
                     :  * against callers.
                     :  */
    1161             : static File
    1162     1332150 : AllocateVfd(void)
    1163             : {
    1164             :     Index       i;
    1165             :     File        file;
    1166             : 
    1167             :     DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
    1168             : 
    1169             :     Assert(SizeVfdCache > 0);    /* InitFileAccess not called? */
    1170             : 
    1171     1332150 :     if (VfdCache[0].nextFree == 0)
    1172             :     {
    1173             :         /*
    1174             :          * The free list is empty so it is time to increase the size of the
    1175             :          * array.  We choose to double it each time this happens. However,
    1176             :          * there's not much point in starting *real* small.
    1177             :          */
    1178       14472 :         Size        newCacheSize = SizeVfdCache * 2;
    1179             :         Vfd        *newVfdCache;
    1180             : 
    1181       14472 :         if (newCacheSize < 32)
    1182       10004 :             newCacheSize = 32;
    1183             : 
    1184             :         /*
    1185             :          * Be careful not to clobber VfdCache ptr if realloc fails.
    1186             :          */
    1187       14472 :         newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
    1188       14472 :         if (newVfdCache == NULL)
    1189           0 :             ereport(ERROR,
    1190             :                     (errcode(ERRCODE_OUT_OF_MEMORY),
    1191             :                      errmsg("out of memory")));
    1192       14472 :         VfdCache = newVfdCache;
    1193             : 
    1194             :         /*
    1195             :          * Initialize the new entries and link them into the free list.
                     :          * Each new entry points at its successor; the last one is then
                     :          * terminated with 0 and the list head is set to the first new
                     :          * entry, whose index is the old SizeVfdCache.
    1196             :          */
    1197      677492 :         for (i = SizeVfdCache; i < newCacheSize; i++)
    1198             :         {
    1199      663020 :             MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
    1200      663020 :             VfdCache[i].nextFree = i + 1;
    1201      663020 :             VfdCache[i].fd = VFD_CLOSED;
    1202             :         }
    1203       14472 :         VfdCache[newCacheSize - 1].nextFree = 0;
    1204       14472 :         VfdCache[0].nextFree = SizeVfdCache;
    1205             : 
    1206             :         /*
    1207             :          * Record the new size
    1208             :          */
    1209       14472 :         SizeVfdCache = newCacheSize;
    1210             :     }
    1211             : 
    1212     1332150 :     file = VfdCache[0].nextFree;
    1213             : 
    1214     1332150 :     VfdCache[0].nextFree = VfdCache[file].nextFree;
    1215             : 
    1216     1332150 :     return file;
    1217             : }
    1218             : 
    1219             : static void
    1220     1087552 : FreeVfd(File file)
    1221             : {
    1222     1087552 :     Vfd        *vfdP = &VfdCache[file];
    1223             : 
    1224             :     DO_DB(elog(LOG, "FreeVfd: %d (%s)",
    1225             :                file, vfdP->fileName ? vfdP->fileName : ""));
    1226             : 
    1227     1087552 :     if (vfdP->fileName != NULL)
    1228             :     {
    1229      818968 :         free(vfdP->fileName);
    1230      818968 :         vfdP->fileName = NULL;
    1231             :     }
    1232     1087552 :     vfdP->fdstate = 0x0;
    1233             : 
    1234     1087552 :     vfdP->nextFree = VfdCache[0].nextFree;
    1235     1087552 :     VfdCache[0].nextFree = file;
    1236     1087552 : }
    1237             : 
    1238             : /* returns 0 on success, -1 on re-open failure (with errno set) */
    1239             : static int
    1240     1943474 : FileAccess(File file)
    1241             : {
    1242             :     int         returnValue;
    1243             : 
    1244             :     DO_DB(elog(LOG, "FileAccess %d (%s)",
    1245             :                file, VfdCache[file].fileName));
    1246             : 
    1247             :     /*
    1248             :      * Is the file open?  If not, open it and put it at the head of the LRU
    1249             :      * ring (possibly closing the least recently used file to get an FD).
    1250             :      */
    1251             : 
    1252     1943474 :     if (FileIsNotOpen(file))
    1253             :     {
    1254      204476 :         returnValue = LruInsert(file);
    1255      204476 :         if (returnValue != 0)
    1256           0 :             return returnValue;
    1257             :     }
    1258     1738998 :     else if (VfdCache[0].lruLessRecently != file)
    1259             :     {
    1260             :         /*
    1261             :          * We now know that the file is open and that it is not the last one
    1262             :          * accessed, so we need to move it to the head of the Lru ring.
    1263             :          */
    1264             : 
    1265      434644 :         Delete(file);
    1266      434644 :         Insert(file);
    1267             :     }
    1268             : 
    1269     1943474 :     return 0;
    1270             : }
    1271             : 
    1272             : /*
    1273             :  * Called whenever a temporary file is deleted to report its size.
    1274             :  */
    1275             : static void
    1276        3216 : ReportTemporaryFileUsage(const char *path, off_t size)
    1277             : {
    1278        3216 :     pgstat_report_tempfile(size);
    1279             : 
    1280        3216 :     if (log_temp_files >= 0)
    1281             :     {
    1282        1650 :         if ((size / 1024) >= log_temp_files)
    1283         180 :             ereport(LOG,
    1284             :                     (errmsg("temporary file: path \"%s\", size %lu",
    1285             :                             path, (unsigned long) size)));
    1286             :     }
    1287        3216 : }
    1288             : 
    1289             : /*
    1290             :  * Called to register a temporary file for automatic close.
    1291             :  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
    1292             :  * before the file was opened.
    1293             :  */
    1294             : static void
    1295        4516 : RegisterTemporaryFile(File file)
    1296             : {
    1297        4516 :     ResourceOwnerRememberFile(CurrentResourceOwner, file);
    1298        4516 :     VfdCache[file].resowner = CurrentResourceOwner;
    1299             : 
    1300             :     /* Backup mechanism for closing at end of xact. */
    1301        4516 :     VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
    1302        4516 :     have_xact_temporary_files = true;
    1303        4516 : }
    1304             : 
    1305             : /*
    1306             :  *  Called when we get a shared invalidation message on some relation.
    1307             :  */
    1308             : #ifdef NOT_USED
    1309             : void
    1310             : FileInvalidate(File file)
    1311             : {
    1312             :     Assert(FileIsValid(file));
    1313             :     if (!FileIsNotOpen(file))
    1314             :         LruDelete(file);
    1315             : }
    1316             : #endif
    1317             : 
    1318             : /*
    1319             :  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
    1320             :  * fileMode parameter.
    1321             :  */
    1322             : File
    1323     1332150 : PathNameOpenFile(const char *fileName, int fileFlags)
    1324             : {
    1325     1332150 :     return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
    1326             : }
    1327             : 
    1328             : /*
    1329             :  * open a file in an arbitrary directory
    1330             :  *
    1331             :  * NB: if the passed pathname is relative (which it usually is),
    1332             :  * it will be interpreted relative to the process' working directory
    1333             :  * (which should always be $PGDATA when this code is running).
    1334             :  */
    1335             : File
    1336     1332150 : PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
    1337             : {
    1338             :     char       *fnamecopy;
    1339             :     File        file;
    1340             :     Vfd        *vfdP;
    1341             : 
    1342             :     DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
    1343             :                fileName, fileFlags, fileMode));
    1344             : 
    1345             :     /*
    1346             :      * We need a malloc'd copy of the file name; fail cleanly if no room.
    1347             :      */
    1348     1332150 :     fnamecopy = strdup(fileName);
    1349     1332150 :     if (fnamecopy == NULL)
    1350           0 :         ereport(ERROR,
    1351             :                 (errcode(ERRCODE_OUT_OF_MEMORY),
    1352             :                  errmsg("out of memory")));
    1353             : 
    1354     1332150 :     file = AllocateVfd();
    1355     1332150 :     vfdP = &VfdCache[file];
    1356             : 
    1357             :     /* Close excess kernel FDs. */
    1358     1332150 :     ReleaseLruFiles();
    1359             : 
    1360     1332150 :     vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
    1361             : 
    1362     1332150 :     if (vfdP->fd < 0)
    1363             :     {
    1364      268584 :         int         save_errno = errno;
    1365             : 
    1366      268584 :         FreeVfd(file);
    1367      268584 :         free(fnamecopy);
    1368      268584 :         errno = save_errno;
    1369      268584 :         return -1;
    1370             :     }
    1371     1063566 :     ++nfile;
    1372             :     DO_DB(elog(LOG, "PathNameOpenFile: success %d",
    1373             :                vfdP->fd));
    1374             : 
    1375     1063566 :     Insert(file);
    1376             : 
    1377     1063566 :     vfdP->fileName = fnamecopy;
    1378             :     /* Saved flags are adjusted to be OK for re-opening file */
    1379     1063566 :     vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
    1380     1063566 :     vfdP->fileMode = fileMode;
    1381     1063566 :     vfdP->fileSize = 0;
    1382     1063566 :     vfdP->fdstate = 0x0;
    1383     1063566 :     vfdP->resowner = NULL;
    1384             : 
    1385     1063566 :     return file;
    1386             : }
    1387             : 
    1388             : /*
    1389             :  * Create directory 'directory'.  If necessary, create 'basedir', which must
    1390             :  * be the directory above it.  This is designed for creating the top-level
    1391             :  * temporary directory on demand before creating a directory underneath it.
    1392             :  * Do nothing if the directory already exists.
    1393             :  *
    1394             :  * Directories created within the top-level temporary directory should begin
    1395             :  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
    1396             :  * deleted at startup by RemovePgTempFiles().  Further subdirectories below
    1397             :  * that do not need any particular prefix.
    1398             : */
    1399             : void
    1400         178 : PathNameCreateTemporaryDir(const char *basedir, const char *directory)
    1401             : {
    1402         178 :     if (MakePGDirectory(directory) < 0)
    1403             :     {
    1404           4 :         if (errno == EEXIST)
    1405           0 :             return;
    1406             : 
    1407             :         /*
    1408             :          * Failed.  Try to create basedir first in case it's missing. Tolerate
    1409             :          * EEXIST to close a race against another process following the same
    1410             :          * algorithm.
    1411             :          */
    1412           4 :         if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
    1413           0 :             ereport(ERROR,
    1414             :                     (errcode_for_file_access(),
    1415             :                      errmsg("cannot create temporary directory \"%s\": %m",
    1416             :                             basedir)));
    1417             : 
    1418             :         /* Try again. */
    1419           4 :         if (MakePGDirectory(directory) < 0 && errno != EEXIST)
    1420           0 :             ereport(ERROR,
    1421             :                     (errcode_for_file_access(),
    1422             :                      errmsg("cannot create temporary subdirectory \"%s\": %m",
    1423             :                             directory)));
    1424             :     }
    1425             : }
    1426             : 
    1427             : /*
    1428             :  * Delete a directory and everything in it, if it exists.
    1429             :  */
    1430             : void
    1431         216 : PathNameDeleteTemporaryDir(const char *dirname)
    1432             : {
    1433             :     struct stat statbuf;
    1434             : 
    1435             :     /* Silently ignore missing directory. */
    1436         216 :     if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
    1437          40 :         return;
    1438             : 
    1439             :     /*
    1440             :      * Currently, walkdir doesn't offer a way for our passed in function to
    1441             :      * maintain state.  Perhaps it should, so that we could tell the caller
    1442             :      * whether this operation succeeded or failed.  Since this operation is
    1443             :      * used in a cleanup path, we wouldn't actually behave differently: we'll
    1444             :      * just log failures.
    1445             :      */
    1446         176 :     walkdir(dirname, unlink_if_exists_fname, false, LOG);
    1447             : }
    1448             : 
    1449             : /*
    1450             :  * Open a temporary file that will disappear when we close it.
    1451             :  *
    1452             :  * This routine takes care of generating an appropriate tempfile name.
    1453             :  * There's no need to pass in fileFlags or fileMode either, since only
    1454             :  * one setting makes any sense for a temp file.
    1455             :  *
    1456             :  * Unless interXact is true, the file is remembered by CurrentResourceOwner
    1457             :  * to ensure it's closed and deleted when it's no longer needed, typically at
    1458             :  * the end-of-transaction. In most cases, you don't want temporary files to
    1459             :  * outlive the transaction that created them, so this should be false -- but
    1460             :  * if you need "somewhat" temporary storage, this might be useful. In either
    1461             :  * case, the file is removed when the File is explicitly closed.
    1462             :  */
    1463             : File
    1464        1858 : OpenTemporaryFile(bool interXact)
    1465             : {
    1466        1858 :     File        file = 0;
    1467             : 
    1468             :     /*
    1469             :      * Make sure the current resource owner has space for this File before we
    1470             :      * open it, if we'll be registering it below.
    1471             :      */
    1472        1858 :     if (!interXact)
    1473        1854 :         ResourceOwnerEnlargeFiles(CurrentResourceOwner);
    1474             : 
    1475             :     /*
    1476             :      * If some temp tablespace(s) have been given to us, try to use the next
    1477             :      * one.  If a given tablespace can't be found, we silently fall back to
    1478             :      * the database's default tablespace.
    1479             :      *
    1480             :      * BUT: if the temp file is slated to outlive the current transaction,
    1481             :      * force it into the database's default tablespace, so that it will not
    1482             :      * pose a threat to possible tablespace drop attempts.
    1483             :      */
    1484        1858 :     if (numTempTableSpaces > 0 && !interXact)
    1485             :     {
    1486           0 :         Oid         tblspcOid = GetNextTempTableSpace();
    1487             : 
    1488           0 :         if (OidIsValid(tblspcOid))
    1489           0 :             file = OpenTemporaryFileInTablespace(tblspcOid, false);
    1490             :     }
    1491             : 
    1492             :     /*
    1493             :      * If not, or if tablespace is bad, create in database's default
    1494             :      * tablespace.  MyDatabaseTableSpace should normally be set before we get
    1495             :      * here, but just in case it isn't, fall back to pg_default tablespace.
    1496             :      */
    1497        1858 :     if (file <= 0)
    1498        1858 :         file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
    1499             :                                              MyDatabaseTableSpace :
    1500             :                                              DEFAULTTABLESPACE_OID,
    1501             :                                              true);
    1502             : 
    1503             :     /* Mark it for deletion at close and temporary file size limit */
    1504        1858 :     VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
    1505             : 
    1506             :     /* Register it with the current resource owner */
    1507        1858 :     if (!interXact)
    1508        1854 :         RegisterTemporaryFile(file);
    1509             : 
    1510        1858 :     return file;
    1511             : }
    1512             : 
    1513             : /*
    1514             :  * Return the path of the temp directory in a given tablespace.
    1515             :  */
    1516             : void
    1517        7754 : TempTablespacePath(char *path, Oid tablespace)
    1518             : {
    1519             :     /*
    1520             :      * Identify the tempfile directory for this tablespace.
    1521             :      *
    1522             :      * If someone tries to specify pg_global, use pg_default instead.
    1523             :      */
    1524        7754 :     if (tablespace == InvalidOid ||
    1525           0 :         tablespace == DEFAULTTABLESPACE_OID ||
    1526             :         tablespace == GLOBALTABLESPACE_OID)
    1527        7754 :         snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
    1528             :     else
    1529             :     {
    1530             :         /* All other tablespaces are accessed via symlinks */
    1531           0 :         snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
    1532             :                  tablespace, TABLESPACE_VERSION_DIRECTORY,
    1533             :                  PG_TEMP_FILES_DIR);
    1534             :     }
    1535        7754 : }
    1536             : 
    1537             : /*
    1538             :  * Open a temporary file in a specific tablespace.
    1539             :  * Subroutine for OpenTemporaryFile, which see for details.
    1540             :  */
    1541             : static File
    1542        1858 : OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
    1543             : {
    1544             :     char        tempdirpath[MAXPGPATH];
    1545             :     char        tempfilepath[MAXPGPATH];
    1546             :     File        file;
    1547             : 
    1548        1858 :     TempTablespacePath(tempdirpath, tblspcOid);
    1549             : 
    1550             :     /*
    1551             :      * Generate a tempfile name that should be unique within the current
    1552             :      * database instance.
    1553             :      */
    1554        1858 :     snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
    1555             :              tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
    1556             : 
    1557             :     /*
    1558             :      * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
    1559             :      * temp file that can be reused.
    1560             :      */
    1561        1858 :     file = PathNameOpenFile(tempfilepath,
    1562             :                             O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
    1563        1858 :     if (file <= 0)
    1564             :     {
    1565             :         /*
    1566             :          * We might need to create the tablespace's tempfile directory, if no
    1567             :          * one has yet done so.
    1568             :          *
    1569             :          * Don't check for an error from MakePGDirectory; it could fail if
    1570             :          * someone else just did the same thing.  If it doesn't work then
    1571             :          * we'll bomb out on the second create attempt, instead.
    1572             :          */
    1573          14 :         (void) MakePGDirectory(tempdirpath);
    1574             : 
    1575          14 :         file = PathNameOpenFile(tempfilepath,
    1576             :                                 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
    1577          14 :         if (file <= 0 && rejectError)
    1578           0 :             elog(ERROR, "could not create temporary file \"%s\": %m",
    1579             :                  tempfilepath);
    1580             :     }
    1581             : 
    1582        1858 :     return file;
    1583             : }
    1584             : 
    1585             : 
    1586             : /*
    1587             :  * Create a new file.  The directory containing it must already exist.  Files
    1588             :  * created this way are subject to temp_file_limit and are automatically
    1589             :  * closed at end of transaction, but are not automatically deleted on close
    1590             :  * because they are intended to be shared between cooperating backends.
    1591             :  *
    1592             :  * If the file is inside the top-level temporary directory, its name should
    1593             :  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
    1594             :  * and deleted at startup by RemovePgTempFiles().  Alternatively, it can be
    1595             :  * inside a directory created with PathNameCreateTemporaryDir(), in which case
    1596             :  * the prefix isn't needed.
    1597             :  */
    1598             : File
    1599        1536 : PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
    1600             : {
    1601             :     File        file;
    1602             : 
    1603        1536 :     ResourceOwnerEnlargeFiles(CurrentResourceOwner);
    1604             : 
    1605             :     /*
    1606             :      * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
    1607             :      * temp file that can be reused.
    1608             :      */
    1609        1536 :     file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
    1610        1536 :     if (file <= 0)
    1611             :     {
    1612         178 :         if (error_on_failure)
    1613           0 :             ereport(ERROR,
    1614             :                     (errcode_for_file_access(),
    1615             :                      errmsg("could not create temporary file \"%s\": %m",
    1616             :                             path)));
    1617             :         else
    1618         178 :             return file;
    1619             :     }
    1620             : 
    1621             :     /* Mark it for temp_file_limit accounting. */
    1622        1358 :     VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
    1623             : 
    1624             :     /* Register it for automatic close. */
    1625        1358 :     RegisterTemporaryFile(file);
    1626             : 
    1627        1358 :     return file;
    1628             : }
    1629             : 
    1630             : /*
    1631             :  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
    1632             :  * another backend.  Files opened this way don't count against the
    1633             :  * temp_file_limit of the caller, are read-only and are automatically closed
    1634             :  * at the end of the transaction but are not deleted on close.
    1635             :  */
    1636             : File
    1637        2608 : PathNameOpenTemporaryFile(const char *path)
    1638             : {
    1639             :     File        file;
    1640             : 
    1641        2608 :     ResourceOwnerEnlargeFiles(CurrentResourceOwner);
    1642             : 
    1643             :     /* We open the file read-only. */
    1644        2608 :     file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
    1645             : 
    1646             :     /* If no such file, then we don't raise an error. */
    1647        2608 :     if (file <= 0 && errno != ENOENT)
    1648           0 :         ereport(ERROR,
    1649             :                 (errcode_for_file_access(),
    1650             :                  errmsg("could not open temporary file \"%s\": %m",
    1651             :                         path)));
    1652             : 
    1653        2608 :     if (file > 0)
    1654             :     {
    1655             :         /* Register it for automatic close. */
    1656        1304 :         RegisterTemporaryFile(file);
    1657             :     }
    1658             : 
    1659        2608 :     return file;
    1660             : }
    1661             : 
    1662             : /*
    1663             :  * Delete a file by pathname.  Return true if the file existed, false if
    1664             :  * didn't.
    1665             :  */
    1666             : bool
    1667        2716 : PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
    1668             : {
    1669             :     struct stat filestats;
    1670             :     int         stat_errno;
    1671             : 
    1672             :     /* Get the final size for pgstat reporting. */
    1673        2716 :     if (stat(path, &filestats) != 0)
    1674        1358 :         stat_errno = errno;
    1675             :     else
    1676        1358 :         stat_errno = 0;
    1677             : 
    1678             :     /*
    1679             :      * Unlike FileClose's automatic file deletion code, we tolerate
    1680             :      * non-existence to support BufFileDeleteShared which doesn't know how
    1681             :      * many segments it has to delete until it runs out.
    1682             :      */
    1683        2716 :     if (stat_errno == ENOENT)
    1684        1358 :         return false;
    1685             : 
    1686        1358 :     if (unlink(path) < 0)
    1687             :     {
    1688           0 :         if (errno != ENOENT)
    1689           0 :             ereport(error_on_failure ? ERROR : LOG,
    1690             :                     (errcode_for_file_access(),
    1691             :                      errmsg("cannot unlink temporary file \"%s\": %m",
    1692             :                             path)));
    1693           0 :         return false;
    1694             :     }
    1695             : 
    1696        1358 :     if (stat_errno == 0)
    1697        1358 :         ReportTemporaryFileUsage(path, filestats.st_size);
    1698             :     else
    1699             :     {
    1700           0 :         errno = stat_errno;
    1701           0 :         ereport(LOG,
    1702             :                 (errcode_for_file_access(),
    1703             :                  errmsg("could not stat file \"%s\": %m", path)));
    1704             :     }
    1705             : 
    1706        1358 :     return true;
    1707             : }
    1708             : 
    1709             : /*
    1710             :  * close a file when done with it
    1711             :  */
    1712             : void
    1713      818968 : FileClose(File file)
    1714             : {
    1715             :     Vfd        *vfdP;
    1716             : 
    1717             :     Assert(FileIsValid(file));
    1718             : 
    1719             :     DO_DB(elog(LOG, "FileClose: %d (%s)",
    1720             :                file, VfdCache[file].fileName));
    1721             : 
    1722      818968 :     vfdP = &VfdCache[file];
    1723             : 
    1724      818968 :     if (!FileIsNotOpen(file))
    1725             :     {
    1726             :         /* close the file */
    1727      710526 :         if (close(vfdP->fd))
    1728             :         {
    1729             :             /*
    1730             :              * We may need to panic on failure to close non-temporary files;
    1731             :              * see LruDelete.
    1732             :              */
    1733           0 :             elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
    1734             :                  "could not close file \"%s\": %m", vfdP->fileName);
    1735             :         }
    1736             : 
    1737      710526 :         --nfile;
    1738      710526 :         vfdP->fd = VFD_CLOSED;
    1739             : 
    1740             :         /* remove the file from the lru ring */
    1741      710526 :         Delete(file);
    1742             :     }
    1743             : 
    1744      818968 :     if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
    1745             :     {
    1746             :         /* Subtract its size from current usage (do first in case of error) */
    1747        3216 :         temporary_files_size -= vfdP->fileSize;
    1748        3216 :         vfdP->fileSize = 0;
    1749             :     }
    1750             : 
    1751             :     /*
    1752             :      * Delete the file if it was temporary, and make a log entry if wanted
    1753             :      */
    1754      818968 :     if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
    1755             :     {
    1756             :         struct stat filestats;
    1757             :         int         stat_errno;
    1758             : 
    1759             :         /*
    1760             :          * If we get an error, as could happen within the ereport/elog calls,
    1761             :          * we'll come right back here during transaction abort.  Reset the
    1762             :          * flag to ensure that we can't get into an infinite loop.  This code
    1763             :          * is arranged to ensure that the worst-case consequence is failing to
    1764             :          * emit log message(s), not failing to attempt the unlink.
    1765             :          */
    1766        1858 :         vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
    1767             : 
    1768             : 
    1769             :         /* first try the stat() */
    1770        1858 :         if (stat(vfdP->fileName, &filestats))
    1771           0 :             stat_errno = errno;
    1772             :         else
    1773        1858 :             stat_errno = 0;
    1774             : 
    1775             :         /* in any case do the unlink */
    1776        1858 :         if (unlink(vfdP->fileName))
    1777           0 :             elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
    1778             : 
    1779             :         /* and last report the stat results */
    1780        1858 :         if (stat_errno == 0)
    1781        1858 :             ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
    1782             :         else
    1783             :         {
    1784           0 :             errno = stat_errno;
    1785           0 :             elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
    1786             :         }
    1787             :     }
    1788             : 
    1789             :     /* Unregister it from the resource owner */
    1790      818968 :     if (vfdP->resowner)
    1791        4516 :         ResourceOwnerForgetFile(vfdP->resowner, file);
    1792             : 
    1793             :     /*
    1794             :      * Return the Vfd slot to the free list
    1795             :      */
    1796      818968 :     FreeVfd(file);
    1797      818968 : }
    1798             : 
    1799             : /*
    1800             :  * FilePrefetch - initiate asynchronous read of a given range of the file.
    1801             :  *
    1802             :  * Currently the only implementation of this function is using posix_fadvise
    1803             :  * which is the simplest standardized interface that accomplishes this.
    1804             :  * We could add an implementation using libaio in the future; but note that
    1805             :  * this API is inappropriate for libaio, which wants to have a buffer provided
    1806             :  * to read into.
                        *
                        * Returns FileAccess()'s negative result if the file cannot be accessed,
                        * otherwise whatever posix_fadvise() returns.  When the build lacks
                        * USE_POSIX_FADVISE or POSIX_FADV_WILLNEED this is a no-op returning 0.
                        * The posix_fadvise() call is bracketed by wait-event reporting using
                        * wait_event_info.
                        *
                        * NOTE(review): posix_fadvise() is specified to return an error number
                        * directly rather than -1 with errno set; confirm callers expect that.
    1807             :  */
    1808             : int
    1809          84 : FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
    1810             : {
    1811             : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
    1812             :     int         returnCode;
    1813             : 
    1814             :     Assert(FileIsValid(file));
    1815             : 
    1816             :     DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
    1817             :                file, VfdCache[file].fileName,
    1818             :                (int64) offset, amount));
    1819             : 
                           /* FileAccess() must succeed before VfdCache[file].fd is used */
    1820          84 :     returnCode = FileAccess(file);
    1821          84 :     if (returnCode < 0)
    1822           0 :         return returnCode;
    1823             : 
    1824          84 :     pgstat_report_wait_start(wait_event_info);
    1825          84 :     returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
    1826             :                                POSIX_FADV_WILLNEED);
    1827          84 :     pgstat_report_wait_end();
    1828             : 
    1829          84 :     return returnCode;
    1830             : #else
                           /* no prefetch support compiled in: validate the handle, do nothing */
    1831             :     Assert(FileIsValid(file));
    1832             :     return 0;
    1833             : #endif
    1834             : }
    1835             : 
                       /*
                        * FileWriteback - pass the byte range [offset, offset + nbytes) of the
                        * file to pg_flush_data().
                        *
                        * The function returns nothing.  It returns early, without doing
                        * anything, when nbytes is not positive or when FileAccess() fails.
                        * The pg_flush_data() call is bracketed by wait-event reporting using
                        * wait_event_info.
                        */
    1836             : void
    1837      149522 : FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
    1838             : {
    1839             :     int         returnCode;
    1840             : 
    1841             :     Assert(FileIsValid(file));
    1842             : 
    1843             :     DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
    1844             :                file, VfdCache[file].fileName,
    1845             :                (int64) offset, (int64) nbytes));
    1846             : 
                           /* an empty or negative-length range needs no work */
    1847      149522 :     if (nbytes <= 0)
    1848           0 :         return;
    1849             : 
                           /* a void function cannot report the failure, so just give up */
    1850      149522 :     returnCode = FileAccess(file);
    1851      149522 :     if (returnCode < 0)
    1852           0 :         return;
    1853             : 
    1854      149522 :     pgstat_report_wait_start(wait_event_info);
    1855      149522 :     pg_flush_data(VfdCache[file].fd, offset, nbytes);
    1856      149522 :     pgstat_report_wait_end();
    1857             : }
    1858             : 
                       /*
                        * FileRead - read up to 'amount' bytes at position 'offset' into 'buffer'.
                        *
                        * The read is issued with pg_pread() on vfdP->fd, bracketed by wait-event
                        * reporting using wait_event_info.  Returns pg_pread()'s result, or
                        * FileAccess()'s negative result if the file cannot be accessed.  A
                        * failed read is reissued for as long as errno is EINTR.
                        */
    1859             : int
    1860      507734 : FileRead(File file, char *buffer, int amount, off_t offset,
    1861             :          uint32 wait_event_info)
    1862             : {
    1863             :     int         returnCode;
    1864             :     Vfd        *vfdP;
    1865             : 
    1866             :     Assert(FileIsValid(file));
    1867             : 
    1868             :     DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
    1869             :                file, VfdCache[file].fileName,
    1870             :                (int64) offset,
    1871             :                amount, buffer));
    1872             : 
    1873      507734 :     returnCode = FileAccess(file);
    1874      507734 :     if (returnCode < 0)
    1875           0 :         return returnCode;
    1876             : 
    1877      507734 :     vfdP = &VfdCache[file];
    1878             : 
    1879             : retry:
    1880      507734 :     pgstat_report_wait_start(wait_event_info);
    1881      507734 :     returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
    1882      507734 :     pgstat_report_wait_end();
    1883             : 
    1884      507734 :     if (returnCode < 0)
    1885             :     {
    1886             :         /*
    1887             :          * Windows may run out of kernel buffers and return "Insufficient
    1888             :          * system resources" error.  Wait a bit and retry to solve it.
    1889             :          *
    1890             :          * It is rumored that EINTR is also possible on some Unix filesystems,
    1891             :          * in which case immediate retry is indicated.
    1892             :          */
    1893             : #ifdef WIN32
    1894             :         DWORD       error = GetLastError();
    1895             : 
    1896             :         switch (error)
    1897             :         {
    1898             :             case ERROR_NO_SYSTEM_RESOURCES:
                                       /* sleep, then disguise as EINTR so the test below retries */
    1899             :                 pg_usleep(1000L);
    1900             :                 errno = EINTR;
    1901             :                 break;
    1902             :             default:
                                       /* any other Windows error goes through _dosmaperr() */
    1903             :                 _dosmaperr(error);
    1904             :                 break;
    1905             :         }
    1906             : #endif
    1907             :         /* OK to retry if interrupted */
    1908           0 :         if (errno == EINTR)
    1909           0 :             goto retry;
    1910             :     }
    1911             : 
    1912      507734 :     return returnCode;
    1913             : }
    1914             : 
                       /*
                        * FileWrite - write 'amount' bytes from 'buffer' at position 'offset'.
                        *
                        * The write is issued with pg_pwrite(), bracketed by wait-event reporting
                        * using wait_event_info.  Returns pg_pwrite()'s result, or FileAccess()'s
                        * negative result if the file cannot be accessed.  If fewer than 'amount'
                        * bytes were written and errno was left at zero, errno is set to ENOSPC.
                        * A failed write is reissued for as long as errno is EINTR.
                        *
                        * For files flagged FD_TEMP_FILE_LIMIT this also enforces
                        * temp_file_limit (raising ERROR before writing) and keeps
                        * vfdP->fileSize and temporary_files_size up to date.
                        */
    1915             : int
    1916     1081684 : FileWrite(File file, char *buffer, int amount, off_t offset,
    1917             :           uint32 wait_event_info)
    1918             : {
    1919             :     int         returnCode;
    1920             :     Vfd        *vfdP;
    1921             : 
    1922             :     Assert(FileIsValid(file));
    1923             : 
    1924             :     DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
    1925             :                file, VfdCache[file].fileName,
    1926             :                (int64) offset,
    1927             :                amount, buffer));
    1928             : 
    1929     1081684 :     returnCode = FileAccess(file);
    1930     1081684 :     if (returnCode < 0)
    1931           0 :         return returnCode;
    1932             : 
    1933     1081684 :     vfdP = &VfdCache[file];
    1934             : 
    1935             :     /*
    1936             :      * If enforcing temp_file_limit and it's a temp file, check to see if the
    1937             :      * write would overrun temp_file_limit, and throw error if so.  Note: it's
    1938             :      * really a modularity violation to throw error here; we should set errno
    1939             :      * and return -1.  However, there's no way to report a suitable error
    1940             :      * message if we do that.  All current callers would just throw error
    1941             :      * immediately anyway, so this is safe at present.
    1942             :      */
    1943     1081684 :     if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
    1944             :     {
    1945           0 :         off_t       past_write = offset + amount;
    1946             : 
                               /* only a write that extends the file can grow the total */
    1947           0 :         if (past_write > vfdP->fileSize)
    1948             :         {
    1949           0 :             uint64      newTotal = temporary_files_size;
    1950             : 
    1951           0 :             newTotal += past_write - vfdP->fileSize;
                                   /* temp_file_limit is scaled by 1024 for the comparison */
    1952           0 :             if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
    1953           0 :                 ereport(ERROR,
    1954             :                         (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
    1955             :                          errmsg("temporary file size exceeds temp_file_limit (%dkB)",
    1956             :                                 temp_file_limit)));
    1957             :         }
    1958             :     }
    1959             : 
    1960             : retry:
                           /* clear errno so the ENOSPC test below can tell it was left unset */
    1961     1081684 :     errno = 0;
    1962     1081684 :     pgstat_report_wait_start(wait_event_info);
    1963     1081684 :     returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
    1964     1081684 :     pgstat_report_wait_end();
    1965             : 
    1966             :     /* if write didn't set errno, assume problem is no disk space */
    1967     1081684 :     if (returnCode != amount && errno == 0)
    1968           0 :         errno = ENOSPC;
    1969             : 
    1970     1081684 :     if (returnCode >= 0)
    1971             :     {
    1972             :         /*
    1973             :          * Maintain fileSize and temporary_files_size if it's a temp file.
    1974             :          *
    1975             :          * The end position is computed as offset + amount, i.e. from the
    1976             :          * requested length rather than from the byte count pg_pwrite()
    1977             :          * returned.
    1978             :          */
    1979     1081684 :         if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
    1980             :         {
    1981       47728 :             off_t       past_write = offset + amount;
    1982             : 
    1983       47728 :             if (past_write > vfdP->fileSize)
    1984             :             {
    1985       44776 :                 temporary_files_size += past_write - vfdP->fileSize;
    1986       44776 :                 vfdP->fileSize = past_write;
    1987             :             }
    1988             :         }
    1989             :     }
    1990             :     else
    1991             :     {
    1992             :         /*
    1993             :          * See comments in FileRead()
    1994             :          */
    1995             : #ifdef WIN32
    1996             :         DWORD       error = GetLastError();
    1997             : 
    1998             :         switch (error)
    1999             :         {
    2000             :             case ERROR_NO_SYSTEM_RESOURCES:
    2001             :                 pg_usleep(1000L);
    2002             :                 errno = EINTR;
    2003             :                 break;
    2004             :             default:
    2005             :                 _dosmaperr(error);
    2006             :                 break;
    2007             :         }
    2008             : #endif
    2009             :         /* OK to retry if interrupted */
    2010           0 :         if (errno == EINTR)
    2011           0 :             goto retry;
    2012             :     }
    2013             : 
    2014     1081684 :     return returnCode;
    2015             : }
    2016             : 
                       /*
                        * FileSync - call pg_fsync() on the file's kernel descriptor.
                        *
                        * Returns pg_fsync()'s result, or FileAccess()'s negative result if the
                        * file cannot be accessed.  The pg_fsync() call is bracketed by
                        * wait-event reporting using wait_event_info.
                        */
    2017             : int
    2018       69876 : FileSync(File file, uint32 wait_event_info)
    2019             : {
    2020             :     int         returnCode;
    2021             : 
    2022             :     Assert(FileIsValid(file));
    2023             : 
    2024             :     DO_DB(elog(LOG, "FileSync: %d (%s)",
    2025             :                file, VfdCache[file].fileName));
    2026             : 
    2027       69876 :     returnCode = FileAccess(file);
    2028       69876 :     if (returnCode < 0)
    2029           0 :         return returnCode;
    2030             : 
    2031       69876 :     pgstat_report_wait_start(wait_event_info);
    2032       69876 :     returnCode = pg_fsync(VfdCache[file].fd);
    2033       69876 :     pgstat_report_wait_end();
    2034             : 
    2035       69876 :     return returnCode;
    2036             : }
    2037             : 
                       /*
                        * FileSize - report the file's size as the result of seeking to its end.
                        *
                        * Returns the value of lseek(fd, 0, SEEK_END), or (off_t) -1 if the file
                        * was not open and FileAccess() failed.
                        */
    2038             : off_t
    2039     3695946 : FileSize(File file)
    2040             : {
    2041             :     Assert(FileIsValid(file));
    2042             : 
    2043             :     DO_DB(elog(LOG, "FileSize %d (%s)",
    2044             :                file, VfdCache[file].fileName));
    2045             : 
                           /* FileAccess() is only called when the file is not currently open */
    2046     3695946 :     if (FileIsNotOpen(file))
    2047             :     {
    2048      134198 :         if (FileAccess(file) < 0)
    2049           0 :             return (off_t) -1;
    2050             :     }
    2051             : 
    2052     3695946 :     return lseek(VfdCache[file].fd, 0, SEEK_END);
    2053             : }
    2054             : 
                       /*
                        * FileTruncate - set the file's length to 'offset' using ftruncate().
                        *
                        * Returns ftruncate()'s result, or FileAccess()'s negative result if the
                        * file cannot be accessed.  On success, if the size recorded in the VFD
                        * exceeded 'offset', the recorded size and temporary_files_size are
                        * reduced to match.  The ftruncate() call is bracketed by wait-event
                        * reporting using wait_event_info.
                        */
    2055             : int
    2056         376 : FileTruncate(File file, off_t offset, uint32 wait_event_info)
    2057             : {
    2058             :     int         returnCode;
    2059             : 
    2060             :     Assert(FileIsValid(file));
    2061             : 
    2062             :     DO_DB(elog(LOG, "FileTruncate %d (%s)",
    2063             :                file, VfdCache[file].fileName));
    2064             : 
    2065         376 :     returnCode = FileAccess(file);
    2066         376 :     if (returnCode < 0)
    2067           0 :         return returnCode;
    2068             : 
    2069         376 :     pgstat_report_wait_start(wait_event_info);
    2070         376 :     returnCode = ftruncate(VfdCache[file].fd, offset);
    2071         376 :     pgstat_report_wait_end();
    2072             : 
    2073         376 :     if (returnCode == 0 && VfdCache[file].fileSize > offset)
    2074             :     {
    2075             :         /* adjust our state for truncation of a temp file */
                               /* the Assert documents that this is expected only with FD_TEMP_FILE_LIMIT */
    2076             :         Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
    2077           0 :         temporary_files_size -= VfdCache[file].fileSize - offset;
    2078           0 :         VfdCache[file].fileSize = offset;
    2079             :     }
    2080             : 
    2081         376 :     return returnCode;
    2082             : }
    2083             : 
    2084             : /*
    2085             :  * Return the pathname associated with an open file.
    2086             :  *
    2087             :  * The returned string points to an internal buffer, which is valid until
    2088             :  * the file is closed.
                        *
                        * The pointer returned is VfdCache[file].fileName itself, not a copy.
    2089             :  */
    2090             : char *
    2091           0 : FilePathName(File file)
    2092             : {
    2093             :     Assert(FileIsValid(file));
    2094             : 
    2095           0 :     return VfdCache[file].fileName;
    2096             : }
    2097             : 
    2098             : /*
    2099             :  * Return the raw file descriptor of an opened file.
    2100             :  *
    2101             :  * The returned file descriptor will be valid until the file is closed, but
    2102             :  * there are a lot of things that can make that happen.  So the caller should
    2103             :  * be careful not to do much of anything else before it finishes using the
    2104             :  * returned file descriptor.
                        *
                        * NOTE(review): unlike FileRead()/FileWrite(), this does not call
                        * FileAccess() first; it returns whatever VfdCache[file].fd currently
                        * holds.  Presumably the caller must ensure the file is physically open
                        * -- confirm against callers.
    2105             :  */
    2106             : int
    2107           0 : FileGetRawDesc(File file)
    2108             : {
    2109             :     Assert(FileIsValid(file));
    2110           0 :     return VfdCache[file].fd;
    2111             : }
    2112             : 
    2113             : /*
    2114             :  * FileGetRawFlags - returns the file flags on open(2)
                        *
                        * The value is read from VfdCache[file].fileFlags; the file itself is
                        * not touched.
    2115             :  */
    2116             : int
    2117           0 : FileGetRawFlags(File file)
    2118             : {
    2119             :     Assert(FileIsValid(file));
    2120           0 :     return VfdCache[file].fileFlags;
    2121             : }
    2122             : 
    2123             : /*
    2124             :  * FileGetRawMode - returns the mode bitmask passed to open(2)
                        *
                        * The value is read from VfdCache[file].fileMode; the file itself is
                        * not touched.
    2125             :  */
    2126             : mode_t
    2127           0 : FileGetRawMode(File file)
    2128             : {
    2129             :     Assert(FileIsValid(file));
    2130           0 :     return VfdCache[file].fileMode;
    2131             : }
    2132             : 
    2133             : /*
    2134             :  * Make room for another allocatedDescs[] array entry if needed and possible.
    2135             :  * Returns true if an array element is available.
                        *
                        * Returns false when the array is full and either cannot grow beyond
                        * max_safe_fds / 2 entries or realloc() fails.  Failure of the very first
                        * malloc() is reported with ereport(ERROR) instead of a false return.
    2136             :  */
    2137             : static bool
    2138      847402 : reserveAllocatedDesc(void)
    2139             : {
    2140             :     AllocateDesc *newDescs;
    2141             :     int         newMax;
    2142             : 
    2143             :     /* Quick out if array already has a free slot. */
    2144      847402 :     if (numAllocatedDescs < maxAllocatedDescs)
    2145      845548 :         return true;
    2146             : 
    2147             :     /*
    2148             :      * If the array hasn't yet been created in the current process, initialize
    2149             :      * it with FD_MINFREE / 2 elements.  In many scenarios this is as many as
    2150             :      * we will ever need, anyway.  We don't want to look at max_safe_fds
    2151             :      * immediately because set_max_safe_fds() may not have run yet.
    2152             :      */
    2153        1854 :     if (allocatedDescs == NULL)
    2154             :     {
    2155        1854 :         newMax = FD_MINFREE / 2;
    2156        1854 :         newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
    2157             :         /* Out of memory already?  Treat as fatal error. */
                               /* (the report below is raised at ERROR level) */
    2158        1854 :         if (newDescs == NULL)
    2159           0 :             ereport(ERROR,
    2160             :                     (errcode(ERRCODE_OUT_OF_MEMORY),
    2161             :                      errmsg("out of memory")));
    2162        1854 :         allocatedDescs = newDescs;
    2163        1854 :         maxAllocatedDescs = newMax;
    2164        1854 :         return true;
    2165             :     }
    2166             : 
    2167             :     /*
    2168             :      * Consider enlarging the array beyond the initial allocation used above.
    2169             :      * By the time this happens, max_safe_fds should be known accurately.
    2170             :      *
    2171             :      * We mustn't let allocated descriptors hog all the available FDs, and in
    2172             :      * practice we'd better leave a reasonable number of FDs for VFD use.  So
    2173             :      * set the maximum to max_safe_fds / 2.  (This should certainly be at
    2174             :      * least as large as the initial size, FD_MINFREE / 2.)
    2175             :      */
    2176           0 :     newMax = max_safe_fds / 2;
    2177           0 :     if (newMax > maxAllocatedDescs)
    2178             :     {
                               /*
                                * realloc()'s result goes into newDescs first, so allocatedDescs
                                * still points at the old array if the call fails.
                                */
    2179           0 :         newDescs = (AllocateDesc *) realloc(allocatedDescs,
    2180             :                                             newMax * sizeof(AllocateDesc));
    2181             :         /* Treat out-of-memory as a non-fatal error. */
    2182           0 :         if (newDescs == NULL)
    2183           0 :             return false;
    2184           0 :         allocatedDescs = newDescs;
    2185           0 :         maxAllocatedDescs = newMax;
    2186           0 :         return true;
    2187             :     }
    2188             : 
    2189             :     /* Can't enlarge allocatedDescs[] any more. */
    2190           0 :     return false;
    2191             : }
    2192             : 
    2193             : /*
    2194             :  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
    2195             :  * rather than plain fopen().  This lets fd.c deal with freeing FDs if
    2196             :  * necessary to open the file.  When done, call FreeFile rather than fclose.
    2197             :  *
    2198             :  * Note that files that will be open for any significant length of time
    2199             :  * should NOT be handled this way, since they cannot share kernel file
    2200             :  * descriptors with other files; there is grave risk of running out of FDs
    2201             :  * if anyone locks down too many FDs.  Most callers of this routine are
    2202             :  * simply reading a config file that they will read and close immediately.
    2203             :  *
    2204             :  * fd.c will automatically close all files opened with AllocateFile at
    2205             :  * transaction commit or abort; this prevents FD leakage if a routine
    2206             :  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
    2207             :  *
    2208             :  * Ideally this should be the *only* direct call of fopen() in the backend.
                        *
                        * Returns the stream on success, or NULL if fopen() fails, with errno
                        * describing that failure.  Raises ERROR if reserveAllocatedDesc() cannot
                        * supply an allocatedDescs[] slot.  When fopen() fails with EMFILE or
                        * ENFILE, the open is retried for as long as ReleaseLruFile() returns
                        * true.
    2209             :  */
    2210             : FILE *
    2211      169030 : AllocateFile(const char *name, const char *mode)
    2212             : {
    2213             :     FILE       *file;
    2214             : 
    2215             :     DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
    2216             :                numAllocatedDescs, name));
    2217             : 
    2218             :     /* Can we allocate another non-virtual FD? */
    2219      169030 :     if (!reserveAllocatedDesc())
    2220           0 :         ereport(ERROR,
    2221             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2222             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
    2223             :                         maxAllocatedDescs, name)));
    2224             : 
    2225             :     /* Close excess kernel FDs. */
    2226      169030 :     ReleaseLruFiles();
    2227             : 
    2228             : TryAgain:
    2229      169030 :     if ((file = fopen(name, mode)) != NULL)
    2230             :     {
                               /* record the stream in the slot reserved above */
    2231      160778 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2232             : 
    2233      160778 :         desc->kind = AllocateDescFile;
    2234      160778 :         desc->desc.file = file;
    2235      160778 :         desc->create_subid = GetCurrentSubTransactionId();
    2236      160778 :         numAllocatedDescs++;
    2237      160778 :         return desc->desc.file;
    2238             :     }
    2239             : 
    2240        8252 :     if (errno == EMFILE || errno == ENFILE)
    2241             :     {
                               /* keep fopen()'s errno so it can be restored for the caller */
    2242           0 :         int         save_errno = errno;
    2243             : 
    2244           0 :         ereport(LOG,
    2245             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2246             :                  errmsg("out of file descriptors: %m; release and retry")));
    2247           0 :         errno = 0;
    2248           0 :         if (ReleaseLruFile())
    2249           0 :             goto TryAgain;
    2250           0 :         errno = save_errno;
    2251             :     }
    2252             : 
    2253        8252 :     return NULL;
    2254             : }
    2255             : 
    2256             : /*
    2257             :  * Open a file with OpenTransientFilePerm() and pass default file mode for
    2258             :  * the fileMode parameter.
                        *
                        * The default mode passed is pg_file_create_mode; the return value is
                        * whatever OpenTransientFilePerm() returns.
    2259             :  */
    2260             : int
    2261      637466 : OpenTransientFile(const char *fileName, int fileFlags)
    2262             : {
    2263      637466 :     return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
    2264             : }
    2265             : 
    2266             : /*
    2267             :  * Like AllocateFile, but returns an unbuffered fd like open(2)
                        *
                        * The file is opened with BasicOpenFilePerm().  Returns the descriptor on
                        * success and -1 on failure.  Raises ERROR if reserveAllocatedDesc()
                        * cannot supply an allocatedDescs[] slot.
    2268             :  */
    2269             : int
    2270      637470 : OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
    2271             : {
    2272             :     int         fd;
    2273             : 
    2274             :     DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
    2275             :                numAllocatedDescs, fileName));
    2276             : 
    2277             :     /* Can we allocate another non-virtual FD? */
    2278      637470 :     if (!reserveAllocatedDesc())
    2279           0 :         ereport(ERROR,
    2280             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2281             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
    2282             :                         maxAllocatedDescs, fileName)));
    2283             : 
    2284             :     /* Close excess kernel FDs. */
    2285      637470 :     ReleaseLruFiles();
    2286             : 
    2287      637470 :     fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
    2288             : 
    2289      637470 :     if (fd >= 0)
    2290             :     {
                               /* record the descriptor in the slot reserved above */
    2291      636680 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2292             : 
    2293      636680 :         desc->kind = AllocateDescRawFD;
    2294      636680 :         desc->desc.fd = fd;
    2295      636680 :         desc->create_subid = GetCurrentSubTransactionId();
    2296      636680 :         numAllocatedDescs++;
    2297             : 
    2298      636680 :         return fd;
    2299             :     }
    2300             : 
    2301         790 :     return -1;                  /* failure */
    2302             : }
    2303             : 
    2304             : /*
    2305             :  * Routines that want to initiate a pipe stream should use OpenPipeStream
    2306             :  * rather than plain popen().  This lets fd.c deal with freeing FDs if
    2307             :  * necessary.  When done, call ClosePipeStream rather than pclose.
    2308             :  *
    2309             :  * This function also ensures that the popen'd program is run with default
    2310             :  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
    2311             :  * uses.  This ensures desirable response to, eg, closing a read pipe early.
                        *
                        * Returns the stream on success, or NULL if popen() fails, with errno
                        * describing that failure.  Raises ERROR if reserveAllocatedDesc() cannot
                        * supply an allocatedDescs[] slot.  When popen() fails with EMFILE or
                        * ENFILE, the call is retried for as long as ReleaseLruFile() returns
                        * true.
    2312             :  */
    2313             : FILE *
    2314         322 : OpenPipeStream(const char *command, const char *mode)
    2315             : {
    2316             :     FILE       *file;
    2317             :     int         save_errno;
    2318             : 
    2319             :     DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
    2320             :                numAllocatedDescs, command));
    2321             : 
    2322             :     /* Can we allocate another non-virtual FD? */
    2323         322 :     if (!reserveAllocatedDesc())
    2324           0 :         ereport(ERROR,
    2325             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2326             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
    2327             :                         maxAllocatedDescs, command)));
    2328             : 
    2329             :     /* Close excess kernel FDs. */
    2330         322 :     ReleaseLruFiles();
    2331             : 
    2332             : TryAgain:
                           /* stdout and stderr are flushed before each popen() attempt */
    2333         322 :     fflush(stdout);
    2334         322 :     fflush(stderr);
    2335         322 :     pqsignal(SIGPIPE, SIG_DFL);
    2336         322 :     errno = 0;
    2337         322 :     file = popen(command, mode);
                           /* hold popen()'s errno across the pqsignal() call that follows */
    2338         322 :     save_errno = errno;
    2339         322 :     pqsignal(SIGPIPE, SIG_IGN);
    2340         322 :     errno = save_errno;
    2341         322 :     if (file != NULL)
    2342             :     {
    2343         322 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2344             : 
    2345         322 :         desc->kind = AllocateDescPipe;
    2346         322 :         desc->desc.file = file;
    2347         322 :         desc->create_subid = GetCurrentSubTransactionId();
    2348         322 :         numAllocatedDescs++;
    2349         322 :         return desc->desc.file;
    2350             :     }
    2351             : 
    2352           0 :     if (errno == EMFILE || errno == ENFILE)
    2353             :     {
    2354           0 :         ereport(LOG,
    2355             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2356             :                  errmsg("out of file descriptors: %m; release and retry")));
    2357           0 :         if (ReleaseLruFile())
    2358           0 :             goto TryAgain;
                               /* give the caller popen()'s errno again */
    2359           0 :         errno = save_errno;
    2360             :     }
    2361             : 
    2362           0 :     return NULL;
    2363             : }
    2364             : 
    2365             : /*
    2366             :  * Free an AllocateDesc of any type.
    2367             :  *
    2368             :  * The argument *must* point into the allocatedDescs[] array.
                        *
                        * Returns the result of the fclose()/pclose()/closedir()/close() call
                        * selected by desc->kind.  The freed slot is overwritten with the last
                        * array entry, so the order of entries in allocatedDescs[] is not
                        * preserved.
    2369             :  */
    2370             : static int
    2371      837784 : FreeDesc(AllocateDesc *desc)
    2372             : {
    2373             :     int         result;
    2374             : 
    2375             :     /* Close the underlying object */
    2376      837784 :     switch (desc->kind)
    2377             :     {
    2378             :         case AllocateDescFile:
    2379      160778 :             result = fclose(desc->desc.file);
    2380      160778 :             break;
    2381             :         case AllocateDescPipe:
    2382         322 :             result = pclose(desc->desc.file);
    2383         322 :             break;
    2384             :         case AllocateDescDir:
    2385       40004 :             result = closedir(desc->desc.dir);
    2386       40004 :             break;
    2387             :         case AllocateDescRawFD:
    2388      636680 :             result = close(desc->desc.fd);
    2389      636680 :             break;
    2390             :         default:
    2391           0 :             elog(ERROR, "AllocateDesc kind not recognized");
    2392             :             result = 0;         /* keep compiler quiet */
    2393             :             break;
    2394             :     }
    2395             : 
    2396             :     /* Compact storage in the allocatedDescs array */
                           /* (when desc is already the last entry this is a self-assignment) */
    2397      837784 :     numAllocatedDescs--;
    2398      837784 :     *desc = allocatedDescs[numAllocatedDescs];
    2399             : 
    2400      837784 :     return result;
    2401             : }
    2402             : 
    2403             : /*
    2404             :  * Close a file returned by AllocateFile.
    2405             :  *
    2406             :  * Note we do not check fclose's return value --- it is up to the caller
    2407             :  * to handle close errors.
                        *
                        * Returns FreeDesc()'s result for the matching entry.  If 'file' is not
                        * found in allocatedDescs[], a WARNING is logged and the result of a plain
                        * fclose() is returned instead.
    2408             :  */
    2409             : int
    2410      160760 : FreeFile(FILE *file)
    2411             : {
    2412             :     int         i;
    2413             : 
    2414             :     DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
    2415             : 
    2416             :     /* Remove file from list of allocated files, if it's present */
                           /* the scan runs from the last array entry down to the first */
    2417      321522 :     for (i = numAllocatedDescs; --i >= 0;)
    2418             :     {
    2419      160762 :         AllocateDesc *desc = &allocatedDescs[i];
    2420             : 
    2421      160762 :         if (desc->kind == AllocateDescFile && desc->desc.file == file)
    2422      160760 :             return FreeDesc(desc);
    2423             :     }
    2424             : 
    2425             :     /* Only get here if someone passes us a file not in allocatedDescs */
    2426           0 :     elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
    2427             : 
    2428           0 :     return fclose(file);
    2429             : }
    2430             : 
    2431             : /*
    2432             :  * Close a file returned by OpenTransientFile.
    2433             :  *
    2434             :  * Note we do not check close's return value --- it is up to the caller
    2435             :  * to handle close errors.
    2436             :  */
    2437             : int
    2438      636680 : CloseTransientFile(int fd)
    2439             : {
    2440             :     int         i;
    2441             : 
    2442             :     DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
    2443             : 
    2444             :     /* Remove fd from list of allocated files, if it's present */
    2445     1273368 :     for (i = numAllocatedDescs; --i >= 0;)
    2446             :     {
    2447      636688 :         AllocateDesc *desc = &allocatedDescs[i];
    2448             : 
    2449      636688 :         if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
    2450      636680 :             return FreeDesc(desc);
    2451             :     }
    2452             : 
    2453             :     /* Only get here if someone passes us a file not in allocatedDescs */
    2454           0 :     elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
    2455             : 
    2456           0 :     return close(fd);
    2457             : }
    2458             : 
    2459             : /*
    2460             :  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
    2461             :  * rather than plain opendir().  This lets fd.c deal with freeing FDs if
    2462             :  * necessary to open the directory, and with closing it after an elog.
    2463             :  * When done, call FreeDir rather than closedir.
    2464             :  *
    2465             :  * Returns NULL, with errno set, on failure.  Note that failure detection
    2466             :  * is commonly left to the following call of ReadDir or ReadDirExtended;
    2467             :  * see the comments for ReadDir.
    2468             :  *
    2469             :  * Ideally this should be the *only* direct call of opendir() in the backend.
    2470             :  */
    2471             : DIR *
    2472       40580 : AllocateDir(const char *dirname)
    2473             : {
    2474             :     DIR        *dir;
    2475             : 
    2476             :     DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
    2477             :                numAllocatedDescs, dirname));
    2478             : 
    2479             :     /* Can we allocate another non-virtual FD? */
    2480       40580 :     if (!reserveAllocatedDesc())
    2481           0 :         ereport(ERROR,
    2482             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2483             :                  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
    2484             :                         maxAllocatedDescs, dirname)));
    2485             : 
    2486             :     /* Close excess kernel FDs. */
    2487       40580 :     ReleaseLruFiles();
    2488             : 
    2489             : TryAgain:
    2490       40580 :     if ((dir = opendir(dirname)) != NULL)
    2491             :     {
    2492       40004 :         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
    2493             : 
    2494       40004 :         desc->kind = AllocateDescDir;
    2495       40004 :         desc->desc.dir = dir;
    2496       40004 :         desc->create_subid = GetCurrentSubTransactionId();
    2497       40004 :         numAllocatedDescs++;
    2498       40004 :         return desc->desc.dir;
    2499             :     }
    2500             : 
    2501         576 :     if (errno == EMFILE || errno == ENFILE)
    2502             :     {
    2503           0 :         int         save_errno = errno;
    2504             : 
    2505           0 :         ereport(LOG,
    2506             :                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
    2507             :                  errmsg("out of file descriptors: %m; release and retry")));
    2508           0 :         errno = 0;
    2509           0 :         if (ReleaseLruFile())
    2510           0 :             goto TryAgain;
    2511           0 :         errno = save_errno;
    2512             :     }
    2513             : 
    2514         576 :     return NULL;
    2515             : }
    2516             : 
    2517             : /*
    2518             :  * Read a directory opened with AllocateDir, ereport'ing any error.
    2519             :  *
    2520             :  * This is easier to use than raw readdir() since it takes care of some
    2521             :  * otherwise rather tedious and error-prone manipulation of errno.  Also,
    2522             :  * if you are happy with a generic error message for AllocateDir failure,
    2523             :  * you can just do
    2524             :  *
    2525             :  *      dir = AllocateDir(path);
    2526             :  *      while ((dirent = ReadDir(dir, path)) != NULL)
    2527             :  *          process dirent;
    2528             :  *      FreeDir(dir);
    2529             :  *
    2530             :  * since a NULL dir parameter is taken as indicating AllocateDir failed.
    2531             :  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
    2532             :  * use this shortcut.)
    2533             :  *
    2534             :  * The pathname passed to AllocateDir must be passed to this routine too,
    2535             :  * but it is only used for error reporting.
    2536             :  */
    2537             : struct dirent *
    2538      790832 : ReadDir(DIR *dir, const char *dirname)
    2539             : {
    2540      790832 :     return ReadDirExtended(dir, dirname, ERROR);
    2541             : }
    2542             : 
    2543             : /*
    2544             :  * Alternate version of ReadDir that allows caller to specify the elevel
    2545             :  * for any error report (whether it's reporting an initial failure of
    2546             :  * AllocateDir or a subsequent directory read failure).
    2547             :  *
    2548             :  * If elevel < ERROR, returns NULL after any error.  With the normal coding
    2549             :  * pattern, this will result in falling out of the loop immediately as
    2550             :  * though the directory contained no (more) entries.
    2551             :  */
    2552             : struct dirent *
    2553     1701426 : ReadDirExtended(DIR *dir, const char *dirname, int elevel)
    2554             : {
    2555             :     struct dirent *dent;
    2556             : 
    2557             :     /* Give a generic message for AllocateDir failure, if caller didn't */
    2558     1701426 :     if (dir == NULL)
    2559             :     {
    2560           0 :         ereport(elevel,
    2561             :                 (errcode_for_file_access(),
    2562             :                  errmsg("could not open directory \"%s\": %m",
    2563             :                         dirname)));
    2564           0 :         return NULL;
    2565             :     }
    2566             : 
    2567     1701426 :     errno = 0;
    2568     1701426 :     if ((dent = readdir(dir)) != NULL)
    2569     1670486 :         return dent;
    2570             : 
    2571       30940 :     if (errno)
    2572           0 :         ereport(elevel,
    2573             :                 (errcode_for_file_access(),
    2574             :                  errmsg("could not read directory \"%s\": %m",
    2575             :                         dirname)));
    2576       30940 :     return NULL;
    2577             : }
    2578             : 
    2579             : /*
    2580             :  * Close a directory opened with AllocateDir.
    2581             :  *
    2582             :  * Returns closedir's return value (with errno set if it's not 0).
    2583             :  * Note we do not check the return value --- it is up to the caller
    2584             :  * to handle close errors if wanted.
    2585             :  *
    2586             :  * Does nothing if dir == NULL; we assume that directory open failure was
    2587             :  * already reported if desired.
    2588             :  */
    2589             : int
    2590       39994 : FreeDir(DIR *dir)
    2591             : {
    2592             :     int         i;
    2593             : 
    2594             :     /* Nothing to do if AllocateDir failed */
    2595       39994 :     if (dir == NULL)
    2596           0 :         return 0;
    2597             : 
    2598             :     DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
    2599             : 
    2600             :     /* Remove dir from list of allocated dirs, if it's present */
    2601       79988 :     for (i = numAllocatedDescs; --i >= 0;)
    2602             :     {
    2603       39994 :         AllocateDesc *desc = &allocatedDescs[i];
    2604             : 
    2605       39994 :         if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
    2606       39994 :             return FreeDesc(desc);
    2607             :     }
    2608             : 
    2609             :     /* Only get here if someone passes us a dir not in allocatedDescs */
    2610           0 :     elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
    2611             : 
    2612           0 :     return closedir(dir);
    2613             : }
    2614             : 
    2615             : 
    2616             : /*
    2617             :  * Close a pipe stream returned by OpenPipeStream.
    2618             :  */
    2619             : int
    2620         322 : ClosePipeStream(FILE *file)
    2621             : {
    2622             :     int         i;
    2623             : 
    2624             :     DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
    2625             : 
    2626             :     /* Remove file from list of allocated files, if it's present */
    2627         644 :     for (i = numAllocatedDescs; --i >= 0;)
    2628             :     {
    2629         322 :         AllocateDesc *desc = &allocatedDescs[i];
    2630             : 
    2631         322 :         if (desc->kind == AllocateDescPipe && desc->desc.file == file)
    2632         322 :             return FreeDesc(desc);
    2633             :     }
    2634             : 
    2635             :     /* Only get here if someone passes us a file not in allocatedDescs */
    2636           0 :     elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
    2637             : 
    2638           0 :     return pclose(file);
    2639             : }
    2640             : 
    2641             : /*
    2642             :  * closeAllVfds
    2643             :  *
    2644             :  * Force all VFDs into the physically-closed state, so that the fewest
    2645             :  * possible number of kernel file descriptors are in use.  There is no
    2646             :  * change in the logical state of the VFDs.
    2647             :  */
    2648             : void
    2649          18 : closeAllVfds(void)
    2650             : {
    2651             :     Index       i;
    2652             : 
    2653          18 :     if (SizeVfdCache > 0)
    2654             :     {
    2655             :         Assert(FileIsNotOpen(0));   /* Make sure ring not corrupted */
    2656         576 :         for (i = 1; i < SizeVfdCache; i++)
    2657             :         {
    2658         558 :             if (!FileIsNotOpen(i))
    2659          26 :                 LruDelete(i);
    2660             :         }
    2661             :     }
    2662          18 : }
    2663             : 
    2664             : 
    2665             : /*
    2666             :  * SetTempTablespaces
    2667             :  *
    2668             :  * Define a list (actually an array) of OIDs of tablespaces to use for
    2669             :  * temporary files.  This list will be used until end of transaction,
    2670             :  * unless this function is called again before then.  It is caller's
    2671             :  * responsibility that the passed-in array has adequate lifespan (typically
    2672             :  * it'd be allocated in TopTransactionContext).
    2673             :  */
    2674             : void
    2675        4370 : SetTempTablespaces(Oid *tableSpaces, int numSpaces)
    2676             : {
    2677             :     Assert(numSpaces >= 0);
    2678        4370 :     tempTableSpaces = tableSpaces;
    2679        4370 :     numTempTableSpaces = numSpaces;
    2680             : 
    2681             :     /*
    2682             :      * Select a random starting point in the list.  This is to minimize
    2683             :      * conflicts between backends that are most likely sharing the same list
    2684             :      * of temp tablespaces.  Note that if we create multiple temp files in the
    2685             :      * same transaction, we'll advance circularly through the list --- this
    2686             :      * ensures that large temporary sort files are nicely spread across all
    2687             :      * available tablespaces.
    2688             :      */
    2689        4370 :     if (numSpaces > 1)
    2690           0 :         nextTempTableSpace = random() % numSpaces;
    2691             :     else
    2692        4370 :         nextTempTableSpace = 0;
    2693        4370 : }
    2694             : 
    2695             : /*
    2696             :  * TempTablespacesAreSet
    2697             :  *
    2698             :  * Returns true if SetTempTablespaces has been called in current transaction.
    2699             :  * (This is just so that tablespaces.c doesn't need its own per-transaction
    2700             :  * state.)
    2701             :  */
    2702             : bool
    2703        5216 : TempTablespacesAreSet(void)
    2704             : {
    2705        5216 :     return (numTempTableSpaces >= 0);
    2706             : }
    2707             : 
    2708             : /*
    2709             :  * GetTempTablespaces
    2710             :  *
    2711             :  * Populate an array with the OIDs of the tablespaces that should be used for
    2712             :  * temporary files.  Return the number that were copied into the output array.
    2713             :  */
    2714             : int
    2715         184 : GetTempTablespaces(Oid *tableSpaces, int numSpaces)
    2716             : {
    2717             :     int         i;
    2718             : 
    2719             :     Assert(TempTablespacesAreSet());
    2720         184 :     for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
    2721           0 :         tableSpaces[i] = tempTableSpaces[i];
    2722             : 
    2723         184 :     return i;
    2724             : }
    2725             : 
    2726             : /*
    2727             :  * GetNextTempTableSpace
    2728             :  *
    2729             :  * Select the next temp tablespace to use.  A result of InvalidOid means
    2730             :  * to use the current database's default tablespace.
    2731             :  */
    2732             : Oid
    2733        2676 : GetNextTempTableSpace(void)
    2734             : {
    2735        2676 :     if (numTempTableSpaces > 0)
    2736             :     {
    2737             :         /* Advance nextTempTableSpace counter with wraparound */
    2738           0 :         if (++nextTempTableSpace >= numTempTableSpaces)
    2739           0 :             nextTempTableSpace = 0;
    2740           0 :         return tempTableSpaces[nextTempTableSpace];
    2741             :     }
    2742        2676 :     return InvalidOid;
    2743             : }
    2744             : 
    2745             : 
    2746             : /*
    2747             :  * AtEOSubXact_Files
    2748             :  *
    2749             :  * Take care of subtransaction commit/abort.  At abort, we close temp files
    2750             :  * that the subtransaction may have opened.  At commit, we reassign the
    2751             :  * files that were opened to the parent subtransaction.
    2752             :  */
    2753             : void
    2754        7328 : AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
    2755             :                   SubTransactionId parentSubid)
    2756             : {
    2757             :     Index       i;
    2758             : 
    2759        7328 :     for (i = 0; i < numAllocatedDescs; i++)
    2760             :     {
    2761           0 :         if (allocatedDescs[i].create_subid == mySubid)
    2762             :         {
    2763           0 :             if (isCommit)
    2764           0 :                 allocatedDescs[i].create_subid = parentSubid;
    2765             :             else
    2766             :             {
    2767             :                 /* have to recheck the item after FreeDesc (ugly) */
    2768           0 :                 FreeDesc(&allocatedDescs[i--]);
    2769             :             }
    2770             :         }
    2771             :     }
    2772        7328 : }
    2773             : 
    2774             : /*
    2775             :  * AtEOXact_Files
    2776             :  *
    2777             :  * This routine is called during transaction commit or abort.  All still-open
    2778             :  * per-transaction temporary file VFDs are closed, which also causes the
    2779             :  * underlying files to be deleted (although they should've been closed already
    2780             :  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
    2781             :  * closed. We also forget any transaction-local temp tablespace list.
    2782             :  *
    2783             :  * The isCommit flag is used only to decide whether to emit warnings about
    2784             :  * unclosed files.
    2785             :  */
    2786             : void
    2787      455788 : AtEOXact_Files(bool isCommit)
    2788             : {
    2789      455788 :     CleanupTempFiles(isCommit, false);
    2790      455788 :     tempTableSpaces = NULL;
    2791      455788 :     numTempTableSpaces = -1;
    2792      455788 : }
    2793             : 
    2794             : /*
    2795             :  * AtProcExit_Files
    2796             :  *
    2797             :  * on_proc_exit hook to clean up temp files during backend shutdown.
    2798             :  * Here, we want to clean up *all* temp files including interXact ones.
    2799             :  */
    2800             : static void
    2801       12256 : AtProcExit_Files(int code, Datum arg)
    2802             : {
    2803       12256 :     CleanupTempFiles(false, true);
    2804       12256 : }
    2805             : 
    2806             : /*
    2807             :  * Close temporary files and delete their underlying files.
    2808             :  *
    2809             :  * isCommit: if true, this is normal transaction commit, and we don't
    2810             :  * expect any remaining files; warn if there are some.
    2811             :  *
    2812             :  * isProcExit: if true, this is being called as the backend process is
    2813             :  * exiting. If that's the case, we should remove all temporary files; if
    2814             :  * that's not the case, we are being called for transaction commit/abort
    2815             :  * and should only remove transaction-local temp files.  In either case,
    2816             :  * also clean up "allocated" stdio files, dirs and fds.
    2817             :  */
    2818             : static void
    2819      468044 : CleanupTempFiles(bool isCommit, bool isProcExit)
    2820             : {
    2821             :     Index       i;
    2822             : 
    2823             :     /*
    2824             :      * Careful here: at proc_exit we need extra cleanup, not just
    2825             :      * xact_temporary files.
    2826             :      */
    2827      468044 :     if (isProcExit || have_xact_temporary_files)
    2828             :     {
    2829             :         Assert(FileIsNotOpen(0));   /* Make sure ring not corrupted */
    2830      690572 :         for (i = 1; i < SizeVfdCache; i++)
    2831             :         {
    2832      677996 :             unsigned short fdstate = VfdCache[i].fdstate;
    2833             : 
    2834      677996 :             if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
    2835           0 :                 VfdCache[i].fileName != NULL)
    2836             :             {
    2837             :                 /*
    2838             :                  * If we're in the process of exiting a backend process, close
    2839             :                  * all temporary files. Otherwise, only close temporary files
    2840             :                  * local to the current transaction. They should be closed by
    2841             :                  * the ResourceOwner mechanism already, so this is just a
    2842             :                  * debugging cross-check.
    2843             :                  */
    2844           0 :                 if (isProcExit)
    2845           0 :                     FileClose(i);
    2846           0 :                 else if (fdstate & FD_CLOSE_AT_EOXACT)
    2847             :                 {
    2848           0 :                     elog(WARNING,
    2849             :                          "temporary file %s not closed at end-of-transaction",
    2850             :                          VfdCache[i].fileName);
    2851           0 :                     FileClose(i);
    2852             :                 }
    2853             :             }
    2854             :         }
    2855             : 
    2856       12576 :         have_xact_temporary_files = false;
    2857             :     }
    2858             : 
    2859             :     /* Complain if any allocated files remain open at commit. */
    2860      468044 :     if (isCommit && numAllocatedDescs > 0)
    2861           0 :         elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
    2862             :              numAllocatedDescs);
    2863             : 
    2864             :     /* Clean up "allocated" stdio files, dirs and fds. */
    2865      936116 :     while (numAllocatedDescs > 0)
    2866          28 :         FreeDesc(&allocatedDescs[0]);
    2867      468044 : }
    2868             : 
    2869             : 
    2870             : /*
    2871             :  * Remove temporary and temporary relation files left over from a prior
    2872             :  * postmaster session
    2873             :  *
    2874             :  * This should be called during postmaster startup.  It will forcibly
    2875             :  * remove any leftover files created by OpenTemporaryFile and any leftover
    2876             :  * temporary relation files created by mdcreate.
    2877             :  *
    2878             :  * NOTE: we could, but don't, call this during a post-backend-crash restart
    2879             :  * cycle.  The argument for not doing it is that someone might want to examine
    2880             :  * the temp files for debugging purposes.  This does however mean that
    2881             :  * OpenTemporaryFile had better allow for collision with an existing temp
    2882             :  * file name.
    2883             :  *
    2884             :  * NOTE: this function and its subroutines generally report syscall failures
    2885             :  * with ereport(LOG) and keep going.  Removing temp files is not so critical
    2886             :  * that we should fail to start the database when we can't do it.
    2887             :  */
    2888             : void
    2889         572 : RemovePgTempFiles(void)
    2890             : {
    2891             :     char        temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
    2892             :     DIR        *spc_dir;
    2893             :     struct dirent *spc_de;
    2894             : 
    2895             :     /*
    2896             :      * First process temp files in pg_default ($PGDATA/base)
    2897             :      */
    2898         572 :     snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
    2899         572 :     RemovePgTempFilesInDir(temp_path, true, false);
    2900         572 :     RemovePgTempRelationFiles("base");
    2901             : 
    2902             :     /*
    2903             :      * Cycle through temp directories for all non-default tablespaces.
    2904             :      */
    2905         572 :     spc_dir = AllocateDir("pg_tblspc");
    2906             : 
    2907        2294 :     while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
    2908             :     {
    2909        1728 :         if (strcmp(spc_de->d_name, ".") == 0 ||
    2910         578 :             strcmp(spc_de->d_name, "..") == 0)
    2911        1144 :             continue;
    2912             : 
    2913           6 :         snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
    2914           6 :                  spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
    2915           6 :         RemovePgTempFilesInDir(temp_path, true, false);
    2916             : 
    2917           6 :         snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
    2918           6 :                  spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
    2919           6 :         RemovePgTempRelationFiles(temp_path);
    2920             :     }
    2921             : 
    2922         572 :     FreeDir(spc_dir);
    2923             : 
    2924             :     /*
    2925             :      * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
    2926             :      * DataDir as well.
    2927             :      */
    2928             : #ifdef EXEC_BACKEND
    2929             :     RemovePgTempFilesInDir(PG_TEMP_FILES_DIR, true, false);
    2930             : #endif
    2931         572 : }
    2932             : 
    2933             : /*
    2934             :  * Process one pgsql_tmp directory for RemovePgTempFiles.
    2935             :  *
    2936             :  * If missing_ok is true, it's all right for the named directory to not exist.
    2937             :  * Any other problem results in a LOG message.  (missing_ok should be true at
    2938             :  * the top level, since pgsql_tmp directories are not created until needed.)
    2939             :  *
    2940             :  * At the top level, this should be called with unlink_all = false, so that
    2941             :  * only files matching the temporary name prefix will be unlinked.  When
    2942             :  * recursing it will be called with unlink_all = true to unlink everything
    2943             :  * under a top-level temporary directory.
    2944             :  *
    2945             :  * (These two flags could be replaced by one, but it seems clearer to keep
    2946             :  * them separate.)
    2947             :  */
    2948             : static void
    2949         578 : RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
    2950             : {
    2951             :     DIR        *temp_dir;
    2952             :     struct dirent *temp_de;
    2953             :     char        rm_path[MAXPGPATH * 2];
    2954             : 
    2955         578 :     temp_dir = AllocateDir(tmpdirname);
    2956             : 
    2957         578 :     if (temp_dir == NULL && errno == ENOENT && missing_ok)
    2958         576 :         return;
    2959             : 
    2960           8 :     while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
    2961             :     {
    2962           6 :         if (strcmp(temp_de->d_name, ".") == 0 ||
    2963           2 :             strcmp(temp_de->d_name, "..") == 0)
    2964           4 :             continue;
    2965             : 
    2966           0 :         snprintf(rm_path, sizeof(rm_path), "%s/%s",
    2967           0 :                  tmpdirname, temp_de->d_name);
    2968             : 
    2969           0 :         if (unlink_all ||
    2970           0 :             strncmp(temp_de->d_name,
    2971             :                     PG_TEMP_FILE_PREFIX,
    2972             :                     strlen(PG_TEMP_FILE_PREFIX)) == 0)
    2973           0 :         {
    2974             :             struct stat statbuf;
    2975             : 
    2976           0 :             if (lstat(rm_path, &statbuf) < 0)
    2977             :             {
    2978           0 :                 ereport(LOG,
    2979             :                         (errcode_for_file_access(),
    2980             :                          errmsg("could not stat file \"%s\": %m", rm_path)));
    2981           0 :                 continue;
    2982             :             }
    2983             : 
    2984           0 :             if (S_ISDIR(statbuf.st_mode))
    2985             :             {
    2986             :                 /* recursively remove contents, then directory itself */
    2987           0 :                 RemovePgTempFilesInDir(rm_path, false, true);
    2988             : 
    2989           0 :                 if (rmdir(rm_path) < 0)
    2990           0 :                     ereport(LOG,
    2991             :                             (errcode_for_file_access(),
    2992             :                              errmsg("could not remove directory \"%s\": %m",
    2993             :                                     rm_path)));
    2994             :             }
    2995             :             else
    2996             :             {
    2997           0 :                 if (unlink(rm_path) < 0)
    2998           0 :                     ereport(LOG,
    2999             :                             (errcode_for_file_access(),
    3000             :                              errmsg("could not remove file \"%s\": %m",
    3001             :                                     rm_path)));
    3002             :             }
    3003             :         }
    3004             :         else
    3005           0 :             ereport(LOG,
    3006             :                     (errmsg("unexpected file found in temporary-files directory: \"%s\"",
    3007             :                             rm_path)));
    3008             :     }
    3009             : 
    3010           2 :     FreeDir(temp_dir);
    3011             : }
    3012             : 
    3013             : /* Process one tablespace directory, look for per-DB subdirectories */
    3014             : static void
    3015         578 : RemovePgTempRelationFiles(const char *tsdirname)
    3016             : {
    3017             :     DIR        *ts_dir;
    3018             :     struct dirent *de;
    3019             :     char        dbspace_path[MAXPGPATH * 2];
    3020             : 
    3021         578 :     ts_dir = AllocateDir(tsdirname);
    3022             : 
    3023        4144 :     while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
    3024             :     {
    3025             :         /*
    3026             :          * We're only interested in the per-database directories, which have
    3027             :          * numeric names.  Note that this code will also (properly) ignore "."
    3028             :          * and "..".
    3029             :          */
    3030        2988 :         if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
    3031        1158 :             continue;
    3032             : 
    3033        1830 :         snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
    3034        1830 :                  tsdirname, de->d_name);
    3035        1830 :         RemovePgTempRelationFilesInDbspace(dbspace_path);
    3036             :     }
    3037             : 
    3038         578 :     FreeDir(ts_dir);
    3039         578 : }
    3040             : 
    3041             : /* Process one per-dbspace directory for RemovePgTempRelationFiles */
    3042             : static void
    3043        1830 : RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
    3044             : {
    3045             :     DIR        *dbspace_dir;
    3046             :     struct dirent *de;
    3047             :     char        rm_path[MAXPGPATH * 2];
    3048             : 
    3049        1830 :     dbspace_dir = AllocateDir(dbspacedirname);
    3050             : 
    3051      567088 :     while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
    3052             :     {
    3053      563428 :         if (!looks_like_temp_rel_name(de->d_name))
    3054      563420 :             continue;
    3055             : 
    3056           8 :         snprintf(rm_path, sizeof(rm_path), "%s/%s",
    3057           8 :                  dbspacedirname, de->d_name);
    3058             : 
    3059           8 :         if (unlink(rm_path) < 0)
    3060           0 :             ereport(LOG,
    3061             :                     (errcode_for_file_access(),
    3062             :                      errmsg("could not remove file \"%s\": %m",
    3063             :                             rm_path)));
    3064             :     }
    3065             : 
    3066        1830 :     FreeDir(dbspace_dir);
    3067        1830 : }
    3068             : 
    3069             : /* t<digits>_<digits> or t<digits>_<digits>_<forkname>, optionally followed by .<digits> (segment number) */
    3070             : bool
    3071      648144 : looks_like_temp_rel_name(const char *name)
    3072             : {
    3073             :     int         pos;
    3074             :     int         savepos;
    3075             : 
    3076             :     /* Must start with "t". */
    3077      648144 :     if (name[0] != 't')
    3078      648104 :         return false;
    3079             : 
    3080             :     /* Followed by a non-empty string of digits and then an underscore. */
    3081          40 :     for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
    3082             :         ;
    3083          40 :     if (pos == 1 || name[pos] != '_')
    3084           0 :         return false;
    3085             : 
    3086             :     /* Followed by another non-empty string of digits. */
    3087          40 :     for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
    3088             :         ;
    3089          40 :     if (savepos == pos)
    3090           0 :         return false;
    3091             : 
    3092             :     /* We might have _forkname or .segment or both. */
    3093          40 :     if (name[pos] == '_')
    3094             :     {
    3095          20 :         int         forkchar = forkname_chars(&name[pos + 1], NULL);
    3096             : 
    3097          20 :         if (forkchar <= 0)
    3098           0 :             return false;
    3099          20 :         pos += forkchar + 1;
    3100             :     }
    3101          40 :     if (name[pos] == '.')
    3102             :     {
    3103             :         int         segchar;
    3104             : 
    3105          20 :         for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
    3106             :             ;
    3107          20 :         if (segchar <= 1)
    3108           0 :             return false;
    3109          20 :         pos += segchar;
    3110             :     }
    3111             : 
    3112             :     /* Now we should be at the end. */
    3113          40 :     if (name[pos] != '\0')
    3114           0 :         return false;
    3115          40 :     return true;
    3116             : }
    3117             : 
    3118             : 
    3119             : /*
    3120             :  * Issue fsync recursively on PGDATA and all its contents.
    3121             :  *
    3122             :  * We fsync regular files and directories wherever they are, but we
    3123             :  * follow symlinks only for pg_wal and immediately under pg_tblspc.
    3124             :  * Other symlinks are presumed to point at files we're not responsible
    3125             :  * for fsyncing, and might not have privileges to write at all.
    3126             :  *
    3127             :  * Errors are logged but not considered fatal; that's because this is used
    3128             :  * only during database startup, to deal with the possibility that there are
    3129             :  * issued-but-unsynced writes pending against the data directory.  We want to
    3130             :  * ensure that such writes reach disk before anything that's done in the new
    3131             :  * run.  However, aborting on error would result in failure to start for
    3132             :  * harmless cases such as read-only files in the data directory, and that's
    3133             :  * not good either.
    3134             :  *
    3135             :  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
    3136             :  * rewriting all changes again during recovery.
    3137             :  *
    3138             :  * Note we assume we're chdir'd into PGDATA to begin with.
    3139             :  */
    3140             : void
    3141          82 : SyncDataDirectory(void)
    3142             : {
    3143             :     bool        xlog_is_symlink;
    3144             : 
    3145             :     /* We can skip this whole thing if fsync is disabled. */
    3146          82 :     if (!enableFsync)
    3147          80 :         return;
    3148             : 
    3149             :     /*
    3150             :      * If pg_wal is a symlink, we'll need to recurse into it separately,
    3151             :      * because the first walkdir below will ignore it.
    3152             :      */
    3153           2 :     xlog_is_symlink = false;
    3154             : 
    3155             : #ifndef WIN32
    3156             :     {
    3157             :         struct stat st;
    3158             : 
    3159           2 :         if (lstat("pg_wal", &st) < 0)
    3160           0 :             ereport(LOG,
    3161             :                     (errcode_for_file_access(),
    3162             :                      errmsg("could not stat file \"%s\": %m",
    3163             :                             "pg_wal")));
    3164           2 :         else if (S_ISLNK(st.st_mode))
    3165           0 :             xlog_is_symlink = true;
    3166             :     }
    3167             : #else
    3168             :     if (pgwin32_is_junction("pg_wal"))
    3169             :         xlog_is_symlink = true;
    3170             : #endif
    3171             : 
    3172             :     /*
    3173             :      * If possible, hint to the kernel that we're soon going to fsync the data
    3174             :      * directory and its contents.  Errors in this step are even less
    3175             :      * interesting than normal, so log them only at DEBUG1.
    3176             :      */
    3177             : #ifdef PG_FLUSH_DATA_WORKS
    3178           2 :     walkdir(".", pre_sync_fname, false, DEBUG1);
    3179           2 :     if (xlog_is_symlink)
    3180           0 :         walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
    3181           2 :     walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
    3182             : #endif
    3183             : 
    3184             :     /*
    3185             :      * Now we do the fsync()s in the same order.
    3186             :      *
    3187             :      * The main call ignores symlinks, so in addition to specially processing
    3188             :      * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
    3189             :      * process_symlinks = true.  Note that if there are any plain directories
    3190             :      * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
    3191             :      * so we don't worry about optimizing it.
    3192             :      */
    3193           2 :     walkdir(".", datadir_fsync_fname, false, LOG);
    3194           2 :     if (xlog_is_symlink)
    3195           0 :         walkdir("pg_wal", datadir_fsync_fname, false, LOG);
    3196           2 :     walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
    3197             : }
    3198             : 
    3199             : /*
    3200             :  * walkdir: recursively walk a directory, applying the action to each
    3201             :  * regular file and directory (including the named directory itself).
    3202             :  *
    3203             :  * If process_symlinks is true, the action and recursion are also applied
    3204             :  * to regular files and directories that are pointed to by symlinks in the
    3205             :  * given directory; otherwise symlinks are ignored.  Symlinks are always
    3206             :  * ignored in subdirectories, ie we intentionally don't pass down the
    3207             :  * process_symlinks flag to recursive calls.
    3208             :  *
    3209             :  * Errors are reported at level elevel, which might be ERROR or less.
    3210             :  *
    3211             :  * See also walkdir in initdb.c, which is a frontend version of this logic.
    3212             :  */
    3213             : static void
    3214         284 : walkdir(const char *path,
    3215             :         void (*action) (const char *fname, bool isdir, int elevel),
    3216             :         bool process_symlinks,
    3217             :         int elevel)
    3218             : {
    3219             :     DIR        *dir;
    3220             :     struct dirent *de;
    3221             : 
    3222         284 :     dir = AllocateDir(path);
    3223             : 
    3224        6530 :     while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
    3225             :     {
    3226             :         char        subpath[MAXPGPATH * 2];
    3227             :         struct stat fst;
    3228             :         int         sret;
    3229             : 
    3230        5962 :         CHECK_FOR_INTERRUPTS();
    3231             : 
    3232       11640 :         if (strcmp(de->d_name, ".") == 0 ||
    3233        5678 :             strcmp(de->d_name, "..") == 0)
    3234        1136 :             continue;
    3235             : 
    3236        5394 :         snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
    3237             : 
    3238        5394 :         if (process_symlinks)
    3239           0 :             sret = stat(subpath, &fst);
    3240             :         else
    3241        5394 :             sret = lstat(subpath, &fst);
    3242             : 
    3243        5394 :         if (sret < 0)
    3244             :         {
    3245           0 :             ereport(elevel,
    3246             :                     (errcode_for_file_access(),
    3247             :                      errmsg("could not stat file \"%s\": %m", subpath)));
    3248           0 :             continue;
    3249             :         }
    3250             : 
    3251        5394 :         if (S_ISREG(fst.st_mode))
    3252        5294 :             (*action) (subpath, false, elevel);
    3253         100 :         else if (S_ISDIR(fst.st_mode))
    3254         100 :             walkdir(subpath, action, false, elevel);
    3255             :     }
    3256             : 
    3257         284 :     FreeDir(dir);               /* we ignore any error here */
    3258             : 
    3259             :     /*
    3260             :      * It's important to fsync the destination directory itself as individual
    3261             :      * file fsyncs don't guarantee that the directory entry for the file is
    3262             :      * synced.  However, skip this if AllocateDir failed; the action function
    3263             :      * might not be robust against that.
    3264             :      */
    3265         284 :     if (dir)
    3266         284 :         (*action) (path, true, elevel);
    3267         284 : }
    3268             : 
    3269             : 
    3270             : /*
    3271             :  * Hint to the OS that it should get ready to fsync() this file.
    3272             :  *
    3273             :  * Ignores errors trying to open unreadable files, and logs other errors at a
    3274             :  * caller-specified level.
    3275             :  */
    3276             : #ifdef PG_FLUSH_DATA_WORKS
    3277             : 
    3278             : static void
    3279        2022 : pre_sync_fname(const char *fname, bool isdir, int elevel)
    3280             : {
    3281             :     int         fd;
    3282             : 
    3283             :     /* Don't try to flush directories, it'll likely just fail */
    3284        2022 :     if (isdir)
    3285          54 :         return;
    3286             : 
    3287        1968 :     fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
    3288             : 
    3289        1968 :     if (fd < 0)
    3290             :     {
    3291           0 :         if (errno == EACCES)
    3292           0 :             return;
    3293           0 :         ereport(elevel,
    3294             :                 (errcode_for_file_access(),
    3295             :                  errmsg("could not open file \"%s\": %m", fname)));
    3296           0 :         return;
    3297             :     }
    3298             : 
    3299             :     /*
    3300             :      * pg_flush_data() ignores errors, which is ok because this is only a
    3301             :      * hint.
    3302             :      */
    3303        1968 :     pg_flush_data(fd, 0, 0);
    3304             : 
    3305        1968 :     if (CloseTransientFile(fd))
    3306           0 :         ereport(elevel,
    3307             :                 (errcode_for_file_access(),
    3308             :                  errmsg("could not close file \"%s\": %m", fname)));
    3309             : }
    3310             : 
    3311             : #endif                          /* PG_FLUSH_DATA_WORKS */
    3312             : 
    3313             : static void
    3314        2022 : datadir_fsync_fname(const char *fname, bool isdir, int elevel)
    3315             : {
    3316             :     /*
    3317             :      * We want to silently ignore errors about unreadable files.  Pass that
    3318             :      * desire on to fsync_fname_ext().
    3319             :      */
    3320        2022 :     fsync_fname_ext(fname, isdir, true, elevel);
    3321        2022 : }
    3322             : 
    3323             : static void
    3324        1534 : unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
    3325             : {
    3326        1534 :     if (isdir)
    3327             :     {
    3328         176 :         if (rmdir(fname) != 0 && errno != ENOENT)
    3329           0 :             ereport(elevel,
    3330             :                     (errcode_for_file_access(),
    3331             :                      errmsg("could not rmdir directory \"%s\": %m", fname)));
    3332             :     }
    3333             :     else
    3334             :     {
    3335             :         /* Use PathNameDeleteTemporaryFile to report filesize */
    3336        1358 :         PathNameDeleteTemporaryFile(fname, false);
    3337             :     }
    3338        1534 : }
    3339             : 
    3340             : /*
    3341             :  * fsync_fname_ext -- Try to fsync a file or directory
    3342             :  *
    3343             :  * If ignore_perm is true, ignore errors upon trying to open unreadable
    3344             :  * files. Logs other errors at a caller-specified level.
    3345             :  *
    3346             :  * Returns 0 if the operation succeeded, -1 otherwise.
    3347             :  */
    3348             : static int
    3349       28348 : fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
    3350             : {
    3351             :     int         fd;
    3352             :     int         flags;
    3353             :     int         returncode;
    3354             : 
    3355             :     /*
    3356             :      * Some OSs require directories to be opened read-only whereas other
    3357             :      * systems don't allow us to fsync files opened read-only; so we need both
    3358             :      * cases here.  Using O_RDWR will cause us to fail to fsync files that are
    3359             :      * not writable by our userid, but we assume that's OK.
    3360             :      */
    3361       28348 :     flags = PG_BINARY;
    3362       28348 :     if (!isdir)
    3363       11686 :         flags |= O_RDWR;
    3364             :     else
    3365       16662 :         flags |= O_RDONLY;
    3366             : 
    3367       28348 :     fd = OpenTransientFile(fname, flags);
    3368             : 
    3369             :     /*
    3370             :      * Some OSs don't allow us to open directories at all (Windows returns
    3371             :      * EACCES); just ignore the error in that case.  If desired, also silently
    3372             :      * ignore errors about unreadable files.  Log others.
    3373             :      */
    3374       28348 :     if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
    3375           0 :         return 0;
    3376       28348 :     else if (fd < 0 && ignore_perm && errno == EACCES)
    3377           0 :         return 0;
    3378       28348 :     else if (fd < 0)
    3379             :     {
    3380           0 :         ereport(elevel,
    3381             :                 (errcode_for_file_access(),
    3382             :                  errmsg("could not open file \"%s\": %m", fname)));
    3383           0 :         return -1;
    3384             :     }
    3385             : 
    3386       28348 :     returncode = pg_fsync(fd);
    3387             : 
    3388             :     /*
    3389             :      * Some OSes don't allow us to fsync directories at all, so we can ignore
    3390             :      * those errors. Anything else needs to be logged.
    3391             :      */
    3392       28348 :     if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
    3393             :     {
    3394             :         int         save_errno;
    3395             : 
    3396             :         /* close file upon error, might not be in transaction context */
    3397           0 :         save_errno = errno;
    3398           0 :         (void) CloseTransientFile(fd);
    3399           0 :         errno = save_errno;
    3400             : 
    3401           0 :         ereport(elevel,
    3402             :                 (errcode_for_file_access(),
    3403             :                  errmsg("could not fsync file \"%s\": %m", fname)));
    3404           0 :         return -1;
    3405             :     }
    3406             : 
    3407       28348 :     if (CloseTransientFile(fd))
    3408             :     {
    3409           0 :         ereport(elevel,
    3410             :                 (errcode_for_file_access(),
    3411             :                  errmsg("could not close file \"%s\": %m", fname)));
    3412           0 :         return -1;
    3413             :     }
    3414             : 
    3415       28348 :     return 0;
    3416             : }
    3417             : 
    3418             : /*
    3419             :  * fsync_parent_path -- fsync the parent path of a file or directory
    3420             :  *
    3421             :  * This is aimed at making file operations persistent on disk in case of
    3422             :  * an OS crash or power failure.
    3423             :  */
    3424             : static int
    3425        4362 : fsync_parent_path(const char *fname, int elevel)
    3426             : {
    3427             :     char        parentpath[MAXPGPATH];
    3428             : 
    3429        4362 :     strlcpy(parentpath, fname, MAXPGPATH);
    3430        4362 :     get_parent_directory(parentpath);
    3431             : 
    3432             :     /*
    3433             :      * get_parent_directory() returns an empty string if the input argument is
    3434             :      * just a file name (see comments in path.c), so handle that as being the
    3435             :      * current directory.
    3436             :      */
    3437        4362 :     if (strlen(parentpath) == 0)
    3438         128 :         strlcpy(parentpath, ".", MAXPGPATH);
    3439             : 
    3440        4362 :     if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
    3441           0 :         return -1;
    3442             : 
    3443        4362 :     return 0;
    3444             : }
    3445             : 
    3446             : /*
    3447             :  * Create a PostgreSQL data sub-directory
    3448             :  *
    3449             :  * The data directory itself, and most of its sub-directories, are created at
    3450             :  * initdb time, but we do have some occasions when we create directories in
    3451             :  * the backend (CREATE TABLESPACE, for example).  In those cases, we want to
    3452             :  * make sure that those directories are created consistently.  Today, that means
    3453             :  * making sure that the created directory has the correct permissions, which is
    3454             :  * what pg_dir_create_mode tracks for us.
    3455             :  *
    3456             :  * Note that we also set the umask() based on what we understand the correct
    3457             :  * permissions to be (see file_perm.c).
    3458             :  *
    3459             :  * For permissions other than the default, mkdir() can be used directly, but
    3460             :  * be sure to consider carefully such cases -- a sub-directory with incorrect
    3461             :  * permissions in a PostgreSQL data directory could cause backups and other
    3462             :  * processes to fail.
    3463             :  */
    3464             : int
    3465        1478 : MakePGDirectory(const char *directoryName)
    3466             : {
    3467        1478 :     return mkdir(directoryName, pg_dir_create_mode);
    3468             : }
    3469             : 
    3470             : /*
    3471             :  * Return the passed-in error level, or PANIC if data_sync_retry is off.
    3472             :  *
    3473             :  * Failure to fsync any data file is cause for immediate panic, unless
    3474             :  * data_sync_retry is enabled.  Data may have been written to the operating
    3475             :  * system and removed from our buffer pool already, and if we are running on
    3476             :  * an operating system that forgets dirty data on write-back failure, there
    3477             :  * may be only one copy of the data remaining: in the WAL.  A later attempt to
    3478             :  * fsync again might falsely report success.  Therefore we must not allow any
    3479             :  * further checkpoints to be attempted.  data_sync_retry can in theory be
    3480             :  * enabled on systems known not to drop dirty buffered data on write-back
    3481             :  * failure (with the likely outcome that checkpoints will continue to fail
    3482             :  * until the underlying problem is fixed).
    3483             :  *
    3484             :  * Any code that reports a failure from fsync() or related functions should
    3485             :  * filter the error level with this function.
    3486             :  */
    3487             : int
    3488       13340 : data_sync_elevel(int elevel)
    3489             : {
    3490       13340 :     return data_sync_retry ? elevel : PANIC;
    3491             : }

Generated by: LCOV version 1.13