Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * fd.c
4 : * Virtual file descriptor code.
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/storage/file/fd.c
11 : *
12 : * NOTES:
13 : *
14 : * This code manages a cache of 'virtual' file descriptors (VFDs).
15 : * The server opens many file descriptors for a variety of reasons,
16 : * including base tables, scratch files (e.g., sort and hash spool
17 : * files), and random calls to C library routines like system(3); it
18 : * is quite easy to exceed system limits on the number of open files a
19 : * single process can have. (This is around 1024 on many modern
20 : * operating systems, but may be lower on others.)
21 : *
22 : * VFDs are managed as an LRU pool, with actual OS file descriptors
23 : * being opened and closed as needed. Obviously, if a file is
24 : * opened using these interfaces, all subsequent operations on it must also
25 : * be through these interfaces (the File type is not a real file
26 : * descriptor).
27 : *
28 : * For this scheme to work, most (if not all) routines throughout the
29 : * server should use these interfaces instead of calling the C library
30 : * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 : * may find ourselves short of real file descriptors anyway.
32 : *
33 : * INTERFACE ROUTINES
34 : *
35 : * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 : * A File opened with OpenTemporaryFile is automatically deleted when the
37 : * File is closed, either explicitly or implicitly at end of transaction or
38 : * process exit. PathNameOpenFile is intended for files that are held open
39 : * for a long time, like relation files. It is the caller's responsibility
40 : * to close them; there is no automatic mechanism in fd.c for that.
41 : *
42 : * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 : * temporary files that have names so that they can be shared between
44 : * backends. Such files are automatically closed and count against the
45 : * temporary file limit of the backend that creates them, but unlike anonymous
46 : * files they are not automatically deleted. See sharedfileset.c for a shared
47 : * ownership mechanism that provides automatic cleanup for shared files when
48 : * the last of a group of backends detaches.
49 : *
50 : * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 : * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 : * They behave like the corresponding native functions, except that the handle
53 : * is registered with the current subtransaction, and will be automatically
54 : * closed at abort. These are intended mainly for short operations like
55 : * reading a configuration file; there is a limit on the number of files that
56 : * can be opened using these functions at any one time.
57 : *
58 : * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 : * release file descriptors in use by the virtual file descriptors if
60 : * necessary. There is no automatic cleanup of file descriptors returned by
61 : * BasicOpenFile; it is solely the caller's responsibility to close the file
62 : * descriptor by calling close(2).
63 : *
64 : * If a non-virtual file descriptor needs to be held open for any length of
65 : * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 : * (and eventually ReleaseExternalFD), so that we can take it into account
67 : * while deciding how many VFDs can be open. This applies to FDs obtained
68 : * with BasicOpenFile as well as those obtained without use of any fd.c API.
69 : *
70 : *-------------------------------------------------------------------------
71 : */
72 :
73 : #include "postgres.h"
74 :
75 : #include <dirent.h>
76 : #include <sys/file.h>
77 : #include <sys/param.h>
78 : #include <sys/resource.h> /* for getrlimit */
79 : #include <sys/stat.h>
80 : #include <sys/types.h>
81 : #ifndef WIN32
82 : #include <sys/mman.h>
83 : #endif
84 : #include <limits.h>
85 : #include <unistd.h>
86 : #include <fcntl.h>
87 :
88 : #include "access/xact.h"
89 : #include "access/xlog.h"
90 : #include "catalog/pg_tablespace.h"
91 : #include "common/file_perm.h"
92 : #include "common/file_utils.h"
93 : #include "common/pg_prng.h"
94 : #include "miscadmin.h"
95 : #include "pgstat.h"
96 : #include "postmaster/startup.h"
97 : #include "storage/aio.h"
98 : #include "storage/fd.h"
99 : #include "storage/ipc.h"
100 : #include "utils/guc.h"
101 : #include "utils/guc_hooks.h"
102 : #include "utils/resowner.h"
103 : #include "utils/varlena.h"
104 :
/*
 * PG_FLUSH_DATA_WORKS is defined whenever at least one of the mechanisms
 * pg_flush_data() can use is available at compile time: sync_file_range(),
 * msync(MS_ASYNC) on non-Windows platforms, or
 * posix_fadvise(POSIX_FADV_DONTNEED).
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
	(!defined(WIN32) && defined(MS_ASYNC)) || \
	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif
113 :
114 : /*
115 : * We must leave some file descriptors free for system(), the dynamic loader,
116 : * and other code that tries to open files without consulting fd.c. This
117 : * is the number left free. (While we try fairly hard to prevent EMFILE
118 : * errors, there's never any guarantee that we won't get ENFILE due to
119 : * other processes chewing up FDs. So it's a bad idea to try to open files
120 : * without consulting fd.c. Nonetheless we cannot control all code.)
121 : *
122 : * Because this is just a fixed setting, we are effectively assuming that
123 : * no such code will leave FDs open over the long term; otherwise the slop
124 : * is likely to be insufficient. Note in particular that we expect that
125 : * loading a shared library does not result in any permanent increase in
126 : * the number of open files. (This appears to be true on most if not
127 : * all platforms as of Feb 2004.)
128 : */
129 : #define NUM_RESERVED_FDS 10
130 :
131 : /*
132 : * If we have fewer than this many usable FDs after allowing for the reserved
133 : * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
134 : * much less than that. Note that this value ensures numExternalFDs can be
135 : * at least 16; as of this writing, the contrib/postgres_fdw regression tests
136 : * will not pass unless that can grow to at least 14.)
137 : */
138 : #define FD_MINFREE 48
139 :
140 : /*
141 : * A number of platforms allow individual processes to open many more files
142 : * than they can really support when *many* processes do the same thing.
143 : * This GUC parameter lets the DBA limit max_safe_fds to something less than
144 : * what the postmaster's initial probe suggests will work.
145 : */
146 : int max_files_per_process = 1000;
147 :
148 : /*
149 : * Maximum number of file descriptors to open for operations that fd.c knows
150 : * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
151 : * to a conservative value, and remains that way indefinitely in bootstrap or
152 : * standalone-backend cases. In normal postmaster operation, the postmaster
153 : * calls set_max_safe_fds() late in initialization to update the value, and
154 : * that value is then inherited by forked subprocesses.
155 : *
156 : * Note: the value of max_files_per_process is taken into account while
157 : * setting this variable, and so need not be tested separately.
158 : */
159 : int max_safe_fds = FD_MINFREE; /* default if not changed */
160 :
161 : /* Whether it is safe to continue running after fsync() fails. */
162 : bool data_sync_retry = false;
163 :
164 : /* How SyncDataDirectory() should do its job. */
165 : int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
166 :
167 : /* How data files should be bulk-extended with zeros. */
168 : int file_extend_method = DEFAULT_FILE_EXTEND_METHOD;
169 :
170 : /* Which kinds of files should be opened with PG_O_DIRECT. */
171 : int io_direct_flags;
172 :
173 : /* Debugging.... */
174 :
/*
 * DO_DB(A) wraps a debugging statement A.  With FDDEBUG defined, A is
 * executed with errno saved beforehand and restored afterwards, so that the
 * debug output cannot disturb an errno value the surrounding code still
 * needs.  Without FDDEBUG the macro expands to a no-op and A is not
 * evaluated at all.
 */
#ifdef FDDEBUG
#define DO_DB(A) \
	do { \
		int			_do_db_save_errno = errno; \
		A; \
		errno = _do_db_save_errno; \
	} while (0)
#else
#define DO_DB(A) \
	((void) 0)
#endif
186 :
/* Value of Vfd.fd while no kernel file descriptor is attached to the VFD. */
#define VFD_CLOSED (-1)

/*
 * True if 'file' is the index of a VfdCache entry that is currently in use
 * (i.e. has a fileName).  Index 0 is rejected because VfdCache[0] is only the
 * list header.  Note that 'file' is evaluated more than once.
 */
#define FileIsValid(file) \
	((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/*
 * True if the VFD has no kernel FD at the moment.  This does not range-check
 * 'file'.
 */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)

/* these are the assigned bits in fdstate below: */
#define FD_DELETE_AT_CLOSE	(1 << 0)	/* T = delete when closed */
#define FD_CLOSE_AT_EOXACT	(1 << 1)	/* T = close at eoXact */
#define FD_TEMP_FILE_LIMIT	(1 << 2)	/* T = respect temp_file_limit */
198 :
/*
 * One entry of the VfdCache array, describing a single virtual file
 * descriptor.  An entry is in use ("virtually open") when fileName is
 * non-NULL; it additionally owns a kernel FD, and is linked into the LRU
 * ring, only while fd != VFD_CLOSED.  fileFlags and fileMode are kept so the
 * file can be (re)opened with open(2) later.
 */
typedef struct vfd
{
	int			fd;				/* current FD, or VFD_CLOSED if none */
	unsigned short fdstate;		/* bitflags for VFD's state */
	ResourceOwner resowner;		/* owner, for automatic cleanup */
	File		nextFree;		/* link to next free VFD, if in freelist */
	File		lruMoreRecently;	/* doubly linked recency-of-use list */
	File		lruLessRecently;
	pgoff_t		fileSize;		/* current size of file (0 if not temporary) */
	char	   *fileName;		/* name of file, or NULL for unused VFD */
	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
	int			fileFlags;		/* open(2) flags for (re)opening the file */
	mode_t		fileMode;		/* mode to pass to open(2) */
} Vfd;
213 :
214 : /*
215 : * Virtual File Descriptor array pointer and size. This grows as
216 : * needed. 'File' values are indexes into this array.
217 : * Note that VfdCache[0] is not a usable VFD, just a list header.
218 : */
219 : static Vfd *VfdCache;
220 : static Size SizeVfdCache = 0;
221 :
222 : /*
223 : * Number of file descriptors known to be in use by VFD entries.
224 : */
225 : static int nfile = 0;
226 :
227 : /*
228 : * Flag to tell whether it's worth scanning VfdCache looking for temp files
229 : * to close
230 : */
231 : static bool have_xact_temporary_files = false;
232 :
233 : /*
234 : * Tracks the total size of all temporary files. Note: when temp_file_limit
235 : * is being enforced, this cannot overflow since the limit cannot be more
236 : * than INT_MAX kilobytes. When not enforcing, it could theoretically
237 : * overflow, but we don't care.
238 : */
239 : static uint64 temporary_files_size = 0;
240 :
241 : /* Temporary file access initialized and not yet shut down? */
242 : #ifdef USE_ASSERT_CHECKING
243 : static bool temporary_files_allowed = false;
244 : #endif
245 :
/*
 * List of OS handles opened with AllocateFile, OpenPipeStream, AllocateDir
 * and OpenTransientFile.
 *
 * NOTE(review): judging by the names, each AllocateDescKind value presumably
 * corresponds to one of those four entry points (File/Pipe/Dir/RawFD) --
 * confirm against the functions that fill in AllocateDesc.
 */
typedef enum
{
	AllocateDescFile,
	AllocateDescPipe,
	AllocateDescDir,
	AllocateDescRawFD,
} AllocateDescKind;

/*
 * One tracked handle.  'kind' tells which member of 'desc' is meaningful;
 * 'create_subid' records a subtransaction ID for the handle (presumably the
 * subtransaction that opened it, used for cleanup at abort -- verify against
 * the code that sets it).
 */
typedef struct
{
	AllocateDescKind kind;
	SubTransactionId create_subid;
	union
	{
		FILE	   *file;		/* stdio stream */
		DIR		   *dir;		/* directory stream */
		int			fd;			/* raw kernel file descriptor */
	}			desc;
} AllocateDesc;
269 :
270 : static int numAllocatedDescs = 0;
271 : static int maxAllocatedDescs = 0;
272 : static AllocateDesc *allocatedDescs = NULL;
273 :
274 : /*
275 : * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
276 : */
277 : static int numExternalFDs = 0;
278 :
279 : /*
280 : * Number of temporary files opened during the current session;
281 : * this is used in generation of tempfile names.
282 : */
283 : static long tempFileCounter = 0;
284 :
285 : /*
286 : * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
287 : * indicating that the current database's default tablespace should be used.)
288 : * When numTempTableSpaces is -1, this has not been set in the current
289 : * transaction.
290 : */
291 : static Oid *tempTableSpaces = NULL;
292 : static int numTempTableSpaces = -1;
293 : static int nextTempTableSpace = 0;
294 :
295 :
296 : /*--------------------
297 : *
298 : * Private Routines
299 : *
300 : * Delete - delete a file from the Lru ring
301 : * LruDelete - remove a file from the Lru ring and close its FD
302 : * Insert - put a file at the front of the Lru ring
303 : * LruInsert - put a file at the front of the Lru ring and open it
304 : * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
305 : * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
306 : * AllocateVfd - grab a free (or new) file record (from VfdCache)
307 : * FreeVfd - free a file record
308 : *
309 : * The Least Recently Used ring is a doubly linked list that begins and
310 : * ends on element zero. Element zero is special -- it doesn't represent
311 : * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
312 : * anchor that shows us the beginning/end of the ring.
313 : * Only VFD elements that are currently really open (have an FD assigned) are
314 : * in the Lru ring. Elements that are "virtually" open can be recognized
315 : * by having a non-null fileName field.
316 : *
317 : * example:
318 : *
319 : * /--less----\ /---------\
320 : * v \ v \
321 : * #0 --more---> LeastRecentlyUsed --more-\ \
322 : * ^\ | |
323 : * \\less--> MostRecentlyUsedFile <---/ |
324 : * \more---/ \--less--/
325 : *
326 : *--------------------
327 : */
328 : static void Delete(File file);
329 : static void LruDelete(File file);
330 : static void Insert(File file);
331 : static int LruInsert(File file);
332 : static bool ReleaseLruFile(void);
333 : static void ReleaseLruFiles(void);
334 : static File AllocateVfd(void);
335 : static void FreeVfd(File file);
336 :
337 : static int FileAccess(File file);
338 : static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
339 : static bool reserveAllocatedDesc(void);
340 : static int FreeDesc(AllocateDesc *desc);
341 :
342 : static void BeforeShmemExit_Files(int code, Datum arg);
343 : static void CleanupTempFiles(bool isCommit, bool isProcExit);
344 : static void RemovePgTempRelationFiles(const char *tsdirname);
345 : static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
346 :
347 : static void walkdir(const char *path,
348 : void (*action) (const char *fname, bool isdir, int elevel),
349 : bool process_symlinks,
350 : int elevel);
351 : #ifdef PG_FLUSH_DATA_WORKS
352 : static void pre_sync_fname(const char *fname, bool isdir, int elevel);
353 : #endif
354 : static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
355 : static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
356 :
357 : static int fsync_parent_path(const char *fname, int elevel);
358 :
359 :
/* ResourceOwner callbacks to hold virtual file descriptors */
static void ResOwnerReleaseFile(Datum res);
static char *ResOwnerPrintFile(Datum res);

/*
 * Resource-kind descriptor passed to ResourceOwnerRemember/Forget for every
 * File tracked by a resource owner.  It names the kind ("File"), places its
 * release in the RESOURCE_RELEASE_AFTER_LOCKS phase with priority
 * RELEASE_PRIO_FILES, and supplies the release and debug-print callbacks
 * declared above.
 */
static const ResourceOwnerDesc file_resowner_desc =
{
	.name = "File",
	.release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
	.release_priority = RELEASE_PRIO_FILES,
	.ReleaseResource = ResOwnerReleaseFile,
	.DebugPrint = ResOwnerPrintFile
};
372 :
373 : /* Convenience wrappers over ResourceOwnerRemember/Forget */
374 : static inline void
375 9284 : ResourceOwnerRememberFile(ResourceOwner owner, File file)
376 : {
377 9284 : ResourceOwnerRemember(owner, Int32GetDatum(file), &file_resowner_desc);
378 9284 : }
379 : static inline void
380 9276 : ResourceOwnerForgetFile(ResourceOwner owner, File file)
381 : {
382 9276 : ResourceOwnerForget(owner, Int32GetDatum(file), &file_resowner_desc);
383 9276 : }
384 :
/*
 * pg_fsync --- fsync a descriptor, with or without writethrough
 *
 * Dispatches to pg_fsync_writethrough() when the platform offers it and
 * wal_sync_method asks for it, and to pg_fsync_no_writethrough() otherwise.
 * The result of whichever routine was called is returned unchanged.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat statbuf;

	/*
	 * Portability check, assert-enabled builds only.  Operating systems
	 * differ in which access modes they require of a descriptor passed to
	 * fsync(), and the rules for directories are not the same as for plain
	 * files.  Anything that might be fsync'ed should therefore have been
	 * opened in a way that works everywhere, even if the current platform is
	 * more lenient.
	 *
	 * Concretely: a directory must have been opened O_RDONLY, and anything
	 * else must have been opened with write access (not O_RDONLY).  The
	 * check runs even when fsync itself is disabled.
	 *
	 * A failing fstat() is ignored here; the fsync call below will report
	 * whatever is wrong with the descriptor.
	 */
	if (fstat(fd, &statbuf) == 0)
	{
		int			access_mode = fcntl(fd, F_GETFL) & O_ACCMODE;

		Assert(S_ISDIR(statbuf.st_mode)
			   ? (access_mode == O_RDONLY)
			   : (access_mode != O_RDONLY));
	}
	errno = 0;
#endif

	/* the wal_sync_method test is compiled only where it can matter */
#if defined(HAVE_FSYNC_WRITETHROUGH)
	if (wal_sync_method == WAL_SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
434 :
435 :
436 : /*
437 : * pg_fsync_no_writethrough --- same as fsync except does nothing if
438 : * enableFsync is off
439 : */
440 : int
441 139010 : pg_fsync_no_writethrough(int fd)
442 : {
443 : int rc;
444 :
445 139010 : if (!enableFsync)
446 139010 : return 0;
447 :
448 0 : retry:
449 0 : rc = fsync(fd);
450 :
451 0 : if (rc == -1 && errno == EINTR)
452 0 : goto retry;
453 :
454 0 : return rc;
455 : }
456 :
457 : /*
458 : * pg_fsync_writethrough
459 : */
460 : int
461 0 : pg_fsync_writethrough(int fd)
462 : {
463 0 : if (enableFsync)
464 : {
465 : #if defined(F_FULLFSYNC)
466 : return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
467 : #else
468 0 : errno = ENOSYS;
469 0 : return -1;
470 : #endif
471 : }
472 : else
473 0 : return 0;
474 : }
475 :
476 : /*
477 : * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
478 : */
479 : int
480 0 : pg_fdatasync(int fd)
481 : {
482 : int rc;
483 :
484 0 : if (!enableFsync)
485 0 : return 0;
486 :
487 0 : retry:
488 0 : rc = fdatasync(fd);
489 :
490 0 : if (rc == -1 && errno == EINTR)
491 0 : goto retry;
492 :
493 0 : return rc;
494 : }
495 :
496 : /*
497 : * pg_file_exists -- check that a file exists.
498 : *
499 : * This requires an absolute path to the file. Returns true if the file is
500 : * not a directory, false otherwise.
501 : */
502 : bool
503 41876 : pg_file_exists(const char *name)
504 : {
505 : struct stat st;
506 :
507 : Assert(name != NULL);
508 :
509 41876 : if (stat(name, &st) == 0)
510 21892 : return !S_ISDIR(st.st_mode);
511 19984 : else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
512 0 : ereport(ERROR,
513 : (errcode_for_file_access(),
514 : errmsg("could not access file \"%s\": %m", name)));
515 :
516 19984 : return false;
517 : }
518 :
519 : /*
520 : * pg_flush_data --- advise OS that the described dirty data should be flushed
521 : *
522 : * offset of 0 with nbytes 0 means that the entire file should be flushed
523 : */
524 : void
525 77072 : pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes)
526 : {
527 : /*
528 : * Right now file flushing is primarily used to avoid making later
529 : * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
530 : * if fsyncs are disabled - that's a decision we might want to make
531 : * configurable at some point.
532 : */
533 77072 : if (!enableFsync)
534 77072 : return;
535 :
536 : /*
537 : * We compile all alternatives that are supported on the current platform,
538 : * to find portability problems more easily.
539 : */
540 : #if defined(HAVE_SYNC_FILE_RANGE)
541 : {
542 : int rc;
543 : static bool not_implemented_by_kernel = false;
544 :
545 0 : if (not_implemented_by_kernel)
546 0 : return;
547 :
548 0 : retry:
549 :
550 : /*
551 : * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
552 : * tells the OS that writeback for the specified blocks should be
553 : * started, but that we don't want to wait for completion. Note that
554 : * this call might block if too much dirty data exists in the range.
555 : * This is the preferable method on OSs supporting it, as it works
556 : * reliably when available (contrast to msync()) and doesn't flush out
557 : * clean data (like FADV_DONTNEED).
558 : */
559 0 : rc = sync_file_range(fd, offset, nbytes,
560 : SYNC_FILE_RANGE_WRITE);
561 0 : if (rc != 0)
562 : {
563 : int elevel;
564 :
565 0 : if (rc == EINTR)
566 0 : goto retry;
567 :
568 : /*
569 : * For systems that don't have an implementation of
570 : * sync_file_range() such as Windows WSL, generate only one
571 : * warning and then suppress all further attempts by this process.
572 : */
573 0 : if (errno == ENOSYS)
574 : {
575 0 : elevel = WARNING;
576 0 : not_implemented_by_kernel = true;
577 : }
578 : else
579 0 : elevel = data_sync_elevel(WARNING);
580 :
581 0 : ereport(elevel,
582 : (errcode_for_file_access(),
583 : errmsg("could not flush dirty data: %m")));
584 : }
585 :
586 0 : return;
587 : }
588 : #endif
589 : #if !defined(WIN32) && defined(MS_ASYNC)
590 : {
591 : void *p;
592 : static int pagesize = 0;
593 :
594 : /*
595 : * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
596 : * writeback. On linux it only does so if MS_SYNC is specified, but
597 : * then it does the writeback synchronously. Luckily all common linux
598 : * systems have sync_file_range(). This is preferable over
599 : * FADV_DONTNEED because it doesn't flush out clean data.
600 : *
601 : * We map the file (mmap()), tell the kernel to sync back the contents
602 : * (msync()), and then remove the mapping again (munmap()).
603 : */
604 :
605 : /* mmap() needs actual length if we want to map whole file */
606 : if (offset == 0 && nbytes == 0)
607 : {
608 : nbytes = lseek(fd, 0, SEEK_END);
609 : if (nbytes < 0)
610 : {
611 : ereport(WARNING,
612 : (errcode_for_file_access(),
613 : errmsg("could not determine dirty data size: %m")));
614 : return;
615 : }
616 : }
617 :
618 : /*
619 : * Some platforms reject partial-page mmap() attempts. To deal with
620 : * that, just truncate the request to a page boundary. If any extra
621 : * bytes don't get flushed, well, it's only a hint anyway.
622 : */
623 :
624 : /* fetch pagesize only once */
625 : if (pagesize == 0)
626 : pagesize = sysconf(_SC_PAGESIZE);
627 :
628 : /* align length to pagesize, dropping any fractional page */
629 : if (pagesize > 0)
630 : nbytes = (nbytes / pagesize) * pagesize;
631 :
632 : /* fractional-page request is a no-op */
633 : if (nbytes <= 0)
634 : return;
635 :
636 : /*
637 : * mmap could well fail, particularly on 32-bit platforms where there
638 : * may simply not be enough address space. If so, silently fall
639 : * through to the next implementation.
640 : */
641 : if (nbytes <= (pgoff_t) SSIZE_MAX)
642 : p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
643 : else
644 : p = MAP_FAILED;
645 :
646 : if (p != MAP_FAILED)
647 : {
648 : int rc;
649 :
650 : rc = msync(p, (size_t) nbytes, MS_ASYNC);
651 : if (rc != 0)
652 : {
653 : ereport(data_sync_elevel(WARNING),
654 : (errcode_for_file_access(),
655 : errmsg("could not flush dirty data: %m")));
656 : /* NB: need to fall through to munmap()! */
657 : }
658 :
659 : rc = munmap(p, (size_t) nbytes);
660 : if (rc != 0)
661 : {
662 : /* FATAL error because mapping would remain */
663 : ereport(FATAL,
664 : (errcode_for_file_access(),
665 : errmsg("could not munmap() while flushing data: %m")));
666 : }
667 :
668 : return;
669 : }
670 : }
671 : #endif
672 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
673 : {
674 : int rc;
675 :
676 : /*
677 : * Signal the kernel that the passed in range should not be cached
678 : * anymore. This has the, desired, side effect of writing out dirty
679 : * data, and the, undesired, side effect of likely discarding useful
680 : * clean cached blocks. For the latter reason this is the least
681 : * preferable method.
682 : */
683 :
684 : rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
685 :
686 : if (rc != 0)
687 : {
688 : /* don't error out, this is just a performance optimization */
689 : ereport(WARNING,
690 : (errcode_for_file_access(),
691 : errmsg("could not flush dirty data: %m")));
692 : }
693 :
694 : return;
695 : }
696 : #endif
697 : }
698 :
699 : /*
700 : * Truncate an open file to a given length.
701 : */
702 : static int
703 1138 : pg_ftruncate(int fd, pgoff_t length)
704 : {
705 : int ret;
706 :
707 1138 : retry:
708 1138 : ret = ftruncate(fd, length);
709 :
710 1138 : if (ret == -1 && errno == EINTR)
711 0 : goto retry;
712 :
713 1138 : return ret;
714 : }
715 :
716 : /*
717 : * Truncate a file to a given length by name.
718 : */
719 : int
720 453690 : pg_truncate(const char *path, pgoff_t length)
721 : {
722 : int ret;
723 : #ifdef WIN32
724 : int save_errno;
725 : int fd;
726 :
727 : fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
728 : if (fd >= 0)
729 : {
730 : ret = pg_ftruncate(fd, length);
731 : save_errno = errno;
732 : CloseTransientFile(fd);
733 : errno = save_errno;
734 : }
735 : else
736 : ret = -1;
737 : #else
738 :
739 453690 : retry:
740 453690 : ret = truncate(path, length);
741 :
742 453690 : if (ret == -1 && errno == EINTR)
743 0 : goto retry;
744 : #endif
745 :
746 453690 : return ret;
747 : }
748 :
749 : /*
750 : * fsync_fname -- fsync a file or directory, handling errors properly
751 : *
752 : * Try to fsync a file or directory. When doing the latter, ignore errors that
753 : * indicate the OS just doesn't allow/require fsyncing directories.
754 : */
755 : void
756 43926 : fsync_fname(const char *fname, bool isdir)
757 : {
758 43926 : fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
759 43926 : }
760 :
761 : /*
762 : * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
763 : *
764 : * This routine ensures that, after returning, the effect of renaming file
765 : * persists in case of a crash. A crash while this routine is running will
766 : * leave you with either the pre-existing or the moved file in place of the
767 : * new file; no mixed state or truncated files are possible.
768 : *
769 : * It does so by using fsync on the old filename and the possibly existing
770 : * target filename before the rename, and the target file and directory after.
771 : *
772 : * Note that rename() cannot be used across arbitrary directories, as they
773 : * might not be on the same filesystem. Therefore this routine does not
774 : * support renaming across directories.
775 : *
776 : * Log errors with the caller specified severity.
777 : *
778 : * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
779 : * valid upon return.
780 : */
781 : int
782 13138 : durable_rename(const char *oldfile, const char *newfile, int elevel)
783 : {
784 : int fd;
785 :
786 : /*
787 : * First fsync the old and target path (if it exists), to ensure that they
788 : * are properly persistent on disk. Syncing the target file is not
789 : * strictly necessary, but it makes it easier to reason about crashes;
790 : * because it's then guaranteed that either source or target file exists
791 : * after a crash.
792 : */
793 13138 : if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
794 0 : return -1;
795 :
796 13138 : fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
797 13138 : if (fd < 0)
798 : {
799 9138 : if (errno != ENOENT)
800 : {
801 0 : ereport(elevel,
802 : (errcode_for_file_access(),
803 : errmsg("could not open file \"%s\": %m", newfile)));
804 0 : return -1;
805 : }
806 : }
807 : else
808 : {
809 4000 : if (pg_fsync(fd) != 0)
810 : {
811 : int save_errno;
812 :
813 : /* close file upon error, might not be in transaction context */
814 0 : save_errno = errno;
815 0 : CloseTransientFile(fd);
816 0 : errno = save_errno;
817 :
818 0 : ereport(elevel,
819 : (errcode_for_file_access(),
820 : errmsg("could not fsync file \"%s\": %m", newfile)));
821 0 : return -1;
822 : }
823 :
824 4000 : if (CloseTransientFile(fd) != 0)
825 : {
826 0 : ereport(elevel,
827 : (errcode_for_file_access(),
828 : errmsg("could not close file \"%s\": %m", newfile)));
829 0 : return -1;
830 : }
831 : }
832 :
833 : /* Time to do the real deal... */
834 13138 : if (rename(oldfile, newfile) < 0)
835 : {
836 0 : ereport(elevel,
837 : (errcode_for_file_access(),
838 : errmsg("could not rename file \"%s\" to \"%s\": %m",
839 : oldfile, newfile)));
840 0 : return -1;
841 : }
842 :
843 : /*
844 : * To guarantee renaming the file is persistent, fsync the file with its
845 : * new name, and its containing directory.
846 : */
847 13138 : if (fsync_fname_ext(newfile, false, false, elevel) != 0)
848 0 : return -1;
849 :
850 13138 : if (fsync_parent_path(newfile, elevel) != 0)
851 0 : return -1;
852 :
853 13138 : return 0;
854 : }
855 :
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * Once this routine has returned successfully, the removal survives a crash;
 * a crash while it is running does not leave a mixed state behind.
 *
 * The file is unlinked first and its parent directory is fsynced afterwards.
 *
 * Problems are logged at the severity given by 'elevel'.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	int			rc = unlink(fname);

	if (rc < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/* the removal only becomes persistent once the directory is synced */
	return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}
892 :
893 : /*
894 : * InitFileAccess --- initialize this module during backend startup
895 : *
896 : * This is called during either normal or standalone backend start.
897 : * It is *not* called in the postmaster.
898 : *
899 : * Note that this does not initialize temporary file access, that is
900 : * separately initialized via InitTemporaryFileAccess().
901 : */
902 : void
903 45406 : InitFileAccess(void)
904 : {
905 : Assert(SizeVfdCache == 0); /* call me only once */
906 :
907 : /* initialize cache header entry */
908 45406 : VfdCache = (Vfd *) malloc(sizeof(Vfd));
909 45406 : if (VfdCache == NULL)
910 0 : ereport(FATAL,
911 : (errcode(ERRCODE_OUT_OF_MEMORY),
912 : errmsg("out of memory")));
913 :
914 363248 : MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
915 45406 : VfdCache->fd = VFD_CLOSED;
916 :
917 45406 : SizeVfdCache = 1;
918 45406 : }
919 :
920 : /*
921 : * InitTemporaryFileAccess --- initialize temporary file access during startup
922 : *
923 : * This is called during either normal or standalone backend start.
924 : * It is *not* called in the postmaster.
925 : *
926 : * This is separate from InitFileAccess() because temporary file cleanup can
927 : * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
928 : * our reporting has to happen before that. Low level file access should be
929 : * available for longer, hence the separate initialization / shutdown of
930 : * temporary file handling.
931 : */
932 : void
933 45406 : InitTemporaryFileAccess(void)
934 : {
935 : Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
936 : Assert(!temporary_files_allowed); /* call me only once */
937 :
938 : /*
939 : * Register before-shmem-exit hook to ensure temp files are dropped while
940 : * we can still report stats.
941 : */
942 45406 : before_shmem_exit(BeforeShmemExit_Files, 0);
943 :
944 : #ifdef USE_ASSERT_CHECKING
945 : temporary_files_allowed = true;
946 : #endif
947 45406 : }
948 :
949 : /*
950 : * count_usable_fds --- count how many FDs the system will let us open,
951 : * and estimate how many are already open.
952 : *
953 : * We stop counting if usable_fds reaches max_to_probe. Note: a small
954 : * value of max_to_probe might result in an underestimate of already_open;
955 : * we must fill in any "gaps" in the set of used FDs before the calculation
956 : * of already_open will give the right answer. In practice, max_to_probe
957 : * of a couple of dozen should be enough to ensure good results.
958 : *
959 : * We assume stderr (FD 2) is available for dup'ing. While the calling
960 : * script could theoretically close that, it would be a really bad idea,
961 : * since then one risks loss of error messages from, e.g., libc.
962 : */
963 : static void
964 2270 : count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
965 : {
966 : int *fd;
967 : int size;
968 2270 : int used = 0;
969 2270 : int highestfd = 0;
970 : int j;
971 :
972 : #ifdef HAVE_GETRLIMIT
973 : struct rlimit rlim;
974 : int getrlimit_status;
975 : #endif
976 :
977 2270 : size = 1024;
978 2270 : fd = (int *) palloc(size * sizeof(int));
979 :
980 : #ifdef HAVE_GETRLIMIT
981 2270 : getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
982 2270 : if (getrlimit_status != 0)
983 0 : ereport(WARNING, (errmsg("getrlimit failed: %m")));
984 : #endif /* HAVE_GETRLIMIT */
985 :
986 : /* dup until failure or probe limit reached */
987 : for (;;)
988 2267730 : {
989 : int thisfd;
990 :
991 : #ifdef HAVE_GETRLIMIT
992 :
993 : /*
994 : * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
995 : * some platforms
996 : */
997 2270000 : if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
998 0 : break;
999 : #endif
1000 :
1001 2270000 : thisfd = dup(2);
1002 2270000 : if (thisfd < 0)
1003 : {
1004 : /* Expect EMFILE or ENFILE, else it's fishy */
1005 0 : if (errno != EMFILE && errno != ENFILE)
1006 0 : elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1007 0 : break;
1008 : }
1009 :
1010 2270000 : if (used >= size)
1011 : {
1012 0 : size *= 2;
1013 0 : fd = (int *) repalloc(fd, size * sizeof(int));
1014 : }
1015 2270000 : fd[used++] = thisfd;
1016 :
1017 2270000 : if (highestfd < thisfd)
1018 2270000 : highestfd = thisfd;
1019 :
1020 2270000 : if (used >= max_to_probe)
1021 2270 : break;
1022 : }
1023 :
1024 : /* release the files we opened */
1025 2272270 : for (j = 0; j < used; j++)
1026 2270000 : close(fd[j]);
1027 :
1028 2270 : pfree(fd);
1029 :
1030 : /*
1031 : * Return results. usable_fds is just the number of successful dups. We
1032 : * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1033 : * number) and so already_open is highestfd+1 - usable_fds.
1034 : */
1035 2270 : *usable_fds = used;
1036 2270 : *already_open = highestfd + 1 - used;
1037 2270 : }
1038 :
1039 : /*
1040 : * set_max_safe_fds
1041 : * Determine number of file descriptors that fd.c is allowed to use
1042 : */
1043 : void
1044 2270 : set_max_safe_fds(void)
1045 : {
1046 : int usable_fds;
1047 : int already_open;
1048 :
1049 : /*----------
1050 : * We want to set max_safe_fds to
1051 : * MIN(usable_fds, max_files_per_process)
1052 : * less the slop factor for files that are opened without consulting
1053 : * fd.c. This ensures that we won't allow to open more than
1054 : * max_files_per_process, or the experimentally-determined EMFILE limit,
1055 : * additional files.
1056 : *----------
1057 : */
1058 2270 : count_usable_fds(max_files_per_process,
1059 : &usable_fds, &already_open);
1060 :
1061 2270 : max_safe_fds = Min(usable_fds, max_files_per_process);
1062 :
1063 : /*
1064 : * Take off the FDs reserved for system() etc.
1065 : */
1066 2270 : max_safe_fds -= NUM_RESERVED_FDS;
1067 :
1068 : /*
1069 : * Make sure we still have enough to get by.
1070 : */
1071 2270 : if (max_safe_fds < FD_MINFREE)
1072 0 : ereport(FATAL,
1073 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1074 : errmsg("insufficient file descriptors available to start server process"),
1075 : errdetail("System allows %d, server needs at least %d, %d files are already open.",
1076 : max_safe_fds + NUM_RESERVED_FDS,
1077 : FD_MINFREE + NUM_RESERVED_FDS,
1078 : already_open)));
1079 :
1080 2270 : elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1081 : max_safe_fds, usable_fds, already_open);
1082 2270 : }
1083 :
1084 : /*
1085 : * Open a file with BasicOpenFilePerm() and pass default file mode for the
1086 : * fileMode parameter.
1087 : */
1088 : int
1089 75728 : BasicOpenFile(const char *fileName, int fileFlags)
1090 : {
1091 75728 : return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1092 : }
1093 :
1094 : /*
1095 : * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1096 : *
1097 : * This is exported for use by places that really want a plain kernel FD,
1098 : * but need to be proof against running out of FDs. Once an FD has been
1099 : * successfully returned, it is the caller's responsibility to ensure that
1100 : * it will not be leaked on ereport()! Most users should *not* call this
1101 : * routine directly, but instead use the VFD abstraction level, which
1102 : * provides protection against descriptor leaks as well as management of
1103 : * files that need to be open for more than a short period of time.
1104 : *
1105 : * Ideally this should be the *only* direct call of open() in the backend.
1106 : * In practice, the postmaster calls open() directly, and there are some
1107 : * direct open() calls done early in backend startup. Those are OK since
1108 : * this module wouldn't have any open files to close at that point anyway.
1109 : */
1110 : int
1111 18568100 : BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1112 : {
1113 : int fd;
1114 :
1115 18568100 : tryAgain:
1116 : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1117 : fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1118 : #else
1119 18568100 : fd = open(fileName, fileFlags, fileMode);
1120 : #endif
1121 :
1122 18568100 : if (fd >= 0)
1123 : {
1124 : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1125 : if (fileFlags & PG_O_DIRECT)
1126 : {
1127 : if (fcntl(fd, F_NOCACHE, 1) < 0)
1128 : {
1129 : int save_errno = errno;
1130 :
1131 : close(fd);
1132 : errno = save_errno;
1133 : return -1;
1134 : }
1135 : }
1136 : #endif
1137 :
1138 17709970 : return fd; /* success! */
1139 : }
1140 :
1141 858130 : if (errno == EMFILE || errno == ENFILE)
1142 : {
1143 0 : int save_errno = errno;
1144 :
1145 0 : ereport(LOG,
1146 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1147 : errmsg("out of file descriptors: %m; release and retry")));
1148 0 : errno = 0;
1149 0 : if (ReleaseLruFile())
1150 0 : goto tryAgain;
1151 0 : errno = save_errno;
1152 : }
1153 :
1154 858130 : return -1; /* failure */
1155 : }
1156 :
1157 : /*
1158 : * AcquireExternalFD - attempt to reserve an external file descriptor
1159 : *
1160 : * This should be used by callers that need to hold a file descriptor open
1161 : * over more than a short interval, but cannot use any of the other facilities
1162 : * provided by this module.
1163 : *
1164 : * The difference between this and the underlying ReserveExternalFD function
1165 : * is that this will report failure (by setting errno and returning false)
1166 : * if "too many" external FDs are already reserved. This should be used in
1167 : * any code where the total number of FDs to be reserved is not predictable
1168 : * and small.
1169 : */
1170 : bool
1171 322616 : AcquireExternalFD(void)
1172 : {
1173 : /*
1174 : * We don't want more than max_safe_fds / 3 FDs to be consumed for
1175 : * "external" FDs.
1176 : */
1177 322616 : if (numExternalFDs < max_safe_fds / 3)
1178 : {
1179 322616 : ReserveExternalFD();
1180 322616 : return true;
1181 : }
1182 0 : errno = EMFILE;
1183 0 : return false;
1184 : }
1185 :
1186 : /*
1187 : * ReserveExternalFD - report external consumption of a file descriptor
1188 : *
1189 : * This should be used by callers that need to hold a file descriptor open
1190 : * over more than a short interval, but cannot use any of the other facilities
1191 : * provided by this module. This just tracks the use of the FD and closes
1192 : * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1193 : *
1194 : * Call this directly only in code where failure to reserve the FD would be
1195 : * fatal; for example, the WAL-writing code does so, since the alternative is
1196 : * session failure. Also, it's very unwise to do so in code that could
1197 : * consume more than one FD per process.
1198 : *
1199 : * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1200 : * available, it doesn't matter too much whether this is called before or
1201 : * after actually opening the FD; but doing so beforehand reduces the risk of
1202 : * an EMFILE failure if not everybody played nice. In any case, it's solely
1203 : * caller's responsibility to keep the external-FD count in sync with reality.
1204 : */
1205 : void
1206 483142 : ReserveExternalFD(void)
1207 : {
1208 : /*
1209 : * Release VFDs if needed to stay safe. Because we do this before
1210 : * incrementing numExternalFDs, the final state will be as desired, i.e.,
1211 : * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1212 : */
1213 483142 : ReleaseLruFiles();
1214 :
1215 483142 : numExternalFDs++;
1216 483142 : }
1217 :
1218 : /*
1219 : * ReleaseExternalFD - report release of an external file descriptor
1220 : *
1221 : * This is guaranteed not to change errno, so it can be used in failure paths.
1222 : */
1223 : void
1224 441690 : ReleaseExternalFD(void)
1225 : {
1226 : Assert(numExternalFDs > 0);
1227 441690 : numExternalFDs--;
1228 441690 : }
1229 :
1230 :
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the LRU ring as one line
 *
 * Starting from the header's lruLessRecently link, follows the
 * lruLessRecently links until the header (index 0) is reached again and
 * emits the indexes visited at LOG level.
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	size_t		len;
	int			cur = VfdCache[0].lruLessRecently;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);
	while (cur != 0)
	{
		/* step to the next less recently used entry and append it */
		cur = VfdCache[cur].lruLessRecently;
		len = strlen(buf);
		snprintf(buf + len, sizeof(buf) - len, "%d ", cur);
	}
	len = strlen(buf);
	snprintf(buf + len, sizeof(buf) - len, "LEAST");
	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1251 :
1252 : static void
1253 2791206 : Delete(File file)
1254 : {
1255 : Vfd *vfdP;
1256 :
1257 : Assert(file != 0);
1258 :
1259 : DO_DB(elog(LOG, "Delete %d (%s)",
1260 : file, VfdCache[file].fileName));
1261 : DO_DB(_dump_lru());
1262 :
1263 2791206 : vfdP = &VfdCache[file];
1264 :
1265 2791206 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1266 2791206 : VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1267 :
1268 : DO_DB(_dump_lru());
1269 2791206 : }
1270 :
1271 : static void
1272 5296 : LruDelete(File file)
1273 : {
1274 : Vfd *vfdP;
1275 :
1276 : Assert(file != 0);
1277 :
1278 : DO_DB(elog(LOG, "LruDelete %d (%s)",
1279 : file, VfdCache[file].fileName));
1280 :
1281 5296 : vfdP = &VfdCache[file];
1282 :
1283 5296 : pgaio_closing_fd(vfdP->fd);
1284 :
1285 : /*
1286 : * Close the file. We aren't expecting this to fail; if it does, better
1287 : * to leak the FD than to mess up our internal state.
1288 : */
1289 5296 : if (close(vfdP->fd) != 0)
1290 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1291 : "could not close file \"%s\": %m", vfdP->fileName);
1292 5296 : vfdP->fd = VFD_CLOSED;
1293 5296 : --nfile;
1294 :
1295 : /* delete the vfd record from the LRU ring */
1296 5296 : Delete(file);
1297 5296 : }
1298 :
1299 : static void
1300 3780288 : Insert(File file)
1301 : {
1302 : Vfd *vfdP;
1303 :
1304 : Assert(file != 0);
1305 :
1306 : DO_DB(elog(LOG, "Insert %d (%s)",
1307 : file, VfdCache[file].fileName));
1308 : DO_DB(_dump_lru());
1309 :
1310 3780288 : vfdP = &VfdCache[file];
1311 :
1312 3780288 : vfdP->lruMoreRecently = 0;
1313 3780288 : vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1314 3780288 : VfdCache[0].lruLessRecently = file;
1315 3780288 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1316 :
1317 : DO_DB(_dump_lru());
1318 3780288 : }
1319 :
1320 : /* returns 0 on success, -1 on re-open failure (with errno set) */
1321 : static int
1322 40 : LruInsert(File file)
1323 : {
1324 : Vfd *vfdP;
1325 :
1326 : Assert(file != 0);
1327 :
1328 : DO_DB(elog(LOG, "LruInsert %d (%s)",
1329 : file, VfdCache[file].fileName));
1330 :
1331 40 : vfdP = &VfdCache[file];
1332 :
1333 40 : if (FileIsNotOpen(file))
1334 : {
1335 : /* Close excess kernel FDs. */
1336 40 : ReleaseLruFiles();
1337 :
1338 : /*
1339 : * The open could still fail for lack of file descriptors, eg due to
1340 : * overall system file table being full. So, be prepared to release
1341 : * another FD if necessary...
1342 : */
1343 40 : vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1344 : vfdP->fileMode);
1345 40 : if (vfdP->fd < 0)
1346 : {
1347 : DO_DB(elog(LOG, "re-open failed: %m"));
1348 0 : return -1;
1349 : }
1350 : else
1351 : {
1352 40 : ++nfile;
1353 : }
1354 : }
1355 :
1356 : /*
1357 : * put it at the head of the Lru ring
1358 : */
1359 :
1360 40 : Insert(file);
1361 :
1362 40 : return 0;
1363 : }
1364 :
1365 : /*
1366 : * Release one kernel FD by closing the least-recently-used VFD.
1367 : */
1368 : static bool
1369 5034 : ReleaseLruFile(void)
1370 : {
1371 : DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1372 :
1373 5034 : if (nfile > 0)
1374 : {
1375 : /*
1376 : * There are opened files and so there should be at least one used vfd
1377 : * in the ring.
1378 : */
1379 : Assert(VfdCache[0].lruMoreRecently != 0);
1380 5034 : LruDelete(VfdCache[0].lruMoreRecently);
1381 5034 : return true; /* freed a file */
1382 : }
1383 0 : return false; /* no files available to free */
1384 : }
1385 :
1386 : /*
1387 : * Release kernel FDs as needed to get under the max_safe_fds limit.
1388 : * After calling this, it's OK to try to open another file.
1389 : */
1390 : static void
1391 19254916 : ReleaseLruFiles(void)
1392 : {
1393 19259950 : while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
1394 : {
1395 5034 : if (!ReleaseLruFile())
1396 0 : break;
1397 : }
1398 19254916 : }
1399 :
1400 : static File
1401 2979378 : AllocateVfd(void)
1402 : {
1403 : Index i;
1404 : File file;
1405 :
1406 : DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1407 :
1408 : Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1409 :
1410 2979378 : if (VfdCache[0].nextFree == 0)
1411 : {
1412 : /*
1413 : * The free list is empty so it is time to increase the size of the
1414 : * array. We choose to double it each time this happens. However,
1415 : * there's not much point in starting *real* small.
1416 : */
1417 54682 : Size newCacheSize = SizeVfdCache * 2;
1418 : Vfd *newVfdCache;
1419 :
1420 54682 : if (newCacheSize < 32)
1421 38642 : newCacheSize = 32;
1422 :
1423 : /*
1424 : * Be careful not to clobber VfdCache ptr if realloc fails.
1425 : */
1426 54682 : newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1427 54682 : if (newVfdCache == NULL)
1428 0 : ereport(ERROR,
1429 : (errcode(ERRCODE_OUT_OF_MEMORY),
1430 : errmsg("out of memory")));
1431 54682 : VfdCache = newVfdCache;
1432 :
1433 : /*
1434 : * Initialize the new entries and link them into the free list.
1435 : */
1436 2659624 : for (i = SizeVfdCache; i < newCacheSize; i++)
1437 : {
1438 20839536 : MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
1439 2604942 : VfdCache[i].nextFree = i + 1;
1440 2604942 : VfdCache[i].fd = VFD_CLOSED;
1441 : }
1442 54682 : VfdCache[newCacheSize - 1].nextFree = 0;
1443 54682 : VfdCache[0].nextFree = SizeVfdCache;
1444 :
1445 : /*
1446 : * Record the new size
1447 : */
1448 54682 : SizeVfdCache = newCacheSize;
1449 : }
1450 :
1451 2979378 : file = VfdCache[0].nextFree;
1452 :
1453 2979378 : VfdCache[0].nextFree = VfdCache[file].nextFree;
1454 :
1455 2979378 : return file;
1456 : }
1457 :
1458 : static void
1459 1985600 : FreeVfd(File file)
1460 : {
1461 1985600 : Vfd *vfdP = &VfdCache[file];
1462 :
1463 : DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1464 : file, vfdP->fileName ? vfdP->fileName : ""));
1465 :
1466 1985600 : if (vfdP->fileName != NULL)
1467 : {
1468 1141004 : free(vfdP->fileName);
1469 1141004 : vfdP->fileName = NULL;
1470 : }
1471 1985600 : vfdP->fdstate = 0x0;
1472 :
1473 1985600 : vfdP->nextFree = VfdCache[0].nextFree;
1474 1985600 : VfdCache[0].nextFree = file;
1475 1985600 : }
1476 :
1477 : /* returns 0 on success, -1 on re-open failure (with errno set) */
1478 : static int
1479 6307496 : FileAccess(File file)
1480 : {
1481 : int returnValue;
1482 :
1483 : DO_DB(elog(LOG, "FileAccess %d (%s)",
1484 : file, VfdCache[file].fileName));
1485 :
1486 : /*
1487 : * Is the file open? If not, open it and put it at the head of the LRU
1488 : * ring (possibly closing the least recently used file to get an FD).
1489 : */
1490 :
1491 6307496 : if (FileIsNotOpen(file))
1492 : {
1493 40 : returnValue = LruInsert(file);
1494 40 : if (returnValue != 0)
1495 0 : return returnValue;
1496 : }
1497 6307456 : else if (VfdCache[0].lruLessRecently != file)
1498 : {
1499 : /*
1500 : * We now know that the file is open and that it is not the last one
1501 : * accessed, so we need to move it to the head of the Lru ring.
1502 : */
1503 :
1504 1645466 : Delete(file);
1505 1645466 : Insert(file);
1506 : }
1507 :
1508 6307496 : return 0;
1509 : }
1510 :
1511 : /*
1512 : * Called whenever a temporary file is deleted to report its size.
1513 : */
1514 : static void
1515 5696 : ReportTemporaryFileUsage(const char *path, pgoff_t size)
1516 : {
1517 5696 : pgstat_report_tempfile(size);
1518 :
1519 5696 : if (log_temp_files >= 0)
1520 : {
1521 1806 : if ((size / 1024) >= log_temp_files)
1522 226 : ereport(LOG,
1523 : (errmsg("temporary file: path \"%s\", size %lu",
1524 : path, (unsigned long) size)));
1525 : }
1526 5696 : }
1527 :
1528 : /*
1529 : * Called to register a temporary file for automatic close.
1530 : * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1531 : * before the file was opened.
1532 : */
1533 : static void
1534 9284 : RegisterTemporaryFile(File file)
1535 : {
1536 9284 : ResourceOwnerRememberFile(CurrentResourceOwner, file);
1537 9284 : VfdCache[file].resowner = CurrentResourceOwner;
1538 :
1539 : /* Backup mechanism for closing at end of xact. */
1540 9284 : VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1541 9284 : have_xact_temporary_files = true;
1542 9284 : }
1543 :
/*
 * FileInvalidate --- called when a shared invalidation message arrives for
 *		some relation
 *
 * Closes the kernel FD of the given VFD if it currently has one.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	/* nothing to do unless the file is open */
	if (FileIsNotOpen(file))
		return;

	LruDelete(file);
}
#endif
1556 :
1557 : /*
1558 : * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1559 : * fileMode parameter.
1560 : */
1561 : File
1562 2979378 : PathNameOpenFile(const char *fileName, int fileFlags)
1563 : {
1564 2979378 : return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1565 : }
1566 :
1567 : /*
1568 : * open a file in an arbitrary directory
1569 : *
1570 : * NB: if the passed pathname is relative (which it usually is),
1571 : * it will be interpreted relative to the process' working directory
1572 : * (which should always be $PGDATA when this code is running).
1573 : */
1574 : File
1575 2979378 : PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1576 : {
1577 : char *fnamecopy;
1578 : File file;
1579 : Vfd *vfdP;
1580 :
1581 : DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1582 : fileName, fileFlags, fileMode));
1583 :
1584 : /*
1585 : * We need a malloc'd copy of the file name; fail cleanly if no room.
1586 : */
1587 2979378 : fnamecopy = strdup(fileName);
1588 2979378 : if (fnamecopy == NULL)
1589 0 : ereport(ERROR,
1590 : (errcode(ERRCODE_OUT_OF_MEMORY),
1591 : errmsg("out of memory")));
1592 :
1593 2979378 : file = AllocateVfd();
1594 2979378 : vfdP = &VfdCache[file];
1595 :
1596 : /* Close excess kernel FDs. */
1597 2979378 : ReleaseLruFiles();
1598 :
1599 : /*
1600 : * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1601 : * client shouldn't be expected to know which kernel descriptors are
1602 : * currently open, so it wouldn't make sense for them to be inherited by
1603 : * executed subprograms.
1604 : */
1605 2979378 : fileFlags |= O_CLOEXEC;
1606 :
1607 2979378 : vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1608 :
1609 2979378 : if (vfdP->fd < 0)
1610 : {
1611 844596 : int save_errno = errno;
1612 :
1613 844596 : FreeVfd(file);
1614 844596 : free(fnamecopy);
1615 844596 : errno = save_errno;
1616 844596 : return -1;
1617 : }
1618 2134782 : ++nfile;
1619 : DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1620 : vfdP->fd));
1621 :
1622 2134782 : vfdP->fileName = fnamecopy;
1623 : /* Saved flags are adjusted to be OK for re-opening file */
1624 2134782 : vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1625 2134782 : vfdP->fileMode = fileMode;
1626 2134782 : vfdP->fileSize = 0;
1627 2134782 : vfdP->fdstate = 0x0;
1628 2134782 : vfdP->resowner = NULL;
1629 :
1630 2134782 : Insert(file);
1631 :
1632 2134782 : return file;
1633 : }
1634 :
1635 : /*
1636 : * Create directory 'directory'. If necessary, create 'basedir', which must
1637 : * be the directory above it. This is designed for creating the top-level
1638 : * temporary directory on demand before creating a directory underneath it.
1639 : * Do nothing if the directory already exists.
1640 : *
1641 : * Directories created within the top-level temporary directory should begin
1642 : * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1643 : * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1644 : * that do not need any particular prefix.
1645 : */
1646 : void
1647 392 : PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1648 : {
1649 392 : if (MakePGDirectory(directory) < 0)
1650 : {
1651 32 : if (errno == EEXIST)
1652 10 : return;
1653 :
1654 : /*
1655 : * Failed. Try to create basedir first in case it's missing. Tolerate
1656 : * EEXIST to close a race against another process following the same
1657 : * algorithm.
1658 : */
1659 22 : if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1660 0 : ereport(ERROR,
1661 : (errcode_for_file_access(),
1662 : errmsg("cannot create temporary directory \"%s\": %m",
1663 : basedir)));
1664 :
1665 : /* Try again. */
1666 22 : if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1667 0 : ereport(ERROR,
1668 : (errcode_for_file_access(),
1669 : errmsg("cannot create temporary subdirectory \"%s\": %m",
1670 : directory)));
1671 : }
1672 : }
1673 :
1674 : /*
1675 : * Delete a directory and everything in it, if it exists.
1676 : */
1677 : void
1678 468 : PathNameDeleteTemporaryDir(const char *dirname)
1679 : {
1680 : struct stat statbuf;
1681 :
1682 : /* Silently ignore missing directory. */
1683 468 : if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1684 86 : return;
1685 :
1686 : /*
1687 : * Currently, walkdir doesn't offer a way for our passed in function to
1688 : * maintain state. Perhaps it should, so that we could tell the caller
1689 : * whether this operation succeeded or failed. Since this operation is
1690 : * used in a cleanup path, we wouldn't actually behave differently: we'll
1691 : * just log failures.
1692 : */
1693 382 : walkdir(dirname, unlink_if_exists_fname, false, LOG);
1694 : }
1695 :
1696 : /*
1697 : * Open a temporary file that will disappear when we close it.
1698 : *
1699 : * This routine takes care of generating an appropriate tempfile name.
1700 : * There's no need to pass in fileFlags or fileMode either, since only
1701 : * one setting makes any sense for a temp file.
1702 : *
1703 : * Unless interXact is true, the file is remembered by CurrentResourceOwner
1704 : * to ensure it's closed and deleted when it's no longer needed, typically at
1705 : * the end-of-transaction. In most cases, you don't want temporary files to
1706 : * outlive the transaction that created them, so this should be false -- but
1707 : * if you need "somewhat" temporary storage, this might be useful. In either
1708 : * case, the file is removed when the File is explicitly closed.
1709 : */
1710 : File
1711 2986 : OpenTemporaryFile(bool interXact)
1712 : {
1713 2986 : File file = 0;
1714 :
1715 : Assert(temporary_files_allowed); /* check temp file access is up */
1716 :
1717 : /*
1718 : * Make sure the current resource owner has space for this File before we
1719 : * open it, if we'll be registering it below.
1720 : */
1721 2986 : if (!interXact)
1722 2986 : ResourceOwnerEnlarge(CurrentResourceOwner);
1723 :
1724 : /*
1725 : * If some temp tablespace(s) have been given to us, try to use the next
1726 : * one. If a given tablespace can't be found, we silently fall back to
1727 : * the database's default tablespace.
1728 : *
1729 : * BUT: if the temp file is slated to outlive the current transaction,
1730 : * force it into the database's default tablespace, so that it will not
1731 : * pose a threat to possible tablespace drop attempts.
1732 : */
1733 2986 : if (numTempTableSpaces > 0 && !interXact)
1734 : {
1735 2 : Oid tblspcOid = GetNextTempTableSpace();
1736 :
1737 2 : if (OidIsValid(tblspcOid))
1738 2 : file = OpenTemporaryFileInTablespace(tblspcOid, false);
1739 : }
1740 :
1741 : /*
1742 : * If not, or if tablespace is bad, create in database's default
1743 : * tablespace. MyDatabaseTableSpace should normally be set before we get
1744 : * here, but just in case it isn't, fall back to pg_default tablespace.
1745 : */
1746 2986 : if (file <= 0)
1747 2984 : file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1748 : MyDatabaseTableSpace :
1749 : DEFAULTTABLESPACE_OID,
1750 : true);
1751 :
1752 : /* Mark it for deletion at close and temporary file size limit */
1753 2986 : VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1754 :
1755 : /* Register it with the current resource owner */
1756 2986 : if (!interXact)
1757 2986 : RegisterTemporaryFile(file);
1758 :
1759 2986 : return file;
1760 : }
1761 :
1762 : /*
1763 : * Return the path of the temp directory in a given tablespace.
1764 : */
1765 : void
1766 18162 : TempTablespacePath(char *path, Oid tablespace)
1767 : {
1768 : /*
1769 : * Identify the tempfile directory for this tablespace.
1770 : *
1771 : * If someone tries to specify pg_global, use pg_default instead.
1772 : */
1773 18162 : if (tablespace == InvalidOid ||
1774 2 : tablespace == DEFAULTTABLESPACE_OID ||
1775 : tablespace == GLOBALTABLESPACE_OID)
1776 18160 : snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1777 : else
1778 : {
1779 : /* All other tablespaces are accessed via symlinks */
1780 2 : snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1781 : PG_TBLSPC_DIR, tablespace, TABLESPACE_VERSION_DIRECTORY,
1782 : PG_TEMP_FILES_DIR);
1783 : }
1784 18162 : }
1785 :
1786 : /*
1787 : * Open a temporary file in a specific tablespace.
1788 : * Subroutine for OpenTemporaryFile, which see for details.
1789 : */
1790 : static File
1791 2986 : OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1792 : {
1793 : char tempdirpath[MAXPGPATH];
1794 : char tempfilepath[MAXPGPATH];
1795 : File file;
1796 :
1797 2986 : TempTablespacePath(tempdirpath, tblspcOid);
1798 :
1799 : /*
1800 : * Generate a tempfile name that should be unique within the current
1801 : * database instance.
1802 : */
1803 2986 : snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1804 : tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1805 :
1806 : /*
1807 : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1808 : * temp file that can be reused.
1809 : */
1810 2986 : file = PathNameOpenFile(tempfilepath,
1811 : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1812 2986 : if (file <= 0)
1813 : {
1814 : /*
1815 : * We might need to create the tablespace's tempfile directory, if no
1816 : * one has yet done so.
1817 : *
1818 : * Don't check for an error from MakePGDirectory; it could fail if
1819 : * someone else just did the same thing. If it doesn't work then
1820 : * we'll bomb out on the second create attempt, instead.
1821 : */
1822 200 : (void) MakePGDirectory(tempdirpath);
1823 :
1824 200 : file = PathNameOpenFile(tempfilepath,
1825 : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1826 200 : if (file <= 0 && rejectError)
1827 0 : elog(ERROR, "could not create temporary file \"%s\": %m",
1828 : tempfilepath);
1829 : }
1830 :
1831 2986 : return file;
1832 : }
1833 :
1834 :
1835 : /*
1836 : * Create a new file. The directory containing it must already exist. Files
1837 : * created this way are subject to temp_file_limit and are automatically
1838 : * closed at end of transaction, but are not automatically deleted on close
1839 : * because they are intended to be shared between cooperating backends.
1840 : *
1841 : * If the file is inside the top-level temporary directory, its name should
1842 : * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1843 : * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1844 : * inside a directory created with PathNameCreateTemporaryDir(), in which case
1845 : * the prefix isn't needed.
1846 : */
1847 : File
1848 3102 : PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1849 : {
1850 : File file;
1851 :
1852 : Assert(temporary_files_allowed); /* check temp file access is up */
1853 :
1854 3102 : ResourceOwnerEnlarge(CurrentResourceOwner);
1855 :
1856 : /*
1857 : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1858 : * temp file that can be reused.
1859 : */
1860 3102 : file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1861 3102 : if (file <= 0)
1862 : {
1863 392 : if (error_on_failure)
1864 0 : ereport(ERROR,
1865 : (errcode_for_file_access(),
1866 : errmsg("could not create temporary file \"%s\": %m",
1867 : path)));
1868 : else
1869 392 : return file;
1870 : }
1871 :
1872 : /* Mark it for temp_file_limit accounting. */
1873 2710 : VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1874 :
1875 : /* Register it for automatic close. */
1876 2710 : RegisterTemporaryFile(file);
1877 :
1878 2710 : return file;
1879 : }
1880 :
1881 : /*
1882 : * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1883 : * another backend. Files opened this way don't count against the
1884 : * temp_file_limit of the caller, are automatically closed at the end of the
1885 : * transaction but are not deleted on close.
1886 : */
1887 : File
1888 7722 : PathNameOpenTemporaryFile(const char *path, int mode)
1889 : {
1890 : File file;
1891 :
1892 : Assert(temporary_files_allowed); /* check temp file access is up */
1893 :
1894 7722 : ResourceOwnerEnlarge(CurrentResourceOwner);
1895 :
1896 7722 : file = PathNameOpenFile(path, mode | PG_BINARY);
1897 :
1898 : /* If no such file, then we don't raise an error. */
1899 7722 : if (file <= 0 && errno != ENOENT)
1900 0 : ereport(ERROR,
1901 : (errcode_for_file_access(),
1902 : errmsg("could not open temporary file \"%s\": %m",
1903 : path)));
1904 :
1905 7722 : if (file > 0)
1906 : {
1907 : /* Register it for automatic close. */
1908 3588 : RegisterTemporaryFile(file);
1909 : }
1910 :
1911 7722 : return file;
1912 : }
1913 :
1914 : /*
1915 : * Delete a file by pathname. Return true if the file existed, false if
1916 : * didn't.
1917 : */
1918 : bool
1919 6124 : PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1920 : {
1921 : struct stat filestats;
1922 : int stat_errno;
1923 :
1924 : /* Get the final size for pgstat reporting. */
1925 6124 : if (stat(path, &filestats) != 0)
1926 3414 : stat_errno = errno;
1927 : else
1928 2710 : stat_errno = 0;
1929 :
1930 : /*
1931 : * Unlike FileClose's automatic file deletion code, we tolerate
1932 : * non-existence to support BufFileDeleteFileSet which doesn't know how
1933 : * many segments it has to delete until it runs out.
1934 : */
1935 6124 : if (stat_errno == ENOENT)
1936 3414 : return false;
1937 :
1938 2710 : if (unlink(path) < 0)
1939 : {
1940 0 : if (errno != ENOENT)
1941 0 : ereport(error_on_failure ? ERROR : LOG,
1942 : (errcode_for_file_access(),
1943 : errmsg("could not unlink temporary file \"%s\": %m",
1944 : path)));
1945 0 : return false;
1946 : }
1947 :
1948 2710 : if (stat_errno == 0)
1949 2710 : ReportTemporaryFileUsage(path, filestats.st_size);
1950 : else
1951 : {
1952 0 : errno = stat_errno;
1953 0 : ereport(LOG,
1954 : (errcode_for_file_access(),
1955 : errmsg("could not stat file \"%s\": %m", path)));
1956 : }
1957 :
1958 2710 : return true;
1959 : }
1960 :
1961 : /*
1962 : * close a file when done with it
1963 : */
1964 : void
1965 1141004 : FileClose(File file)
1966 : {
1967 : Vfd *vfdP;
1968 :
1969 : Assert(FileIsValid(file));
1970 :
1971 : DO_DB(elog(LOG, "FileClose: %d (%s)",
1972 : file, VfdCache[file].fileName));
1973 :
1974 1141004 : vfdP = &VfdCache[file];
1975 :
1976 1141004 : if (!FileIsNotOpen(file))
1977 : {
1978 1140444 : pgaio_closing_fd(vfdP->fd);
1979 :
1980 : /* close the file */
1981 1140444 : if (close(vfdP->fd) != 0)
1982 : {
1983 : /*
1984 : * We may need to panic on failure to close non-temporary files;
1985 : * see LruDelete.
1986 : */
1987 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1988 : "could not close file \"%s\": %m", vfdP->fileName);
1989 : }
1990 :
1991 1140444 : --nfile;
1992 1140444 : vfdP->fd = VFD_CLOSED;
1993 :
1994 : /* remove the file from the lru ring */
1995 1140444 : Delete(file);
1996 : }
1997 :
1998 1141004 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1999 : {
2000 : /* Subtract its size from current usage (do first in case of error) */
2001 5696 : temporary_files_size -= vfdP->fileSize;
2002 5696 : vfdP->fileSize = 0;
2003 : }
2004 :
2005 : /*
2006 : * Delete the file if it was temporary, and make a log entry if wanted
2007 : */
2008 1141004 : if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2009 : {
2010 : struct stat filestats;
2011 : int stat_errno;
2012 :
2013 : /*
2014 : * If we get an error, as could happen within the ereport/elog calls,
2015 : * we'll come right back here during transaction abort. Reset the
2016 : * flag to ensure that we can't get into an infinite loop. This code
2017 : * is arranged to ensure that the worst-case consequence is failing to
2018 : * emit log message(s), not failing to attempt the unlink.
2019 : */
2020 2986 : vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2021 :
2022 :
2023 : /* first try the stat() */
2024 2986 : if (stat(vfdP->fileName, &filestats))
2025 0 : stat_errno = errno;
2026 : else
2027 2986 : stat_errno = 0;
2028 :
2029 : /* in any case do the unlink */
2030 2986 : if (unlink(vfdP->fileName))
2031 0 : ereport(LOG,
2032 : (errcode_for_file_access(),
2033 : errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2034 :
2035 : /* and last report the stat results */
2036 2986 : if (stat_errno == 0)
2037 2986 : ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2038 : else
2039 : {
2040 0 : errno = stat_errno;
2041 0 : ereport(LOG,
2042 : (errcode_for_file_access(),
2043 : errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2044 : }
2045 : }
2046 :
2047 : /* Unregister it from the resource owner */
2048 1141004 : if (vfdP->resowner)
2049 9276 : ResourceOwnerForgetFile(vfdP->resowner, file);
2050 :
2051 : /*
2052 : * Return the Vfd slot to the free list
2053 : */
2054 1141004 : FreeVfd(file);
2055 1141004 : }
2056 :
2057 : /*
2058 : * FilePrefetch - initiate asynchronous read of a given range of the file.
2059 : *
2060 : * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2061 : *
2062 : * posix_fadvise() is the simplest standardized interface that accomplishes
2063 : * this.
2064 : */
2065 : int
2066 17496 : FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2067 : {
2068 : Assert(FileIsValid(file));
2069 :
2070 : DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2071 : file, VfdCache[file].fileName,
2072 : (int64) offset, (int64) amount));
2073 :
2074 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2075 : {
2076 : int returnCode;
2077 :
2078 17496 : returnCode = FileAccess(file);
2079 17496 : if (returnCode < 0)
2080 0 : return returnCode;
2081 :
2082 17496 : retry:
2083 17496 : pgstat_report_wait_start(wait_event_info);
2084 17496 : returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2085 : POSIX_FADV_WILLNEED);
2086 17496 : pgstat_report_wait_end();
2087 :
2088 17496 : if (returnCode == EINTR)
2089 0 : goto retry;
2090 :
2091 17496 : return returnCode;
2092 : }
2093 : #elif defined(__darwin__)
2094 : {
2095 : struct radvisory
2096 : {
2097 : off_t ra_offset; /* offset into the file */
2098 : int ra_count; /* size of the read */
2099 : } ra;
2100 : int returnCode;
2101 :
2102 : returnCode = FileAccess(file);
2103 : if (returnCode < 0)
2104 : return returnCode;
2105 :
2106 : ra.ra_offset = offset;
2107 : ra.ra_count = amount;
2108 : pgstat_report_wait_start(wait_event_info);
2109 : returnCode = fcntl(VfdCache[file].fd, F_RDADVISE, &ra);
2110 : pgstat_report_wait_end();
2111 : if (returnCode != -1)
2112 : return 0;
2113 : else
2114 : return errno;
2115 : }
2116 : #else
2117 : return 0;
2118 : #endif
2119 : }
2120 :
2121 : void
2122 0 : FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
2123 : {
2124 : int returnCode;
2125 :
2126 : Assert(FileIsValid(file));
2127 :
2128 : DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2129 : file, VfdCache[file].fileName,
2130 : (int64) offset, (int64) nbytes));
2131 :
2132 0 : if (nbytes <= 0)
2133 0 : return;
2134 :
2135 0 : if (VfdCache[file].fileFlags & PG_O_DIRECT)
2136 0 : return;
2137 :
2138 0 : returnCode = FileAccess(file);
2139 0 : if (returnCode < 0)
2140 0 : return;
2141 :
2142 0 : pgstat_report_wait_start(wait_event_info);
2143 0 : pg_flush_data(VfdCache[file].fd, offset, nbytes);
2144 0 : pgstat_report_wait_end();
2145 : }
2146 :
2147 : ssize_t
2148 814822 : FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2149 : uint32 wait_event_info)
2150 : {
2151 : ssize_t returnCode;
2152 : Vfd *vfdP;
2153 :
2154 : Assert(FileIsValid(file));
2155 :
2156 : DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2157 : file, VfdCache[file].fileName,
2158 : (int64) offset,
2159 : iovcnt));
2160 :
2161 814822 : returnCode = FileAccess(file);
2162 814822 : if (returnCode < 0)
2163 0 : return returnCode;
2164 :
2165 814822 : vfdP = &VfdCache[file];
2166 :
2167 814822 : retry:
2168 814822 : pgstat_report_wait_start(wait_event_info);
2169 814822 : returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2170 814822 : pgstat_report_wait_end();
2171 :
2172 814822 : if (returnCode < 0)
2173 : {
2174 : /*
2175 : * Windows may run out of kernel buffers and return "Insufficient
2176 : * system resources" error. Wait a bit and retry to solve it.
2177 : *
2178 : * It is rumored that EINTR is also possible on some Unix filesystems,
2179 : * in which case immediate retry is indicated.
2180 : */
2181 : #ifdef WIN32
2182 : DWORD error = GetLastError();
2183 :
2184 : switch (error)
2185 : {
2186 : case ERROR_NO_SYSTEM_RESOURCES:
2187 : pg_usleep(1000L);
2188 : errno = EINTR;
2189 : break;
2190 : default:
2191 : _dosmaperr(error);
2192 : break;
2193 : }
2194 : #endif
2195 : /* OK to retry if interrupted */
2196 0 : if (errno == EINTR)
2197 0 : goto retry;
2198 : }
2199 :
2200 814822 : return returnCode;
2201 : }
2202 :
2203 : int
2204 2593360 : FileStartReadV(PgAioHandle *ioh, File file,
2205 : int iovcnt, pgoff_t offset,
2206 : uint32 wait_event_info)
2207 : {
2208 : int returnCode;
2209 : Vfd *vfdP;
2210 :
2211 : Assert(FileIsValid(file));
2212 :
2213 : DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
2214 : file, VfdCache[file].fileName,
2215 : (int64) offset,
2216 : iovcnt));
2217 :
2218 2593360 : returnCode = FileAccess(file);
2219 2593360 : if (returnCode < 0)
2220 0 : return returnCode;
2221 :
2222 2593360 : vfdP = &VfdCache[file];
2223 :
2224 2593360 : pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
2225 :
2226 2593360 : return 0;
2227 : }
2228 :
2229 : ssize_t
2230 1529168 : FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2231 : uint32 wait_event_info)
2232 : {
2233 : ssize_t returnCode;
2234 : Vfd *vfdP;
2235 :
2236 : Assert(FileIsValid(file));
2237 :
2238 : DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2239 : file, VfdCache[file].fileName,
2240 : (int64) offset,
2241 : iovcnt));
2242 :
2243 1529168 : returnCode = FileAccess(file);
2244 1529168 : if (returnCode < 0)
2245 0 : return returnCode;
2246 :
2247 1529168 : vfdP = &VfdCache[file];
2248 :
2249 : /*
2250 : * If enforcing temp_file_limit and it's a temp file, check to see if the
2251 : * write would overrun temp_file_limit, and throw error if so. Note: it's
2252 : * really a modularity violation to throw error here; we should set errno
2253 : * and return -1. However, there's no way to report a suitable error
2254 : * message if we do that. All current callers would just throw error
2255 : * immediately anyway, so this is safe at present.
2256 : */
2257 1529168 : if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2258 : {
2259 0 : pgoff_t past_write = offset;
2260 :
2261 0 : for (int i = 0; i < iovcnt; ++i)
2262 0 : past_write += iov[i].iov_len;
2263 :
2264 0 : if (past_write > vfdP->fileSize)
2265 : {
2266 0 : uint64 newTotal = temporary_files_size;
2267 :
2268 0 : newTotal += past_write - vfdP->fileSize;
2269 0 : if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2270 0 : ereport(ERROR,
2271 : (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2272 : errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2273 : temp_file_limit)));
2274 : }
2275 : }
2276 :
2277 1529168 : retry:
2278 1529168 : pgstat_report_wait_start(wait_event_info);
2279 1529168 : returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2280 1529168 : pgstat_report_wait_end();
2281 :
2282 1529168 : if (returnCode >= 0)
2283 : {
2284 : /*
2285 : * Some callers expect short writes to set errno, and traditionally we
2286 : * have assumed that they imply disk space shortage. We don't want to
2287 : * waste CPU cycles adding up the total size here, so we'll just set
2288 : * it for all successful writes in case such a caller determines that
2289 : * the write was short and ereports "%m".
2290 : */
2291 1529168 : errno = ENOSPC;
2292 :
2293 : /*
2294 : * Maintain fileSize and temporary_files_size if it's a temp file.
2295 : */
2296 1529168 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2297 : {
2298 108992 : pgoff_t past_write = offset + returnCode;
2299 :
2300 108992 : if (past_write > vfdP->fileSize)
2301 : {
2302 75590 : temporary_files_size += past_write - vfdP->fileSize;
2303 75590 : vfdP->fileSize = past_write;
2304 : }
2305 : }
2306 : }
2307 : else
2308 : {
2309 : /*
2310 : * See comments in FileReadV()
2311 : */
2312 : #ifdef WIN32
2313 : DWORD error = GetLastError();
2314 :
2315 : switch (error)
2316 : {
2317 : case ERROR_NO_SYSTEM_RESOURCES:
2318 : pg_usleep(1000L);
2319 : errno = EINTR;
2320 : break;
2321 : default:
2322 : _dosmaperr(error);
2323 : break;
2324 : }
2325 : #endif
2326 : /* OK to retry if interrupted */
2327 0 : if (errno == EINTR)
2328 0 : goto retry;
2329 : }
2330 :
2331 1529168 : return returnCode;
2332 : }
2333 :
2334 : int
2335 4420 : FileSync(File file, uint32 wait_event_info)
2336 : {
2337 : int returnCode;
2338 :
2339 : Assert(FileIsValid(file));
2340 :
2341 : DO_DB(elog(LOG, "FileSync: %d (%s)",
2342 : file, VfdCache[file].fileName));
2343 :
2344 4420 : returnCode = FileAccess(file);
2345 4420 : if (returnCode < 0)
2346 0 : return returnCode;
2347 :
2348 4420 : pgstat_report_wait_start(wait_event_info);
2349 4420 : returnCode = pg_fsync(VfdCache[file].fd);
2350 4420 : pgstat_report_wait_end();
2351 :
2352 4420 : return returnCode;
2353 : }
2354 :
2355 : /*
2356 : * Zero a region of the file.
2357 : *
2358 : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2359 : * appropriate error.
2360 : */
2361 : int
2362 429112 : FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2363 : {
2364 : int returnCode;
2365 : ssize_t written;
2366 :
2367 : Assert(FileIsValid(file));
2368 :
2369 : DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2370 : file, VfdCache[file].fileName,
2371 : (int64) offset, (int64) amount));
2372 :
2373 429112 : returnCode = FileAccess(file);
2374 429112 : if (returnCode < 0)
2375 0 : return returnCode;
2376 :
2377 429112 : pgstat_report_wait_start(wait_event_info);
2378 429112 : written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2379 429112 : pgstat_report_wait_end();
2380 :
2381 429112 : if (written < 0)
2382 0 : return -1;
2383 429112 : else if (written != amount)
2384 : {
2385 : /* if errno is unset, assume problem is no disk space */
2386 0 : if (errno == 0)
2387 0 : errno = ENOSPC;
2388 0 : return -1;
2389 : }
2390 :
2391 429112 : return 0;
2392 : }
2393 :
2394 : /*
2395 : * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2396 : * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2397 : * use FileZero() instead.
2398 : *
2399 : * Note that at least glibc() implements posix_fallocate() in userspace if not
2400 : * implemented by the filesystem. That's not the case for all environments
2401 : * though.
2402 : *
2403 : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2404 : * appropriate error.
2405 : */
2406 : int
2407 1018 : FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2408 : {
2409 : #ifdef HAVE_POSIX_FALLOCATE
2410 : int returnCode;
2411 :
2412 : Assert(FileIsValid(file));
2413 :
2414 : DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2415 : file, VfdCache[file].fileName,
2416 : (int64) offset, (int64) amount));
2417 :
2418 1018 : returnCode = FileAccess(file);
2419 1018 : if (returnCode < 0)
2420 0 : return -1;
2421 :
2422 1018 : retry:
2423 1018 : pgstat_report_wait_start(wait_event_info);
2424 1018 : returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2425 1018 : pgstat_report_wait_end();
2426 :
2427 1018 : if (returnCode == 0)
2428 1018 : return 0;
2429 0 : else if (returnCode == EINTR)
2430 0 : goto retry;
2431 :
2432 : /* for compatibility with %m printing etc */
2433 0 : errno = returnCode;
2434 :
2435 : /*
2436 : * Return in cases of a "real" failure, if fallocate is not supported,
2437 : * fall through to the FileZero() backed implementation.
2438 : */
2439 0 : if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
2440 0 : return -1;
2441 : #endif
2442 :
2443 0 : return FileZero(file, offset, amount, wait_event_info);
2444 : }
2445 :
2446 : pgoff_t
2447 4756354 : FileSize(File file)
2448 : {
2449 : Assert(FileIsValid(file));
2450 :
2451 : DO_DB(elog(LOG, "FileSize %d (%s)",
2452 : file, VfdCache[file].fileName));
2453 :
2454 4756354 : if (FileIsNotOpen(file))
2455 : {
2456 24 : if (FileAccess(file) < 0)
2457 0 : return (pgoff_t) -1;
2458 : }
2459 :
2460 4756354 : return lseek(VfdCache[file].fd, 0, SEEK_END);
2461 : }
2462 :
2463 : int
2464 1138 : FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
2465 : {
2466 : int returnCode;
2467 :
2468 : Assert(FileIsValid(file));
2469 :
2470 : DO_DB(elog(LOG, "FileTruncate %d (%s)",
2471 : file, VfdCache[file].fileName));
2472 :
2473 1138 : returnCode = FileAccess(file);
2474 1138 : if (returnCode < 0)
2475 0 : return returnCode;
2476 :
2477 1138 : pgstat_report_wait_start(wait_event_info);
2478 1138 : returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2479 1138 : pgstat_report_wait_end();
2480 :
2481 1138 : if (returnCode == 0 && VfdCache[file].fileSize > offset)
2482 : {
2483 : /* adjust our state for truncation of a temp file */
2484 : Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2485 0 : temporary_files_size -= VfdCache[file].fileSize - offset;
2486 0 : VfdCache[file].fileSize = offset;
2487 : }
2488 :
2489 1138 : return returnCode;
2490 : }
2491 :
2492 : /*
2493 : * Return the pathname associated with an open file.
2494 : *
2495 : * The returned string points to an internal buffer, which is valid until
2496 : * the file is closed.
2497 : */
2498 : char *
2499 46 : FilePathName(File file)
2500 : {
2501 : Assert(FileIsValid(file));
2502 :
2503 46 : return VfdCache[file].fileName;
2504 : }
2505 :
2506 : /*
2507 : * Return the raw file descriptor of an opened file.
2508 : *
2509 : * The returned file descriptor will be valid until the file is closed, but
2510 : * there are a lot of things that can make that happen. So the caller should
2511 : * be careful not to do much of anything else before it finishes using the
2512 : * returned file descriptor.
2513 : */
2514 : int
2515 916938 : FileGetRawDesc(File file)
2516 : {
2517 : int returnCode;
2518 :
2519 916938 : returnCode = FileAccess(file);
2520 916938 : if (returnCode < 0)
2521 0 : return returnCode;
2522 :
2523 : Assert(FileIsValid(file));
2524 916938 : return VfdCache[file].fd;
2525 : }
2526 :
2527 : /*
2528 : * FileGetRawFlags - returns the file flags on open(2)
2529 : */
2530 : int
2531 0 : FileGetRawFlags(File file)
2532 : {
2533 : Assert(FileIsValid(file));
2534 0 : return VfdCache[file].fileFlags;
2535 : }
2536 :
2537 : /*
2538 : * FileGetRawMode - returns the mode bitmask passed to open(2)
2539 : */
2540 : mode_t
2541 0 : FileGetRawMode(File file)
2542 : {
2543 : Assert(FileIsValid(file));
2544 0 : return VfdCache[file].fileMode;
2545 : }
2546 :
2547 : /*
2548 : * Make room for another allocatedDescs[] array entry if needed and possible.
2549 : * Returns true if an array element is available.
2550 : */
2551 : static bool
2552 15792356 : reserveAllocatedDesc(void)
2553 : {
2554 : AllocateDesc *newDescs;
2555 : int newMax;
2556 :
2557 : /* Quick out if array already has a free slot. */
2558 15792356 : if (numAllocatedDescs < maxAllocatedDescs)
2559 15790050 : return true;
2560 :
2561 : /*
2562 : * If the array hasn't yet been created in the current process, initialize
2563 : * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2564 : * we will ever need, anyway. We don't want to look at max_safe_fds
2565 : * immediately because set_max_safe_fds() may not have run yet.
2566 : */
2567 2306 : if (allocatedDescs == NULL)
2568 : {
2569 2306 : newMax = FD_MINFREE / 3;
2570 2306 : newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2571 : /* Out of memory already? Treat as fatal error. */
2572 2306 : if (newDescs == NULL)
2573 0 : ereport(ERROR,
2574 : (errcode(ERRCODE_OUT_OF_MEMORY),
2575 : errmsg("out of memory")));
2576 2306 : allocatedDescs = newDescs;
2577 2306 : maxAllocatedDescs = newMax;
2578 2306 : return true;
2579 : }
2580 :
2581 : /*
2582 : * Consider enlarging the array beyond the initial allocation used above.
2583 : * By the time this happens, max_safe_fds should be known accurately.
2584 : *
2585 : * We mustn't let allocated descriptors hog all the available FDs, and in
2586 : * practice we'd better leave a reasonable number of FDs for VFD use. So
2587 : * set the maximum to max_safe_fds / 3. (This should certainly be at
2588 : * least as large as the initial size, FD_MINFREE / 3, so we aren't
2589 : * tightening the restriction here.) Recall that "external" FDs are
2590 : * allowed to consume another third of max_safe_fds.
2591 : */
2592 0 : newMax = max_safe_fds / 3;
2593 0 : if (newMax > maxAllocatedDescs)
2594 : {
2595 0 : newDescs = (AllocateDesc *) realloc(allocatedDescs,
2596 : newMax * sizeof(AllocateDesc));
2597 : /* Treat out-of-memory as a non-fatal error. */
2598 0 : if (newDescs == NULL)
2599 0 : return false;
2600 0 : allocatedDescs = newDescs;
2601 0 : maxAllocatedDescs = newMax;
2602 0 : return true;
2603 : }
2604 :
2605 : /* Can't enlarge allocatedDescs[] any more. */
2606 0 : return false;
2607 : }
2608 :
2609 : /*
2610 : * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2611 : * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2612 : * necessary to open the file. When done, call FreeFile rather than fclose.
2613 : *
2614 : * Note that files that will be open for any significant length of time
2615 : * should NOT be handled this way, since they cannot share kernel file
2616 : * descriptors with other files; there is grave risk of running out of FDs
2617 : * if anyone locks down too many FDs. Most callers of this routine are
2618 : * simply reading a config file that they will read and close immediately.
2619 : *
2620 : * fd.c will automatically close all files opened with AllocateFile at
2621 : * transaction commit or abort; this prevents FD leakage if a routine
2622 : * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2623 : *
2624 : * Ideally this should be the *only* direct call of fopen() in the backend.
2625 : */
2626 : FILE *
2627 184214 : AllocateFile(const char *name, const char *mode)
2628 : {
2629 : FILE *file;
2630 :
2631 : DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2632 : numAllocatedDescs, name));
2633 :
2634 : /* Can we allocate another non-virtual FD? */
2635 184214 : if (!reserveAllocatedDesc())
2636 0 : ereport(ERROR,
2637 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2638 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2639 : maxAllocatedDescs, name)));
2640 :
2641 : /* Close excess kernel FDs. */
2642 184214 : ReleaseLruFiles();
2643 :
2644 184214 : TryAgain:
2645 184214 : if ((file = fopen(name, mode)) != NULL)
2646 : {
2647 169500 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2648 :
2649 169500 : desc->kind = AllocateDescFile;
2650 169500 : desc->desc.file = file;
2651 169500 : desc->create_subid = GetCurrentSubTransactionId();
2652 169500 : numAllocatedDescs++;
2653 169500 : return desc->desc.file;
2654 : }
2655 :
2656 14714 : if (errno == EMFILE || errno == ENFILE)
2657 : {
2658 0 : int save_errno = errno;
2659 :
2660 0 : ereport(LOG,
2661 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2662 : errmsg("out of file descriptors: %m; release and retry")));
2663 0 : errno = 0;
2664 0 : if (ReleaseLruFile())
2665 0 : goto TryAgain;
2666 0 : errno = save_errno;
2667 : }
2668 :
2669 14714 : return NULL;
2670 : }
2671 :
2672 : /*
2673 : * Open a file with OpenTransientFilePerm() and pass default file mode for
2674 : * the fileMode parameter.
2675 : */
2676 : int
2677 15512708 : OpenTransientFile(const char *fileName, int fileFlags)
2678 : {
2679 15512708 : return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2680 : }
2681 :
2682 : /*
2683 : * Like AllocateFile, but returns an unbuffered fd like open(2)
2684 : */
2685 : int
2686 15512720 : OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2687 : {
2688 : int fd;
2689 :
2690 : DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2691 : numAllocatedDescs, fileName));
2692 :
2693 : /* Can we allocate another non-virtual FD? */
2694 15512720 : if (!reserveAllocatedDesc())
2695 0 : ereport(ERROR,
2696 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2697 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2698 : maxAllocatedDescs, fileName)));
2699 :
2700 : /* Close excess kernel FDs. */
2701 15512720 : ReleaseLruFiles();
2702 :
2703 15512720 : fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2704 :
2705 15512720 : if (fd >= 0)
2706 : {
2707 15502600 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2708 :
2709 15502600 : desc->kind = AllocateDescRawFD;
2710 15502600 : desc->desc.fd = fd;
2711 15502600 : desc->create_subid = GetCurrentSubTransactionId();
2712 15502600 : numAllocatedDescs++;
2713 :
2714 15502600 : return fd;
2715 : }
2716 :
2717 10120 : return -1; /* failure */
2718 : }
2719 :
2720 : /*
2721 : * Routines that want to initiate a pipe stream should use OpenPipeStream
2722 : * rather than plain popen(). This lets fd.c deal with freeing FDs if
2723 : * necessary. When done, call ClosePipeStream rather than pclose.
2724 : *
2725 : * This function also ensures that the popen'd program is run with default
2726 : * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2727 : * uses. This ensures desirable response to, eg, closing a read pipe early.
2728 : */
2729 : FILE *
2730 122 : OpenPipeStream(const char *command, const char *mode)
2731 : {
2732 : FILE *file;
2733 : int save_errno;
2734 :
2735 : DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2736 : numAllocatedDescs, command));
2737 :
2738 : /* Can we allocate another non-virtual FD? */
2739 122 : if (!reserveAllocatedDesc())
2740 0 : ereport(ERROR,
2741 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2742 : errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2743 : maxAllocatedDescs, command)));
2744 :
2745 : /* Close excess kernel FDs. */
2746 122 : ReleaseLruFiles();
2747 :
2748 122 : TryAgain:
2749 122 : fflush(NULL);
2750 122 : pqsignal(SIGPIPE, SIG_DFL);
2751 122 : errno = 0;
2752 122 : file = popen(command, mode);
2753 122 : save_errno = errno;
2754 122 : pqsignal(SIGPIPE, SIG_IGN);
2755 122 : errno = save_errno;
2756 122 : if (file != NULL)
2757 : {
2758 122 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2759 :
2760 122 : desc->kind = AllocateDescPipe;
2761 122 : desc->desc.file = file;
2762 122 : desc->create_subid = GetCurrentSubTransactionId();
2763 122 : numAllocatedDescs++;
2764 122 : return desc->desc.file;
2765 : }
2766 :
2767 0 : if (errno == EMFILE || errno == ENFILE)
2768 : {
2769 0 : ereport(LOG,
2770 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2771 : errmsg("out of file descriptors: %m; release and retry")));
2772 0 : if (ReleaseLruFile())
2773 0 : goto TryAgain;
2774 0 : errno = save_errno;
2775 : }
2776 :
2777 0 : return NULL;
2778 : }
2779 :
2780 : /*
2781 : * Free an AllocateDesc of any type.
2782 : *
2783 : * The argument *must* point into the allocatedDescs[] array.
2784 : */
2785 : static int
2786 15765638 : FreeDesc(AllocateDesc *desc)
2787 : {
2788 : int result;
2789 :
2790 : /* Close the underlying object */
2791 15765638 : switch (desc->kind)
2792 : {
2793 169500 : case AllocateDescFile:
2794 169500 : result = fclose(desc->desc.file);
2795 169500 : break;
2796 122 : case AllocateDescPipe:
2797 122 : result = pclose(desc->desc.file);
2798 122 : break;
2799 93416 : case AllocateDescDir:
2800 93416 : result = closedir(desc->desc.dir);
2801 93416 : break;
2802 15502600 : case AllocateDescRawFD:
2803 15502600 : pgaio_closing_fd(desc->desc.fd);
2804 15502600 : result = close(desc->desc.fd);
2805 15502600 : break;
2806 0 : default:
2807 0 : elog(ERROR, "AllocateDesc kind not recognized");
2808 : result = 0; /* keep compiler quiet */
2809 : break;
2810 : }
2811 :
2812 : /* Compact storage in the allocatedDescs array */
2813 15765638 : numAllocatedDescs--;
2814 15765638 : *desc = allocatedDescs[numAllocatedDescs];
2815 :
2816 15765638 : return result;
2817 : }
2818 :
2819 : /*
2820 : * Close a file returned by AllocateFile.
2821 : *
2822 : * Note we do not check fclose's return value --- it is up to the caller
2823 : * to handle close errors.
2824 : */
2825 : int
2826 169468 : FreeFile(FILE *file)
2827 : {
2828 : int i;
2829 :
2830 : DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2831 :
2832 : /* Remove file from list of allocated files, if it's present */
2833 169474 : for (i = numAllocatedDescs; --i >= 0;)
2834 : {
2835 169474 : AllocateDesc *desc = &allocatedDescs[i];
2836 :
2837 169474 : if (desc->kind == AllocateDescFile && desc->desc.file == file)
2838 169468 : return FreeDesc(desc);
2839 : }
2840 :
2841 : /* Only get here if someone passes us a file not in allocatedDescs */
2842 0 : elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2843 :
2844 0 : return fclose(file);
2845 : }
2846 :
2847 : /*
2848 : * Close a file returned by OpenTransientFile.
2849 : *
2850 : * Note we do not check close's return value --- it is up to the caller
2851 : * to handle close errors.
2852 : */
2853 : int
2854 15502598 : CloseTransientFile(int fd)
2855 : {
2856 : int i;
2857 :
2858 : DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2859 :
2860 : /* Remove fd from list of allocated files, if it's present */
2861 15502620 : for (i = numAllocatedDescs; --i >= 0;)
2862 : {
2863 15502620 : AllocateDesc *desc = &allocatedDescs[i];
2864 :
2865 15502620 : if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2866 15502598 : return FreeDesc(desc);
2867 : }
2868 :
2869 : /* Only get here if someone passes us a file not in allocatedDescs */
2870 0 : elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2871 :
2872 0 : pgaio_closing_fd(fd);
2873 :
2874 0 : return close(fd);
2875 : }
2876 :
2877 : /*
2878 : * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2879 : * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2880 : * necessary to open the directory, and with closing it after an elog.
2881 : * When done, call FreeDir rather than closedir.
2882 : *
2883 : * Returns NULL, with errno set, on failure. Note that failure detection
2884 : * is commonly left to the following call of ReadDir or ReadDirExtended;
2885 : * see the comments for ReadDir.
2886 : *
2887 : * Ideally this should be the *only* direct call of opendir() in the backend.
2888 : */
2889 : DIR *
2890 95300 : AllocateDir(const char *dirname)
2891 : {
2892 : DIR *dir;
2893 :
2894 : DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2895 : numAllocatedDescs, dirname));
2896 :
2897 : /* Can we allocate another non-virtual FD? */
2898 95300 : if (!reserveAllocatedDesc())
2899 0 : ereport(ERROR,
2900 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2901 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2902 : maxAllocatedDescs, dirname)));
2903 :
2904 : /* Close excess kernel FDs. */
2905 95300 : ReleaseLruFiles();
2906 :
2907 95300 : TryAgain:
2908 95300 : if ((dir = opendir(dirname)) != NULL)
2909 : {
2910 93416 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2911 :
2912 93416 : desc->kind = AllocateDescDir;
2913 93416 : desc->desc.dir = dir;
2914 93416 : desc->create_subid = GetCurrentSubTransactionId();
2915 93416 : numAllocatedDescs++;
2916 93416 : return desc->desc.dir;
2917 : }
2918 :
2919 1884 : if (errno == EMFILE || errno == ENFILE)
2920 : {
2921 0 : int save_errno = errno;
2922 :
2923 0 : ereport(LOG,
2924 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2925 : errmsg("out of file descriptors: %m; release and retry")));
2926 0 : errno = 0;
2927 0 : if (ReleaseLruFile())
2928 0 : goto TryAgain;
2929 0 : errno = save_errno;
2930 : }
2931 :
2932 1884 : return NULL;
2933 : }
2934 :
2935 : /*
2936 : * Read a directory opened with AllocateDir, ereport'ing any error.
2937 : *
2938 : * This is easier to use than raw readdir() since it takes care of some
2939 : * otherwise rather tedious and error-prone manipulation of errno. Also,
2940 : * if you are happy with a generic error message for AllocateDir failure,
2941 : * you can just do
2942 : *
2943 : * dir = AllocateDir(path);
2944 : * while ((dirent = ReadDir(dir, path)) != NULL)
2945 : * process dirent;
2946 : * FreeDir(dir);
2947 : *
2948 : * since a NULL dir parameter is taken as indicating AllocateDir failed.
2949 : * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2950 : * use this shortcut.)
2951 : *
2952 : * The pathname passed to AllocateDir must be passed to this routine too,
2953 : * but it is only used for error reporting.
2954 : */
2955 : struct dirent *
2956 4719322 : ReadDir(DIR *dir, const char *dirname)
2957 : {
2958 4719322 : return ReadDirExtended(dir, dirname, ERROR);
2959 : }
2960 :
2961 : /*
2962 : * Alternate version of ReadDir that allows caller to specify the elevel
2963 : * for any error report (whether it's reporting an initial failure of
2964 : * AllocateDir or a subsequent directory read failure).
2965 : *
2966 : * If elevel < ERROR, returns NULL after any error. With the normal coding
2967 : * pattern, this will result in falling out of the loop immediately as
2968 : * though the directory contained no (more) entries.
2969 : */
2970 : struct dirent *
2971 8080058 : ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2972 : {
2973 : struct dirent *dent;
2974 :
2975 : /* Give a generic message for AllocateDir failure, if caller didn't */
2976 8080058 : if (dir == NULL)
2977 : {
2978 6 : ereport(elevel,
2979 : (errcode_for_file_access(),
2980 : errmsg("could not open directory \"%s\": %m",
2981 : dirname)));
2982 0 : return NULL;
2983 : }
2984 :
2985 8080052 : errno = 0;
2986 8080052 : if ((dent = readdir(dir)) != NULL)
2987 8010338 : return dent;
2988 :
2989 69714 : if (errno)
2990 0 : ereport(elevel,
2991 : (errcode_for_file_access(),
2992 : errmsg("could not read directory \"%s\": %m",
2993 : dirname)));
2994 69714 : return NULL;
2995 : }
2996 :
2997 : /*
2998 : * Close a directory opened with AllocateDir.
2999 : *
3000 : * Returns closedir's return value (with errno set if it's not 0).
3001 : * Note we do not check the return value --- it is up to the caller
3002 : * to handle close errors if wanted.
3003 : *
3004 : * Does nothing if dir == NULL; we assume that directory open failure was
3005 : * already reported if desired.
3006 : */
3007 : int
3008 93148 : FreeDir(DIR *dir)
3009 : {
3010 : int i;
3011 :
3012 : /* Nothing to do if AllocateDir failed */
3013 93148 : if (dir == NULL)
3014 0 : return 0;
3015 :
3016 : DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
3017 :
3018 : /* Remove dir from list of allocated dirs, if it's present */
3019 93148 : for (i = numAllocatedDescs; --i >= 0;)
3020 : {
3021 93148 : AllocateDesc *desc = &allocatedDescs[i];
3022 :
3023 93148 : if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3024 93148 : return FreeDesc(desc);
3025 : }
3026 :
3027 : /* Only get here if someone passes us a dir not in allocatedDescs */
3028 0 : elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3029 :
3030 0 : return closedir(dir);
3031 : }
3032 :
3033 :
3034 : /*
3035 : * Close a pipe stream returned by OpenPipeStream.
3036 : */
3037 : int
3038 122 : ClosePipeStream(FILE *file)
3039 : {
3040 : int i;
3041 :
3042 : DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3043 :
3044 : /* Remove file from list of allocated files, if it's present */
3045 122 : for (i = numAllocatedDescs; --i >= 0;)
3046 : {
3047 122 : AllocateDesc *desc = &allocatedDescs[i];
3048 :
3049 122 : if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3050 122 : return FreeDesc(desc);
3051 : }
3052 :
3053 : /* Only get here if someone passes us a file not in allocatedDescs */
3054 0 : elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3055 :
3056 0 : return pclose(file);
3057 : }
3058 :
3059 : /*
3060 : * closeAllVfds
3061 : *
3062 : * Force all VFDs into the physically-closed state, so that the fewest
3063 : * possible number of kernel file descriptors are in use. There is no
3064 : * change in the logical state of the VFDs.
3065 : */
3066 : void
3067 64 : closeAllVfds(void)
3068 : {
3069 : Index i;
3070 :
3071 64 : if (SizeVfdCache > 0)
3072 : {
3073 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3074 2048 : for (i = 1; i < SizeVfdCache; i++)
3075 : {
3076 1984 : if (!FileIsNotOpen(i))
3077 262 : LruDelete(i);
3078 : }
3079 : }
3080 64 : }
3081 :
3082 :
3083 : /*
3084 : * SetTempTablespaces
3085 : *
3086 : * Define a list (actually an array) of OIDs of tablespaces to use for
3087 : * temporary files. This list will be used until end of transaction,
3088 : * unless this function is called again before then. It is caller's
3089 : * responsibility that the passed-in array has adequate lifespan (typically
3090 : * it'd be allocated in TopTransactionContext).
3091 : *
3092 : * Some entries of the array may be InvalidOid, indicating that the current
3093 : * database's default tablespace should be used.
3094 : */
3095 : void
3096 6590 : SetTempTablespaces(Oid *tableSpaces, int numSpaces)
3097 : {
3098 : Assert(numSpaces >= 0);
3099 6590 : tempTableSpaces = tableSpaces;
3100 6590 : numTempTableSpaces = numSpaces;
3101 :
3102 : /*
3103 : * Select a random starting point in the list. This is to minimize
3104 : * conflicts between backends that are most likely sharing the same list
3105 : * of temp tablespaces. Note that if we create multiple temp files in the
3106 : * same transaction, we'll advance circularly through the list --- this
3107 : * ensures that large temporary sort files are nicely spread across all
3108 : * available tablespaces.
3109 : */
3110 6590 : if (numSpaces > 1)
3111 0 : nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state,
3112 0 : 0, numSpaces - 1);
3113 : else
3114 6590 : nextTempTableSpace = 0;
3115 6590 : }
3116 :
3117 : /*
3118 : * TempTablespacesAreSet
3119 : *
3120 : * Returns true if SetTempTablespaces has been called in current transaction.
3121 : * (This is just so that tablespaces.c doesn't need its own per-transaction
3122 : * state.)
3123 : */
3124 : bool
3125 9086 : TempTablespacesAreSet(void)
3126 : {
3127 9086 : return (numTempTableSpaces >= 0);
3128 : }
3129 :
3130 : /*
3131 : * GetTempTablespaces
3132 : *
3133 : * Populate an array with the OIDs of the tablespaces that should be used for
3134 : * temporary files. (Some entries may be InvalidOid, indicating that the
3135 : * current database's default tablespace should be used.) At most numSpaces
3136 : * entries will be filled.
3137 : * Returns the number of OIDs that were copied into the output array.
3138 : */
3139 : int
3140 420 : GetTempTablespaces(Oid *tableSpaces, int numSpaces)
3141 : {
3142 : int i;
3143 :
3144 : Assert(TempTablespacesAreSet());
3145 420 : for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3146 0 : tableSpaces[i] = tempTableSpaces[i];
3147 :
3148 420 : return i;
3149 : }
3150 :
3151 : /*
3152 : * GetNextTempTableSpace
3153 : *
3154 : * Select the next temp tablespace to use. A result of InvalidOid means
3155 : * to use the current database's default tablespace.
3156 : */
3157 : Oid
3158 4474 : GetNextTempTableSpace(void)
3159 : {
3160 4474 : if (numTempTableSpaces > 0)
3161 : {
3162 : /* Advance nextTempTableSpace counter with wraparound */
3163 2 : if (++nextTempTableSpace >= numTempTableSpaces)
3164 2 : nextTempTableSpace = 0;
3165 2 : return tempTableSpaces[nextTempTableSpace];
3166 : }
3167 4472 : return InvalidOid;
3168 : }
3169 :
3170 :
3171 : /*
3172 : * AtEOSubXact_Files
3173 : *
3174 : * Take care of subtransaction commit/abort. At abort, we close AllocateDescs
3175 : * that the subtransaction may have opened. At commit, we reassign them to
3176 : * the parent subtransaction. (Temporary files are tracked by ResourceOwners
3177 : * instead.)
3178 : */
3179 : void
3180 20160 : AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
3181 : SubTransactionId parentSubid)
3182 : {
3183 : Index i;
3184 :
3185 20160 : for (i = 0; i < numAllocatedDescs; i++)
3186 : {
3187 0 : if (allocatedDescs[i].create_subid == mySubid)
3188 : {
3189 0 : if (isCommit)
3190 0 : allocatedDescs[i].create_subid = parentSubid;
3191 : else
3192 : {
3193 : /* have to recheck the item after FreeDesc (ugly) */
3194 0 : FreeDesc(&allocatedDescs[i--]);
3195 : }
3196 : }
3197 : }
3198 20160 : }
3199 :
3200 : /*
3201 : * AtEOXact_Files
3202 : *
3203 : * This routine is called during transaction commit or abort. All still-open
3204 : * per-transaction temporary file VFDs are closed, which also causes the
3205 : * underlying files to be deleted (although they should've been closed already
3206 : * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3207 : * closed. We also forget any transaction-local temp tablespace list.
3208 : *
3209 : * The isCommit flag is used only to decide whether to emit warnings about
3210 : * unclosed files.
3211 : */
3212 : void
3213 1001222 : AtEOXact_Files(bool isCommit)
3214 : {
3215 1001222 : CleanupTempFiles(isCommit, false);
3216 1001222 : tempTableSpaces = NULL;
3217 1001222 : numTempTableSpaces = -1;
3218 1001222 : }
3219 :
3220 : /*
3221 : * BeforeShmemExit_Files
3222 : *
3223 : * before_shmem_exit hook to clean up temp files during backend shutdown.
3224 : * Here, we want to clean up *all* temp files including interXact ones.
3225 : */
3226 : static void
3227 45406 : BeforeShmemExit_Files(int code, Datum arg)
3228 : {
3229 45406 : CleanupTempFiles(false, true);
3230 :
3231 : /* prevent further temp files from being created */
3232 : #ifdef USE_ASSERT_CHECKING
3233 : temporary_files_allowed = false;
3234 : #endif
3235 45406 : }
3236 :
3237 : /*
3238 : * Close temporary files and delete their underlying files.
3239 : *
3240 : * isCommit: if true, this is normal transaction commit, and we don't
3241 : * expect any remaining files; warn if there are some.
3242 : *
3243 : * isProcExit: if true, this is being called as the backend process is
3244 : * exiting. If that's the case, we should remove all temporary files; if
3245 : * that's not the case, we are being called for transaction commit/abort
3246 : * and should only remove transaction-local temp files. In either case,
3247 : * also clean up "allocated" stdio files, dirs and fds.
3248 : */
3249 : static void
3250 1046628 : CleanupTempFiles(bool isCommit, bool isProcExit)
3251 : {
3252 : Index i;
3253 :
3254 : /*
3255 : * Careful here: at proc_exit we need extra cleanup, not just
3256 : * xact_temporary files.
3257 : */
3258 1046628 : if (isProcExit || have_xact_temporary_files)
3259 : {
3260 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3261 2730732 : for (i = 1; i < SizeVfdCache; i++)
3262 : {
3263 2683674 : unsigned short fdstate = VfdCache[i].fdstate;
3264 :
3265 2683674 : if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3266 8 : VfdCache[i].fileName != NULL)
3267 : {
3268 : /*
3269 : * If we're in the process of exiting a backend process, close
3270 : * all temporary files. Otherwise, only close temporary files
3271 : * local to the current transaction. They should be closed by
3272 : * the ResourceOwner mechanism already, so this is just a
3273 : * debugging cross-check.
3274 : */
3275 8 : if (isProcExit)
3276 8 : FileClose(i);
3277 0 : else if (fdstate & FD_CLOSE_AT_EOXACT)
3278 : {
3279 0 : elog(WARNING,
3280 : "temporary file %s not closed at end-of-transaction",
3281 : VfdCache[i].fileName);
3282 0 : FileClose(i);
3283 : }
3284 : }
3285 : }
3286 :
3287 47058 : have_xact_temporary_files = false;
3288 : }
3289 :
3290 : /* Complain if any allocated files remain open at commit. */
3291 1046628 : if (isCommit && numAllocatedDescs > 0)
3292 0 : elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3293 : numAllocatedDescs);
3294 :
3295 : /* Clean up "allocated" stdio files, dirs and fds. */
3296 1046930 : while (numAllocatedDescs > 0)
3297 302 : FreeDesc(&allocatedDescs[0]);
3298 1046628 : }
3299 :
3300 :
3301 : /*
3302 : * Remove temporary and temporary relation files left over from a prior
3303 : * postmaster session
3304 : *
3305 : * This should be called during postmaster startup. It will forcibly
3306 : * remove any leftover files created by OpenTemporaryFile and any leftover
3307 : * temporary relation files created by mdcreate.
3308 : *
3309 : * During post-backend-crash restart cycle, this routine is called when
3310 : * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3311 : * queries are using temp files could result in useless storage usage that can
3312 : * only be reclaimed by a service restart. The argument against enabling it is
3313 : * that someone might want to examine the temporary files for debugging
3314 : * purposes. This does however mean that OpenTemporaryFile had better allow for
3315 : * collision with an existing temp file name.
3316 : *
3317 : * NOTE: this function and its subroutines generally report syscall failures
3318 : * with ereport(LOG) and keep going. Removing temp files is not so critical
3319 : * that we should fail to start the database when we can't do it.
3320 : */
3321 : void
3322 1832 : RemovePgTempFiles(void)
3323 : {
3324 : char temp_path[MAXPGPATH + sizeof(PG_TBLSPC_DIR) + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3325 : DIR *spc_dir;
3326 : struct dirent *spc_de;
3327 :
3328 : /*
3329 : * First process temp files in pg_default ($PGDATA/base)
3330 : */
3331 1832 : snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3332 1832 : RemovePgTempFilesInDir(temp_path, true, false);
3333 1832 : RemovePgTempRelationFiles("base");
3334 :
3335 : /*
3336 : * Cycle through temp directories for all non-default tablespaces.
3337 : */
3338 1832 : spc_dir = AllocateDir(PG_TBLSPC_DIR);
3339 :
3340 5650 : while ((spc_de = ReadDirExtended(spc_dir, PG_TBLSPC_DIR, LOG)) != NULL)
3341 : {
3342 3818 : if (strcmp(spc_de->d_name, ".") == 0 ||
3343 1986 : strcmp(spc_de->d_name, "..") == 0)
3344 3664 : continue;
3345 :
3346 154 : snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3347 154 : PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY,
3348 : PG_TEMP_FILES_DIR);
3349 154 : RemovePgTempFilesInDir(temp_path, true, false);
3350 :
3351 154 : snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3352 154 : PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
3353 154 : RemovePgTempRelationFiles(temp_path);
3354 : }
3355 :
3356 1832 : FreeDir(spc_dir);
3357 :
3358 : /*
3359 : * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3360 : * DataDir as well. However, that is *not* cleaned here because doing so
3361 : * would create a race condition. It's done separately, earlier in
3362 : * postmaster startup.
3363 : */
3364 1832 : }
3365 :
3366 : /*
3367 : * Process one pgsql_tmp directory for RemovePgTempFiles.
3368 : *
3369 : * If missing_ok is true, it's all right for the named directory to not exist.
3370 : * Any other problem results in a LOG message. (missing_ok should be true at
3371 : * the top level, since pgsql_tmp directories are not created until needed.)
3372 : *
3373 : * At the top level, this should be called with unlink_all = false, so that
3374 : * only files matching the temporary name prefix will be unlinked. When
3375 : * recursing it will be called with unlink_all = true to unlink everything
3376 : * under a top-level temporary directory.
3377 : *
3378 : * (These two flags could be replaced by one, but it seems clearer to keep
3379 : * them separate.)
3380 : */
3381 : void
3382 1988 : RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3383 : {
3384 : DIR *temp_dir;
3385 : struct dirent *temp_de;
3386 : char rm_path[MAXPGPATH * 2];
3387 :
3388 1988 : temp_dir = AllocateDir(tmpdirname);
3389 :
3390 1988 : if (temp_dir == NULL && errno == ENOENT && missing_ok)
3391 1844 : return;
3392 :
3393 438 : while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3394 : {
3395 294 : if (strcmp(temp_de->d_name, ".") == 0 ||
3396 150 : strcmp(temp_de->d_name, "..") == 0)
3397 288 : continue;
3398 :
3399 6 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
3400 6 : tmpdirname, temp_de->d_name);
3401 :
3402 6 : if (unlink_all ||
3403 6 : strncmp(temp_de->d_name,
3404 : PG_TEMP_FILE_PREFIX,
3405 : strlen(PG_TEMP_FILE_PREFIX)) == 0)
3406 6 : {
3407 6 : PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
3408 :
3409 6 : if (type == PGFILETYPE_ERROR)
3410 0 : continue;
3411 6 : else if (type == PGFILETYPE_DIR)
3412 : {
3413 : /* recursively remove contents, then directory itself */
3414 2 : RemovePgTempFilesInDir(rm_path, false, true);
3415 :
3416 2 : if (rmdir(rm_path) < 0)
3417 0 : ereport(LOG,
3418 : (errcode_for_file_access(),
3419 : errmsg("could not remove directory \"%s\": %m",
3420 : rm_path)));
3421 : }
3422 : else
3423 : {
3424 4 : if (unlink(rm_path) < 0)
3425 0 : ereport(LOG,
3426 : (errcode_for_file_access(),
3427 : errmsg("could not remove file \"%s\": %m",
3428 : rm_path)));
3429 : }
3430 : }
3431 : else
3432 0 : ereport(LOG,
3433 : (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3434 : rm_path)));
3435 : }
3436 :
3437 144 : FreeDir(temp_dir);
3438 : }
3439 :
3440 : /* Process one tablespace directory, look for per-DB subdirectories */
3441 : static void
3442 1986 : RemovePgTempRelationFiles(const char *tsdirname)
3443 : {
3444 : DIR *ts_dir;
3445 : struct dirent *de;
3446 : char dbspace_path[MAXPGPATH * 2];
3447 :
3448 1986 : ts_dir = AllocateDir(tsdirname);
3449 :
3450 12346 : while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3451 : {
3452 : /*
3453 : * We're only interested in the per-database directories, which have
3454 : * numeric names. Note that this code will also (properly) ignore "."
3455 : * and "..".
3456 : */
3457 10360 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3458 4114 : continue;
3459 :
3460 6246 : snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3461 6246 : tsdirname, de->d_name);
3462 6246 : RemovePgTempRelationFilesInDbspace(dbspace_path);
3463 : }
3464 :
3465 1986 : FreeDir(ts_dir);
3466 1986 : }
3467 :
3468 : /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3469 : static void
3470 6246 : RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3471 : {
3472 : DIR *dbspace_dir;
3473 : struct dirent *de;
3474 : char rm_path[MAXPGPATH * 2];
3475 :
3476 6246 : dbspace_dir = AllocateDir(dbspacedirname);
3477 :
3478 1891664 : while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3479 : {
3480 1885418 : if (!looks_like_temp_rel_name(de->d_name))
3481 1885410 : continue;
3482 :
3483 8 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
3484 8 : dbspacedirname, de->d_name);
3485 :
3486 8 : if (unlink(rm_path) < 0)
3487 0 : ereport(LOG,
3488 : (errcode_for_file_access(),
3489 : errmsg("could not remove file \"%s\": %m",
3490 : rm_path)));
3491 : }
3492 :
3493 6246 : FreeDir(dbspace_dir);
3494 6246 : }
3495 :
/*
 * looks_like_temp_rel_name
 *
 * Decide whether a file name has the shape of a temporary relation file:
 * t<digits>_<digits>, optionally followed by _<forkname> and/or by
 * .<segment digits>.  Returns true only if the whole name matches.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *run_start;

	/* A literal "t" comes first. */
	if (*cur != 't')
		return false;
	cur++;

	/* Next, one or more digits terminated by an underscore. */
	run_start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run_start || *cur != '_')
		return false;
	cur++;

	/* Then a second run of one or more digits. */
	run_start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run_start)
		return false;

	/* Optional "_forkname" part. */
	if (*cur == '_')
	{
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		cur += forklen + 1;
	}

	/* Optional ".segment" part, which needs at least one digit. */
	if (*cur == '.')
	{
		run_start = cur + 1;
		cur = run_start;
		while (isdigit((unsigned char) *cur))
			cur++;
		if (cur == run_start)
			return false;
	}

	/* Nothing may follow. */
	return (*cur == '\0');
}
3544 :
#ifdef HAVE_SYNCFS
/*
 * do_syncfs
 *
 * Open the given path and call syncfs() on the resulting descriptor.
 * Failure to open the path or to sync is reported at LOG level and is not
 * treated as fatal.  Startup progress is reported before the attempt.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
							 path);

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd >= 0)
	{
		if (syncfs(fd) < 0)
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not synchronize file system for file \"%s\": %m", path)));

		/* close result intentionally not checked */
		CloseTransientFile(fd);
	}
	else
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
	}
}
#endif
3569 :
3570 : /*
3571 : * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3572 : * all potential filesystem, depending on recovery_init_sync_method setting.
3573 : *
3574 : * We fsync regular files and directories wherever they are, but we
3575 : * follow symlinks only for pg_wal and immediately under pg_tblspc.
3576 : * Other symlinks are presumed to point at files we're not responsible
3577 : * for fsyncing, and might not have privileges to write at all.
3578 : *
3579 : * Errors are logged but not considered fatal; that's because this is used
3580 : * only during database startup, to deal with the possibility that there are
3581 : * issued-but-unsynced writes pending against the data directory. We want to
3582 : * ensure that such writes reach disk before anything that's done in the new
3583 : * run. However, aborting on error would result in failure to start for
3584 : * harmless cases such as read-only files in the data directory, and that's
3585 : * not good either.
3586 : *
3587 : * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3588 : * rewriting all changes again during recovery.
3589 : *
3590 : * Note we assume we're chdir'd into PGDATA to begin with.
3591 : */
3592 : void
3593 372 : SyncDataDirectory(void)
3594 : {
3595 : bool xlog_is_symlink;
3596 :
3597 : /* We can skip this whole thing if fsync is disabled. */
3598 372 : if (!enableFsync)
3599 372 : return;
3600 :
3601 : /*
3602 : * If pg_wal is a symlink, we'll need to recurse into it separately,
3603 : * because the first walkdir below will ignore it.
3604 : */
3605 0 : xlog_is_symlink = false;
3606 :
3607 : {
3608 : struct stat st;
3609 :
3610 0 : if (lstat("pg_wal", &st) < 0)
3611 0 : ereport(LOG,
3612 : (errcode_for_file_access(),
3613 : errmsg("could not stat file \"%s\": %m",
3614 : "pg_wal")));
3615 0 : else if (S_ISLNK(st.st_mode))
3616 0 : xlog_is_symlink = true;
3617 : }
3618 :
3619 : #ifdef HAVE_SYNCFS
3620 0 : if (recovery_init_sync_method == DATA_DIR_SYNC_METHOD_SYNCFS)
3621 : {
3622 : DIR *dir;
3623 : struct dirent *de;
3624 :
3625 : /*
3626 : * On Linux, we don't have to open every single file one by one. We
3627 : * can use syncfs() to sync whole filesystems. We only expect
3628 : * filesystem boundaries to exist where we tolerate symlinks, namely
3629 : * pg_wal and the tablespaces, so we call syncfs() for each of those
3630 : * directories.
3631 : */
3632 :
3633 : /* Prepare to report progress syncing the data directory via syncfs. */
3634 0 : begin_startup_progress_phase();
3635 :
3636 : /* Sync the top level pgdata directory. */
3637 0 : do_syncfs(".");
3638 : /* If any tablespaces are configured, sync each of those. */
3639 0 : dir = AllocateDir(PG_TBLSPC_DIR);
3640 0 : while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3641 : {
3642 : char path[MAXPGPATH];
3643 :
3644 0 : if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3645 0 : continue;
3646 :
3647 0 : snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3648 0 : do_syncfs(path);
3649 : }
3650 0 : FreeDir(dir);
3651 : /* If pg_wal is a symlink, process that too. */
3652 0 : if (xlog_is_symlink)
3653 0 : do_syncfs("pg_wal");
3654 0 : return;
3655 : }
3656 : #endif /* !HAVE_SYNCFS */
3657 :
3658 : #ifdef PG_FLUSH_DATA_WORKS
3659 : /* Prepare to report progress of the pre-fsync phase. */
3660 0 : begin_startup_progress_phase();
3661 :
3662 : /*
3663 : * If possible, hint to the kernel that we're soon going to fsync the data
3664 : * directory and its contents. Errors in this step are even less
3665 : * interesting than normal, so log them only at DEBUG1.
3666 : */
3667 0 : walkdir(".", pre_sync_fname, false, DEBUG1);
3668 0 : if (xlog_is_symlink)
3669 0 : walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3670 0 : walkdir(PG_TBLSPC_DIR, pre_sync_fname, true, DEBUG1);
3671 : #endif
3672 :
3673 : /* Prepare to report progress syncing the data directory via fsync. */
3674 0 : begin_startup_progress_phase();
3675 :
3676 : /*
3677 : * Now we do the fsync()s in the same order.
3678 : *
3679 : * The main call ignores symlinks, so in addition to specially processing
3680 : * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3681 : * process_symlinks = true. Note that if there are any plain directories
3682 : * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3683 : * so we don't worry about optimizing it.
3684 : */
3685 0 : walkdir(".", datadir_fsync_fname, false, LOG);
3686 0 : if (xlog_is_symlink)
3687 0 : walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3688 0 : walkdir(PG_TBLSPC_DIR, datadir_fsync_fname, true, LOG);
3689 : }
3690 :
3691 : /*
3692 : * walkdir: recursively walk a directory, applying the action to each
3693 : * regular file and directory (including the named directory itself).
3694 : *
3695 : * If process_symlinks is true, the action and recursion are also applied
3696 : * to regular files and directories that are pointed to by symlinks in the
3697 : * given directory; otherwise symlinks are ignored. Symlinks are always
3698 : * ignored in subdirectories, ie we intentionally don't pass down the
3699 : * process_symlinks flag to recursive calls.
3700 : *
3701 : * Errors are reported at level elevel, which might be ERROR or less.
3702 : *
3703 : * See also walkdir in file_utils.c, which is a frontend version of this
3704 : * logic.
3705 : */
3706 : static void
3707 382 : walkdir(const char *path,
3708 : void (*action) (const char *fname, bool isdir, int elevel),
3709 : bool process_symlinks,
3710 : int elevel)
3711 : {
3712 : DIR *dir;
3713 : struct dirent *de;
3714 :
3715 382 : dir = AllocateDir(path);
3716 :
3717 3778 : while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3718 : {
3719 : char subpath[MAXPGPATH * 2];
3720 :
3721 3396 : CHECK_FOR_INTERRUPTS();
3722 :
3723 3396 : if (strcmp(de->d_name, ".") == 0 ||
3724 3014 : strcmp(de->d_name, "..") == 0)
3725 764 : continue;
3726 :
3727 2632 : snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3728 :
3729 2632 : switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3730 : {
3731 2632 : case PGFILETYPE_REG:
3732 2632 : (*action) (subpath, false, elevel);
3733 2632 : break;
3734 0 : case PGFILETYPE_DIR:
3735 0 : walkdir(subpath, action, false, elevel);
3736 0 : break;
3737 0 : default:
3738 :
3739 : /*
3740 : * Errors are already reported directly by get_dirent_type(),
3741 : * and any remaining symlinks and unknown file types are
3742 : * ignored.
3743 : */
3744 0 : break;
3745 : }
3746 : }
3747 :
3748 382 : FreeDir(dir); /* we ignore any error here */
3749 :
3750 : /*
3751 : * It's important to fsync the destination directory itself as individual
3752 : * file fsyncs don't guarantee that the directory entry for the file is
3753 : * synced. However, skip this if AllocateDir failed; the action function
3754 : * might not be robust against that.
3755 : */
3756 382 : if (dir)
3757 382 : (*action) (path, true, elevel);
3758 382 : }
3759 :
3760 :
/*
 * pre_sync_fname
 *
 * Give the OS a hint that this file is about to be fsync()'d.
 *
 * Directories are skipped.  Failure to open an unreadable file (EACCES) is
 * ignored silently; other open or close failures are reported at the
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Flushing a directory would most likely just fail, so don't try */
	if (isdir)
		return;

	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* permission problems are expected and not worth a message */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is acceptable because this is
	 * nothing more than a hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3806 :
/*
 * datadir_fsync_fname
 *
 * walkdir action used by SyncDataDirectory: report startup progress for
 * the path, then fsync it via fsync_fname_ext().
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Errors about unreadable files should be ignored silently; this flag
	 * passes that wish on to fsync_fname_ext().
	 */
	const bool	ignore_perm = true;

	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	(void) fsync_fname_ext(fname, isdir, ignore_perm, elevel);
}
3819 :
/*
 * unlink_if_exists_fname
 *
 * Remove the named file or directory.  For a directory, rmdir() is used
 * and a failure other than ENOENT is reported at the given elevel.  A
 * plain file is handed to PathNameDeleteTemporaryFile().
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	/* A directory that is already gone is not an error */
	if (rmdir(fname) != 0 && errno != ENOENT)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove directory \"%s\": %m", fname)));
}
3836 :
3837 : /*
3838 : * fsync_fname_ext -- Try to fsync a file or directory
3839 : *
3840 : * If ignore_perm is true, ignore errors upon trying to open unreadable
3841 : * files. Logs other errors at a caller-specified level.
3842 : *
3843 : * Returns 0 if the operation succeeded, -1 otherwise.
3844 : */
3845 : int
3846 85870 : fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3847 : {
3848 : int fd;
3849 : int flags;
3850 : int returncode;
3851 :
3852 : /*
3853 : * Some OSs require directories to be opened read-only whereas other
3854 : * systems don't allow us to fsync files opened read-only; so we need both
3855 : * cases here. Using O_RDWR will cause us to fail to fsync files that are
3856 : * not writable by our userid, but we assume that's OK.
3857 : */
3858 85870 : flags = PG_BINARY;
3859 85870 : if (!isdir)
3860 31962 : flags |= O_RDWR;
3861 : else
3862 53908 : flags |= O_RDONLY;
3863 :
3864 85870 : fd = OpenTransientFile(fname, flags);
3865 :
3866 : /*
3867 : * Some OSs don't allow us to open directories at all (Windows returns
3868 : * EACCES), just ignore the error in that case. If desired also silently
3869 : * ignoring errors about unreadable files. Log others.
3870 : */
3871 85870 : if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3872 0 : return 0;
3873 85870 : else if (fd < 0 && ignore_perm && errno == EACCES)
3874 0 : return 0;
3875 85870 : else if (fd < 0)
3876 : {
3877 0 : ereport(elevel,
3878 : (errcode_for_file_access(),
3879 : errmsg("could not open file \"%s\": %m", fname)));
3880 0 : return -1;
3881 : }
3882 :
3883 85870 : returncode = pg_fsync(fd);
3884 :
3885 : /*
3886 : * Some OSes don't allow us to fsync directories at all, so we can ignore
3887 : * those errors. Anything else needs to be logged.
3888 : */
3889 85870 : if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3890 : {
3891 : int save_errno;
3892 :
3893 : /* close file upon error, might not be in transaction context */
3894 0 : save_errno = errno;
3895 0 : (void) CloseTransientFile(fd);
3896 0 : errno = save_errno;
3897 :
3898 0 : ereport(elevel,
3899 : (errcode_for_file_access(),
3900 : errmsg("could not fsync file \"%s\": %m", fname)));
3901 0 : return -1;
3902 : }
3903 :
3904 85870 : if (CloseTransientFile(fd) != 0)
3905 : {
3906 0 : ereport(elevel,
3907 : (errcode_for_file_access(),
3908 : errmsg("could not close file \"%s\": %m", fname)));
3909 0 : return -1;
3910 : }
3911 :
3912 85870 : return 0;
3913 : }
3914 :
3915 : /*
3916 : * fsync_parent_path -- fsync the parent path of a file or directory
3917 : *
3918 : * This is aimed at making file operations persistent on disk in case of
3919 : * an OS crash or power failure.
3920 : */
3921 : static int
3922 15668 : fsync_parent_path(const char *fname, int elevel)
3923 : {
3924 : char parentpath[MAXPGPATH];
3925 :
3926 15668 : strlcpy(parentpath, fname, MAXPGPATH);
3927 15668 : get_parent_directory(parentpath);
3928 :
3929 : /*
3930 : * get_parent_directory() returns an empty string if the input argument is
3931 : * just a file name (see comments in path.c), so handle that as being the
3932 : * current directory.
3933 : */
3934 15668 : if (strlen(parentpath) == 0)
3935 444 : strlcpy(parentpath, ".", MAXPGPATH);
3936 :
3937 15668 : if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3938 0 : return -1;
3939 :
3940 15668 : return 0;
3941 : }
3942 :
3943 : /*
3944 : * Create a PostgreSQL data sub-directory
3945 : *
3946 : * The data directory itself, and most of its sub-directories, are created at
3947 : * initdb time, but we do have some occasions when we create directories in
3948 : * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3949 : * make sure that those directories are created consistently. Today, that means
3950 : * making sure that the created directory has the correct permissions, which is
3951 : * what pg_dir_create_mode tracks for us.
3952 : *
3953 : * Note that we also set the umask() based on what we understand the correct
3954 : * permissions to be (see file_perm.c).
3955 : *
3956 : * For permissions other than the default, mkdir() can be used directly, but
3957 : * be sure to consider carefully such cases -- a sub-directory with incorrect
3958 : * permissions in a PostgreSQL data directory could cause backups and other
3959 : * processes to fail.
3960 : */
3961 : int
3962 3140 : MakePGDirectory(const char *directoryName)
3963 : {
3964 3140 : return mkdir(directoryName, pg_dir_create_mode);
3965 : }
3966 :
3967 : /*
3968 : * Return the passed-in error level, or PANIC if data_sync_retry is off.
3969 : *
3970 : * Failure to fsync any data file is cause for immediate panic, unless
3971 : * data_sync_retry is enabled. Data may have been written to the operating
3972 : * system and removed from our buffer pool already, and if we are running on
3973 : * an operating system that forgets dirty data on write-back failure, there
3974 : * may be only one copy of the data remaining: in the WAL. A later attempt to
3975 : * fsync again might falsely report success. Therefore we must not allow any
3976 : * further checkpoints to be attempted. data_sync_retry can in theory be
3977 : * enabled on systems known not to drop dirty buffered data on write-back
3978 : * failure (with the likely outcome that checkpoints will continue to fail
3979 : * until the underlying problem is fixed).
3980 : *
3981 : * Any code that reports a failure from fsync() or related functions should
3982 : * filter the error level with this function.
3983 : */
3984 : int
3985 43926 : data_sync_elevel(int elevel)
3986 : {
3987 43926 : return data_sync_retry ? elevel : PANIC;
3988 : }
3989 :
3990 : bool
3991 2358 : check_debug_io_direct(char **newval, void **extra, GucSource source)
3992 : {
3993 2358 : bool result = true;
3994 : int flags;
3995 :
3996 : #if PG_O_DIRECT == 0
3997 : if (strcmp(*newval, "") != 0)
3998 : {
3999 : GUC_check_errdetail("\"%s\" is not supported on this platform.",
4000 : "debug_io_direct");
4001 : result = false;
4002 : }
4003 : flags = 0;
4004 : #else
4005 : List *elemlist;
4006 : ListCell *l;
4007 : char *rawstring;
4008 :
4009 : /* Need a modifiable copy of string */
4010 2358 : rawstring = pstrdup(*newval);
4011 :
4012 2358 : if (!SplitGUCList(rawstring, ',', &elemlist))
4013 : {
4014 0 : GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
4015 : "debug_io_direct");
4016 0 : pfree(rawstring);
4017 0 : list_free(elemlist);
4018 0 : return false;
4019 : }
4020 :
4021 2358 : flags = 0;
4022 2370 : foreach(l, elemlist)
4023 : {
4024 12 : char *item = (char *) lfirst(l);
4025 :
4026 12 : if (pg_strcasecmp(item, "data") == 0)
4027 4 : flags |= IO_DIRECT_DATA;
4028 8 : else if (pg_strcasecmp(item, "wal") == 0)
4029 4 : flags |= IO_DIRECT_WAL;
4030 4 : else if (pg_strcasecmp(item, "wal_init") == 0)
4031 4 : flags |= IO_DIRECT_WAL_INIT;
4032 : else
4033 : {
4034 0 : GUC_check_errdetail("Invalid option \"%s\".", item);
4035 0 : result = false;
4036 0 : break;
4037 : }
4038 : }
4039 :
4040 : /*
4041 : * It's possible to configure block sizes smaller than our assumed I/O
4042 : * alignment size, which could result in invalid I/O requests.
4043 : */
4044 : #if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4045 : if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4046 : {
4047 : GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4048 : "debug_io_direct", "XLOG_BLCKSZ");
4049 : result = false;
4050 : }
4051 : #endif
4052 : #if BLCKSZ < PG_IO_ALIGN_SIZE
4053 : if (result && (flags & IO_DIRECT_DATA))
4054 : {
4055 : GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
4056 : "debug_io_direct", "BLCKSZ");
4057 : result = false;
4058 : }
4059 : #endif
4060 :
4061 2358 : pfree(rawstring);
4062 2358 : list_free(elemlist);
4063 : #endif
4064 :
4065 2358 : if (!result)
4066 0 : return result;
4067 :
4068 : /* Save the flags in *extra, for use by assign_debug_io_direct */
4069 2358 : *extra = guc_malloc(LOG, sizeof(int));
4070 2358 : if (!*extra)
4071 0 : return false;
4072 2358 : *((int *) *extra) = flags;
4073 :
4074 2358 : return result;
4075 : }
4076 :
4077 : void
4078 2358 : assign_debug_io_direct(const char *newval, void *extra)
4079 : {
4080 2358 : int *flags = (int *) extra;
4081 :
4082 2358 : io_direct_flags = *flags;
4083 2358 : }
4084 :
4085 : /* ResourceOwner callbacks */
4086 :
4087 : static void
4088 8 : ResOwnerReleaseFile(Datum res)
4089 : {
4090 8 : File file = (File) DatumGetInt32(res);
4091 : Vfd *vfdP;
4092 :
4093 : Assert(FileIsValid(file));
4094 :
4095 8 : vfdP = &VfdCache[file];
4096 8 : vfdP->resowner = NULL;
4097 :
4098 8 : FileClose(file);
4099 8 : }
4100 :
4101 : static char *
4102 0 : ResOwnerPrintFile(Datum res)
4103 : {
4104 0 : return psprintf("File %d", DatumGetInt32(res));
4105 : }
|