Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * fd.c
4 : * Virtual file descriptor code.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/storage/file/fd.c
11 : *
12 : * NOTES:
13 : *
14 : * This code manages a cache of 'virtual' file descriptors (VFDs).
15 : * The server opens many file descriptors for a variety of reasons,
16 : * including base tables, scratch files (e.g., sort and hash spool
17 : * files), and random calls to C library routines like system(3); it
18 : * is quite easy to exceed system limits on the number of open files a
19 : * single process can have. (This is around 1024 on many modern
20 : * operating systems, but may be lower on others.)
21 : *
22 : * VFDs are managed as an LRU pool, with actual OS file descriptors
23 : * being opened and closed as needed. Obviously, if a file is
24 : * opened using these interfaces, all subsequent operations on it must also
25 : * be through these interfaces (the File type is not a real file
26 : * descriptor).
27 : *
28 : * For this scheme to work, most (if not all) routines throughout the
29 : * server should use these interfaces instead of calling the C library
30 : * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 : * may find ourselves short of real file descriptors anyway.
32 : *
33 : * INTERFACE ROUTINES
34 : *
35 : * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 : * A File opened with OpenTemporaryFile is automatically deleted when the
37 : * File is closed, either explicitly or implicitly at end of transaction or
38 : * process exit. PathNameOpenFile is intended for files that are held open
39 : * for a long time, like relation files. It is the caller's responsibility
40 : * to close them; there is no automatic mechanism in fd.c for that.
41 : *
42 : * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 : * temporary files that have names so that they can be shared between
44 : * backends. Such files are automatically closed and count against the
45 : * temporary file limit of the backend that creates them, but unlike anonymous
46 : * files they are not automatically deleted. See sharedfileset.c for a shared
47 : * ownership mechanism that provides automatic cleanup for shared files when
48 : * the last of a group of backends detaches.
49 : *
50 : * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 : * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 : * They behave like the corresponding native functions, except that the handle
53 : * is registered with the current subtransaction, and will be automatically
54 : * closed at abort. These are intended mainly for short operations like
55 : * reading a configuration file; there is a limit on the number of files that
56 : * can be opened using these functions at any one time.
57 : *
58 : * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 : * release file descriptors in use by the virtual file descriptors if
60 : * necessary. There is no automatic cleanup of file descriptors returned by
61 : * BasicOpenFile; it is solely the caller's responsibility to close the file
62 : * descriptor by calling close(2).
63 : *
64 : * If a non-virtual file descriptor needs to be held open for any length of
65 : * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 : * (and eventually ReleaseExternalFD), so that we can take it into account
67 : * while deciding how many VFDs can be open. This applies to FDs obtained
68 : * with BasicOpenFile as well as those obtained without use of any fd.c API.
69 : *
70 : *-------------------------------------------------------------------------
71 : */
72 :
73 : #include "postgres.h"
74 :
75 : #include <dirent.h>
76 : #include <sys/file.h>
77 : #include <sys/param.h>
78 : #include <sys/resource.h> /* for getrlimit */
79 : #include <sys/stat.h>
80 : #include <sys/types.h>
81 : #ifndef WIN32
82 : #include <sys/mman.h>
83 : #endif
84 : #include <limits.h>
85 : #include <unistd.h>
86 : #include <fcntl.h>
87 :
88 : #include "access/xact.h"
89 : #include "access/xlog.h"
90 : #include "catalog/pg_tablespace.h"
91 : #include "common/file_perm.h"
92 : #include "common/file_utils.h"
93 : #include "common/pg_prng.h"
94 : #include "miscadmin.h"
95 : #include "pgstat.h"
96 : #include "postmaster/startup.h"
97 : #include "storage/aio.h"
98 : #include "storage/fd.h"
99 : #include "storage/ipc.h"
100 : #include "utils/guc.h"
101 : #include "utils/guc_hooks.h"
102 : #include "utils/resowner.h"
103 : #include "utils/varlena.h"
104 :
/*
 * PG_FLUSH_DATA_WORKS is defined whenever at least one of the
 * pg_flush_data() implementations can be compiled: sync_file_range(),
 * msync(MS_ASYNC) on non-Windows platforms, or
 * posix_fadvise(POSIX_FADV_DONTNEED).
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
	(!defined(WIN32) && defined(MS_ASYNC)) || \
	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif
113 :
/*
 * Number of file descriptors we always leave free for code that opens files
 * without going through fd.c: system(), the dynamic loader, and the like.
 * (We work hard to avoid EMFILE, but ENFILE caused by other processes using
 * up FDs can never be ruled out, so bypassing fd.c is a bad idea.  Still, we
 * cannot control all code.)
 *
 * As this is a fixed allowance, it only works if such code does not keep FDs
 * open over the long term; otherwise the slop is likely to be insufficient.
 * In particular, loading a shared library is expected not to permanently
 * increase the number of open files.  (This appears to be true on most if
 * not all platforms as of Feb 2004.)
 */
#define NUM_RESERVED_FDS		10

/*
 * Minimum number of usable FDs, after subtracting the reserved ones, below
 * which we choke.  (The value is chosen to work with "ulimit -n 64", but not
 * much less than that.  It also ensures numExternalFDs can be at least 16;
 * as of this writing, the contrib/postgres_fdw regression tests will not
 * pass unless that can grow to at least 14.)
 */
#define FD_MINFREE				48
139 :
140 : /*
141 : * A number of platforms allow individual processes to open many more files
142 : * than they can really support when *many* processes do the same thing.
143 : * This GUC parameter lets the DBA limit max_safe_fds to something less than
144 : * what the postmaster's initial probe suggests will work.
145 : */
146 : int max_files_per_process = 1000;
147 :
148 : /*
149 : * Maximum number of file descriptors to open for operations that fd.c knows
150 : * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
151 : * to a conservative value, and remains that way indefinitely in bootstrap or
152 : * standalone-backend cases. In normal postmaster operation, the postmaster
153 : * calls set_max_safe_fds() late in initialization to update the value, and
154 : * that value is then inherited by forked subprocesses.
155 : *
156 : * Note: the value of max_files_per_process is taken into account while
157 : * setting this variable, and so need not be tested separately.
158 : */
159 : int max_safe_fds = FD_MINFREE; /* default if not changed */
160 :
161 : /* Whether it is safe to continue running after fsync() fails. */
162 : bool data_sync_retry = false;
163 :
164 : /* How SyncDataDirectory() should do its job. */
165 : int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
166 :
167 : /* Which kinds of files should be opened with PG_O_DIRECT. */
168 : int io_direct_flags;
169 :
/*
 * Debugging support.
 *
 * DO_DB(A) executes statement A only when FDDEBUG is defined, restoring
 * errno afterwards so the debugging code cannot disturb the caller's error
 * state.  Without FDDEBUG it expands to a no-op expression.
 */
#ifndef FDDEBUG
#define DO_DB(A) \
	((void) 0)
#else
#define DO_DB(A) \
	do { \
		int			do_db_errno_saved_ = errno; \
		A; \
		errno = do_db_errno_saved_; \
	} while (0)
#endif
183 :
/* Value stored in Vfd.fd while no kernel FD is assigned to the VFD */
#define VFD_CLOSED (-1)

/*
 * A File is valid if it indexes a VfdCache slot other than the list header
 * (slot 0) and that slot currently has a file name.
 */
#define FileIsValid(file) \
	((file) > 0 && \
	 (file) < (int) SizeVfdCache && \
	 VfdCache[file].fileName != NULL)

/* True if the VFD currently has no kernel FD */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)

/* Bits assigned in Vfd.fdstate: */
#define FD_DELETE_AT_CLOSE	(1 << 0)	/* set = delete file when closed */
#define FD_CLOSE_AT_EOXACT	(1 << 1)	/* set = close at eoXact */
#define FD_TEMP_FILE_LIMIT	(1 << 2)	/* set = respect temp_file_limit */
195 :
196 : typedef struct vfd
197 : {
198 : int fd; /* current FD, or VFD_CLOSED if none */
199 : unsigned short fdstate; /* bitflags for VFD's state */
200 : ResourceOwner resowner; /* owner, for automatic cleanup */
201 : File nextFree; /* link to next free VFD, if in freelist */
202 : File lruMoreRecently; /* doubly linked recency-of-use list */
203 : File lruLessRecently;
204 : off_t fileSize; /* current size of file (0 if not temporary) */
205 : char *fileName; /* name of file, or NULL for unused VFD */
206 : /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
207 : int fileFlags; /* open(2) flags for (re)opening the file */
208 : mode_t fileMode; /* mode to pass to open(2) */
209 : } Vfd;
210 :
211 : /*
212 : * Virtual File Descriptor array pointer and size. This grows as
213 : * needed. 'File' values are indexes into this array.
214 : * Note that VfdCache[0] is not a usable VFD, just a list header.
215 : */
216 : static Vfd *VfdCache;
217 : static Size SizeVfdCache = 0;
218 :
219 : /*
220 : * Number of file descriptors known to be in use by VFD entries.
221 : */
222 : static int nfile = 0;
223 :
224 : /*
225 : * Flag to tell whether it's worth scanning VfdCache looking for temp files
226 : * to close
227 : */
228 : static bool have_xact_temporary_files = false;
229 :
230 : /*
231 : * Tracks the total size of all temporary files. Note: when temp_file_limit
232 : * is being enforced, this cannot overflow since the limit cannot be more
233 : * than INT_MAX kilobytes. When not enforcing, it could theoretically
234 : * overflow, but we don't care.
235 : */
236 : static uint64 temporary_files_size = 0;
237 :
238 : /* Temporary file access initialized and not yet shut down? */
239 : #ifdef USE_ASSERT_CHECKING
240 : static bool temporary_files_allowed = false;
241 : #endif
242 :
243 : /*
244 : * List of OS handles opened with AllocateFile, AllocateDir and
245 : * OpenTransientFile.
246 : */
247 : typedef enum
248 : {
249 : AllocateDescFile,
250 : AllocateDescPipe,
251 : AllocateDescDir,
252 : AllocateDescRawFD,
253 : } AllocateDescKind;
254 :
255 : typedef struct
256 : {
257 : AllocateDescKind kind;
258 : SubTransactionId create_subid;
259 : union
260 : {
261 : FILE *file;
262 : DIR *dir;
263 : int fd;
264 : } desc;
265 : } AllocateDesc;
266 :
267 : static int numAllocatedDescs = 0;
268 : static int maxAllocatedDescs = 0;
269 : static AllocateDesc *allocatedDescs = NULL;
270 :
271 : /*
272 : * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
273 : */
274 : static int numExternalFDs = 0;
275 :
276 : /*
277 : * Number of temporary files opened during the current session;
278 : * this is used in generation of tempfile names.
279 : */
280 : static long tempFileCounter = 0;
281 :
282 : /*
283 : * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
284 : * indicating that the current database's default tablespace should be used.)
285 : * When numTempTableSpaces is -1, this has not been set in the current
286 : * transaction.
287 : */
288 : static Oid *tempTableSpaces = NULL;
289 : static int numTempTableSpaces = -1;
290 : static int nextTempTableSpace = 0;
291 :
292 :
293 : /*--------------------
294 : *
295 : * Private Routines
296 : *
297 : * Delete - delete a file from the Lru ring
298 : * LruDelete - remove a file from the Lru ring and close its FD
299 : * Insert - put a file at the front of the Lru ring
300 : * LruInsert - put a file at the front of the Lru ring and open it
301 : * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
302 : * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
303 : * AllocateVfd - grab a free (or new) file record (from VfdCache)
304 : * FreeVfd - free a file record
305 : *
306 : * The Least Recently Used ring is a doubly linked list that begins and
307 : * ends on element zero. Element zero is special -- it doesn't represent
308 : * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
309 : * anchor that shows us the beginning/end of the ring.
310 : * Only VFD elements that are currently really open (have an FD assigned) are
311 : * in the Lru ring. Elements that are "virtually" open can be recognized
312 : * by having a non-null fileName field.
313 : *
314 : * example:
315 : *
316 : * /--less----\ /---------\
317 : * v \ v \
318 : * #0 --more---> LeastRecentlyUsed --more-\ \
319 : * ^\ | |
320 : * \\less--> MostRecentlyUsedFile <---/ |
321 : * \more---/ \--less--/
322 : *
323 : *--------------------
324 : */
325 : static void Delete(File file);
326 : static void LruDelete(File file);
327 : static void Insert(File file);
328 : static int LruInsert(File file);
329 : static bool ReleaseLruFile(void);
330 : static void ReleaseLruFiles(void);
331 : static File AllocateVfd(void);
332 : static void FreeVfd(File file);
333 :
334 : static int FileAccess(File file);
335 : static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
336 : static bool reserveAllocatedDesc(void);
337 : static int FreeDesc(AllocateDesc *desc);
338 :
339 : static void BeforeShmemExit_Files(int code, Datum arg);
340 : static void CleanupTempFiles(bool isCommit, bool isProcExit);
341 : static void RemovePgTempRelationFiles(const char *tsdirname);
342 : static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
343 :
344 : static void walkdir(const char *path,
345 : void (*action) (const char *fname, bool isdir, int elevel),
346 : bool process_symlinks,
347 : int elevel);
348 : #ifdef PG_FLUSH_DATA_WORKS
349 : static void pre_sync_fname(const char *fname, bool isdir, int elevel);
350 : #endif
351 : static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
352 : static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
353 :
354 : static int fsync_parent_path(const char *fname, int elevel);
355 :
356 :
357 : /* ResourceOwner callbacks to hold virtual file descriptors */
358 : static void ResOwnerReleaseFile(Datum res);
359 : static char *ResOwnerPrintFile(Datum res);
360 :
361 : static const ResourceOwnerDesc file_resowner_desc =
362 : {
363 : .name = "File",
364 : .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
365 : .release_priority = RELEASE_PRIO_FILES,
366 : .ReleaseResource = ResOwnerReleaseFile,
367 : .DebugPrint = ResOwnerPrintFile
368 : };
369 :
370 : /* Convenience wrappers over ResourceOwnerRemember/Forget */
371 : static inline void
372 7350 : ResourceOwnerRememberFile(ResourceOwner owner, File file)
373 : {
374 7350 : ResourceOwnerRemember(owner, Int32GetDatum(file), &file_resowner_desc);
375 7350 : }
376 : static inline void
377 7342 : ResourceOwnerForgetFile(ResourceOwner owner, File file)
378 : {
379 7342 : ResourceOwnerForget(owner, Int32GetDatum(file), &file_resowner_desc);
380 7342 : }
381 :
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Dispatches to pg_fsync_writethrough() when the platform has
 * HAVE_FSYNC_WRITETHROUGH and wal_sync_method selects it, and to
 * pg_fsync_no_writethrough() otherwise.  Returns whatever the chosen
 * routine returns.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat statbuf;

	/*
	 * fsync() implementations differ in which access modes they require of
	 * the descriptor, and the requirements are not the same for directories
	 * as for plain files.  Any descriptor that might end up here should
	 * therefore have been opened in a way that works on every supported
	 * system, or the code is not portable even if it happens to run fine
	 * locally.
	 *
	 * So assert that a plain file was opened with write permissions (i.e.,
	 * not O_RDONLY) and a directory without them (O_RDONLY).  The check is
	 * performed even if fsync() is disabled.
	 *
	 * An fstat() failure is ignored; the fsync() below will complain.
	 */
	if (fstat(fd, &statbuf) == 0)
	{
		int			accmode = fcntl(fd, F_GETFL) & O_ACCMODE;

		if (S_ISDIR(statbuf.st_mode))
			Assert(accmode == O_RDONLY);
		else
			Assert(accmode != O_RDONLY);
	}
	errno = 0;
#endif

	/* the wal_sync_method test is compiled only where it can matter */
#if defined(HAVE_FSYNC_WRITETHROUGH)
	if (wal_sync_method == WAL_SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
431 :
432 :
433 : /*
434 : * pg_fsync_no_writethrough --- same as fsync except does nothing if
435 : * enableFsync is off
436 : */
437 : int
438 128874 : pg_fsync_no_writethrough(int fd)
439 : {
440 : int rc;
441 :
442 128874 : if (!enableFsync)
443 128874 : return 0;
444 :
445 0 : retry:
446 0 : rc = fsync(fd);
447 :
448 0 : if (rc == -1 && errno == EINTR)
449 0 : goto retry;
450 :
451 0 : return rc;
452 : }
453 :
454 : /*
455 : * pg_fsync_writethrough
456 : */
457 : int
458 0 : pg_fsync_writethrough(int fd)
459 : {
460 0 : if (enableFsync)
461 : {
462 : #if defined(F_FULLFSYNC)
463 : return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
464 : #else
465 0 : errno = ENOSYS;
466 0 : return -1;
467 : #endif
468 : }
469 : else
470 0 : return 0;
471 : }
472 :
473 : /*
474 : * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
475 : */
476 : int
477 0 : pg_fdatasync(int fd)
478 : {
479 : int rc;
480 :
481 0 : if (!enableFsync)
482 0 : return 0;
483 :
484 0 : retry:
485 0 : rc = fdatasync(fd);
486 :
487 0 : if (rc == -1 && errno == EINTR)
488 0 : goto retry;
489 :
490 0 : return rc;
491 : }
492 :
493 : /*
494 : * pg_file_exists -- check that a file exists.
495 : *
496 : * This requires an absolute path to the file. Returns true if the file is
497 : * not a directory, false otherwise.
498 : */
499 : bool
500 37070 : pg_file_exists(const char *name)
501 : {
502 : struct stat st;
503 :
504 : Assert(name != NULL);
505 :
506 37070 : if (stat(name, &st) == 0)
507 19626 : return !S_ISDIR(st.st_mode);
508 17444 : else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
509 0 : ereport(ERROR,
510 : (errcode_for_file_access(),
511 : errmsg("could not access file \"%s\": %m", name)));
512 :
513 17444 : return false;
514 : }
515 :
516 : /*
517 : * pg_flush_data --- advise OS that the described dirty data should be flushed
518 : *
519 : * offset of 0 with nbytes 0 means that the entire file should be flushed
520 : */
521 : void
522 70606 : pg_flush_data(int fd, off_t offset, off_t nbytes)
523 : {
524 : /*
525 : * Right now file flushing is primarily used to avoid making later
526 : * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
527 : * if fsyncs are disabled - that's a decision we might want to make
528 : * configurable at some point.
529 : */
530 70606 : if (!enableFsync)
531 70606 : return;
532 :
533 : /*
534 : * We compile all alternatives that are supported on the current platform,
535 : * to find portability problems more easily.
536 : */
537 : #if defined(HAVE_SYNC_FILE_RANGE)
538 : {
539 : int rc;
540 : static bool not_implemented_by_kernel = false;
541 :
542 0 : if (not_implemented_by_kernel)
543 0 : return;
544 :
545 0 : retry:
546 :
547 : /*
548 : * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
549 : * tells the OS that writeback for the specified blocks should be
550 : * started, but that we don't want to wait for completion. Note that
551 : * this call might block if too much dirty data exists in the range.
552 : * This is the preferable method on OSs supporting it, as it works
553 : * reliably when available (contrast to msync()) and doesn't flush out
554 : * clean data (like FADV_DONTNEED).
555 : */
556 0 : rc = sync_file_range(fd, offset, nbytes,
557 : SYNC_FILE_RANGE_WRITE);
558 0 : if (rc != 0)
559 : {
560 : int elevel;
561 :
562 0 : if (rc == EINTR)
563 0 : goto retry;
564 :
565 : /*
566 : * For systems that don't have an implementation of
567 : * sync_file_range() such as Windows WSL, generate only one
568 : * warning and then suppress all further attempts by this process.
569 : */
570 0 : if (errno == ENOSYS)
571 : {
572 0 : elevel = WARNING;
573 0 : not_implemented_by_kernel = true;
574 : }
575 : else
576 0 : elevel = data_sync_elevel(WARNING);
577 :
578 0 : ereport(elevel,
579 : (errcode_for_file_access(),
580 : errmsg("could not flush dirty data: %m")));
581 : }
582 :
583 0 : return;
584 : }
585 : #endif
586 : #if !defined(WIN32) && defined(MS_ASYNC)
587 : {
588 : void *p;
589 : static int pagesize = 0;
590 :
591 : /*
592 : * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
593 : * writeback. On linux it only does so if MS_SYNC is specified, but
594 : * then it does the writeback synchronously. Luckily all common linux
595 : * systems have sync_file_range(). This is preferable over
596 : * FADV_DONTNEED because it doesn't flush out clean data.
597 : *
598 : * We map the file (mmap()), tell the kernel to sync back the contents
599 : * (msync()), and then remove the mapping again (munmap()).
600 : */
601 :
602 : /* mmap() needs actual length if we want to map whole file */
603 : if (offset == 0 && nbytes == 0)
604 : {
605 : nbytes = lseek(fd, 0, SEEK_END);
606 : if (nbytes < 0)
607 : {
608 : ereport(WARNING,
609 : (errcode_for_file_access(),
610 : errmsg("could not determine dirty data size: %m")));
611 : return;
612 : }
613 : }
614 :
615 : /*
616 : * Some platforms reject partial-page mmap() attempts. To deal with
617 : * that, just truncate the request to a page boundary. If any extra
618 : * bytes don't get flushed, well, it's only a hint anyway.
619 : */
620 :
621 : /* fetch pagesize only once */
622 : if (pagesize == 0)
623 : pagesize = sysconf(_SC_PAGESIZE);
624 :
625 : /* align length to pagesize, dropping any fractional page */
626 : if (pagesize > 0)
627 : nbytes = (nbytes / pagesize) * pagesize;
628 :
629 : /* fractional-page request is a no-op */
630 : if (nbytes <= 0)
631 : return;
632 :
633 : /*
634 : * mmap could well fail, particularly on 32-bit platforms where there
635 : * may simply not be enough address space. If so, silently fall
636 : * through to the next implementation.
637 : */
638 : if (nbytes <= (off_t) SSIZE_MAX)
639 : p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
640 : else
641 : p = MAP_FAILED;
642 :
643 : if (p != MAP_FAILED)
644 : {
645 : int rc;
646 :
647 : rc = msync(p, (size_t) nbytes, MS_ASYNC);
648 : if (rc != 0)
649 : {
650 : ereport(data_sync_elevel(WARNING),
651 : (errcode_for_file_access(),
652 : errmsg("could not flush dirty data: %m")));
653 : /* NB: need to fall through to munmap()! */
654 : }
655 :
656 : rc = munmap(p, (size_t) nbytes);
657 : if (rc != 0)
658 : {
659 : /* FATAL error because mapping would remain */
660 : ereport(FATAL,
661 : (errcode_for_file_access(),
662 : errmsg("could not munmap() while flushing data: %m")));
663 : }
664 :
665 : return;
666 : }
667 : }
668 : #endif
669 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
670 : {
671 : int rc;
672 :
673 : /*
674 : * Signal the kernel that the passed in range should not be cached
675 : * anymore. This has the, desired, side effect of writing out dirty
676 : * data, and the, undesired, side effect of likely discarding useful
677 : * clean cached blocks. For the latter reason this is the least
678 : * preferable method.
679 : */
680 :
681 : rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
682 :
683 : if (rc != 0)
684 : {
685 : /* don't error out, this is just a performance optimization */
686 : ereport(WARNING,
687 : (errcode_for_file_access(),
688 : errmsg("could not flush dirty data: %m")));
689 : }
690 :
691 : return;
692 : }
693 : #endif
694 : }
695 :
/*
 * Truncate an open file to a given length.
 *
 * ftruncate() is repeated for as long as it fails with EINTR; the result of
 * the last call is returned.
 */
static int
pg_ftruncate(int fd, off_t length)
{
	int			result;

	do
	{
		result = ftruncate(fd, length);
	} while (result == -1 && errno == EINTR);

	return result;
}
712 :
/*
 * Truncate a file to a given length by name.
 *
 * On WIN32 the file is opened with OpenTransientFile() and truncated through
 * pg_ftruncate(), with errno from the truncation preserved across the close.
 * Elsewhere truncate() is called directly and repeated while it fails with
 * EINTR.  Returns the result of the truncation, or -1 if the file could not
 * be opened.
 */
int
pg_truncate(const char *path, off_t length)
{
	int			result;
#ifdef WIN32
	int			fd = OpenTransientFile(path, O_RDWR | PG_BINARY);

	if (fd < 0)
		result = -1;
	else
	{
		int			saved_errno;

		result = pg_ftruncate(fd, length);

		/* don't let the close clobber the truncation's errno */
		saved_errno = errno;
		CloseTransientFile(fd);
		errno = saved_errno;
	}
#else

	do
	{
		result = truncate(path, length);
	} while (result == -1 && errno == EINTR);
#endif

	return result;
}
745 :
746 : /*
747 : * fsync_fname -- fsync a file or directory, handling errors properly
748 : *
749 : * Try to fsync a file or directory. When doing the latter, ignore errors that
750 : * indicate the OS just doesn't allow/require fsyncing directories.
751 : */
752 : void
753 40930 : fsync_fname(const char *fname, bool isdir)
754 : {
755 40930 : fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
756 40930 : }
757 :
758 : /*
759 : * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
760 : *
761 : * This routine ensures that, after returning, the effect of renaming file
762 : * persists in case of a crash. A crash while this routine is running will
763 : * leave you with either the pre-existing or the moved file in place of the
764 : * new file; no mixed state or truncated files are possible.
765 : *
766 : * It does so by using fsync on the old filename and the possibly existing
767 : * target filename before the rename, and the target file and directory after.
768 : *
769 : * Note that rename() cannot be used across arbitrary directories, as they
770 : * might not be on the same filesystem. Therefore this routine does not
771 : * support renaming across directories.
772 : *
773 : * Log errors with the caller specified severity.
774 : *
775 : * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
776 : * valid upon return.
777 : */
778 : int
779 12604 : durable_rename(const char *oldfile, const char *newfile, int elevel)
780 : {
781 : int fd;
782 :
783 : /*
784 : * First fsync the old and target path (if it exists), to ensure that they
785 : * are properly persistent on disk. Syncing the target file is not
786 : * strictly necessary, but it makes it easier to reason about crashes;
787 : * because it's then guaranteed that either source or target file exists
788 : * after a crash.
789 : */
790 12604 : if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
791 0 : return -1;
792 :
793 12604 : fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
794 12604 : if (fd < 0)
795 : {
796 8824 : if (errno != ENOENT)
797 : {
798 0 : ereport(elevel,
799 : (errcode_for_file_access(),
800 : errmsg("could not open file \"%s\": %m", newfile)));
801 0 : return -1;
802 : }
803 : }
804 : else
805 : {
806 3780 : if (pg_fsync(fd) != 0)
807 : {
808 : int save_errno;
809 :
810 : /* close file upon error, might not be in transaction context */
811 0 : save_errno = errno;
812 0 : CloseTransientFile(fd);
813 0 : errno = save_errno;
814 :
815 0 : ereport(elevel,
816 : (errcode_for_file_access(),
817 : errmsg("could not fsync file \"%s\": %m", newfile)));
818 0 : return -1;
819 : }
820 :
821 3780 : if (CloseTransientFile(fd) != 0)
822 : {
823 0 : ereport(elevel,
824 : (errcode_for_file_access(),
825 : errmsg("could not close file \"%s\": %m", newfile)));
826 0 : return -1;
827 : }
828 : }
829 :
830 : /* Time to do the real deal... */
831 12604 : if (rename(oldfile, newfile) < 0)
832 : {
833 0 : ereport(elevel,
834 : (errcode_for_file_access(),
835 : errmsg("could not rename file \"%s\" to \"%s\": %m",
836 : oldfile, newfile)));
837 0 : return -1;
838 : }
839 :
840 : /*
841 : * To guarantee renaming the file is persistent, fsync the file with its
842 : * new name, and its containing directory.
843 : */
844 12604 : if (fsync_fname_ext(newfile, false, false, elevel) != 0)
845 0 : return -1;
846 :
847 12604 : if (fsync_parent_path(newfile, elevel) != 0)
848 0 : return -1;
849 :
850 12604 : return 0;
851 : }
852 :
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * After this routine returns, the removal of the file persists across a
 * crash.  A crash while it is running leaves the system in no mixed state.
 *
 * This is achieved by fsyncing the file's parent directory once the actual
 * removal is done.
 *
 * Errors are logged with the severity specified by the caller.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/* the removal only becomes persistent once the directory is fsync'd */
	return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}
889 :
890 : /*
891 : * InitFileAccess --- initialize this module during backend startup
892 : *
893 : * This is called during either normal or standalone backend start.
894 : * It is *not* called in the postmaster.
895 : *
896 : * Note that this does not initialize temporary file access, that is
897 : * separately initialized via InitTemporaryFileAccess().
898 : */
899 : void
900 42720 : InitFileAccess(void)
901 : {
902 : Assert(SizeVfdCache == 0); /* call me only once */
903 :
904 : /* initialize cache header entry */
905 42720 : VfdCache = (Vfd *) malloc(sizeof(Vfd));
906 42720 : if (VfdCache == NULL)
907 0 : ereport(FATAL,
908 : (errcode(ERRCODE_OUT_OF_MEMORY),
909 : errmsg("out of memory")));
910 :
911 341760 : MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
912 42720 : VfdCache->fd = VFD_CLOSED;
913 :
914 42720 : SizeVfdCache = 1;
915 42720 : }
916 :
917 : /*
918 : * InitTemporaryFileAccess --- initialize temporary file access during startup
919 : *
920 : * This is called during either normal or standalone backend start.
921 : * It is *not* called in the postmaster.
922 : *
923 : * This is separate from InitFileAccess() because temporary file cleanup can
924 : * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
925 : * our reporting has to happen before that. Low level file access should be
926 : * available for longer, hence the separate initialization / shutdown of
927 : * temporary file handling.
928 : */
929 : void
930 42720 : InitTemporaryFileAccess(void)
931 : {
932 : Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
933 : Assert(!temporary_files_allowed); /* call me only once */
934 :
935 : /*
936 : * Register before-shmem-exit hook to ensure temp files are dropped while
937 : * we can still report stats.
938 : */
939 42720 : before_shmem_exit(BeforeShmemExit_Files, 0);
940 :
941 : #ifdef USE_ASSERT_CHECKING
942 : temporary_files_allowed = true;
943 : #endif
944 42720 : }
945 :
946 : /*
947 : * count_usable_fds --- count how many FDs the system will let us open,
948 : * and estimate how many are already open.
949 : *
950 : * We stop counting if usable_fds reaches max_to_probe. Note: a small
951 : * value of max_to_probe might result in an underestimate of already_open;
952 : * we must fill in any "gaps" in the set of used FDs before the calculation
953 : * of already_open will give the right answer. In practice, max_to_probe
954 : * of a couple of dozen should be enough to ensure good results.
955 : *
956 : * We assume stderr (FD 2) is available for dup'ing. While the calling
957 : * script could theoretically close that, it would be a really bad idea,
958 : * since then one risks loss of error messages from, e.g., libc.
959 : */
960 : static void
961 2116 : count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
962 : {
963 : int *fd;
964 : int size;
965 2116 : int used = 0;
966 2116 : int highestfd = 0;
967 : int j;
968 :
969 : #ifdef HAVE_GETRLIMIT
970 : struct rlimit rlim;
971 : int getrlimit_status;
972 : #endif
973 :
974 2116 : size = 1024;
975 2116 : fd = (int *) palloc(size * sizeof(int));
976 :
977 : #ifdef HAVE_GETRLIMIT
978 2116 : getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
979 2116 : if (getrlimit_status != 0)
980 0 : ereport(WARNING, (errmsg("getrlimit failed: %m")));
981 : #endif /* HAVE_GETRLIMIT */
982 :
983 : /* dup until failure or probe limit reached */
984 : for (;;)
985 2113884 : {
986 : int thisfd;
987 :
988 : #ifdef HAVE_GETRLIMIT
989 :
990 : /*
991 : * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
992 : * some platforms
993 : */
994 2116000 : if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
995 0 : break;
996 : #endif
997 :
998 2116000 : thisfd = dup(2);
999 2116000 : if (thisfd < 0)
1000 : {
1001 : /* Expect EMFILE or ENFILE, else it's fishy */
1002 0 : if (errno != EMFILE && errno != ENFILE)
1003 0 : elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1004 0 : break;
1005 : }
1006 :
1007 2116000 : if (used >= size)
1008 : {
1009 0 : size *= 2;
1010 0 : fd = (int *) repalloc(fd, size * sizeof(int));
1011 : }
1012 2116000 : fd[used++] = thisfd;
1013 :
1014 2116000 : if (highestfd < thisfd)
1015 2116000 : highestfd = thisfd;
1016 :
1017 2116000 : if (used >= max_to_probe)
1018 2116 : break;
1019 : }
1020 :
1021 : /* release the files we opened */
1022 2118116 : for (j = 0; j < used; j++)
1023 2116000 : close(fd[j]);
1024 :
1025 2116 : pfree(fd);
1026 :
1027 : /*
1028 : * Return results. usable_fds is just the number of successful dups. We
1029 : * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1030 : * number) and so already_open is highestfd+1 - usable_fds.
1031 : */
1032 2116 : *usable_fds = used;
1033 2116 : *already_open = highestfd + 1 - used;
1034 2116 : }
1035 :
1036 : /*
1037 : * set_max_safe_fds
1038 : * Determine number of file descriptors that fd.c is allowed to use
1039 : */
1040 : void
1041 2116 : set_max_safe_fds(void)
1042 : {
1043 : int usable_fds;
1044 : int already_open;
1045 :
1046 : /*----------
1047 : * We want to set max_safe_fds to
1048 : * MIN(usable_fds, max_files_per_process)
1049 : * less the slop factor for files that are opened without consulting
1050 : * fd.c. This ensures that we won't allow to open more than
1051 : * max_files_per_process, or the experimentally-determined EMFILE limit,
1052 : * additional files.
1053 : *----------
1054 : */
1055 2116 : count_usable_fds(max_files_per_process,
1056 : &usable_fds, &already_open);
1057 :
1058 2116 : max_safe_fds = Min(usable_fds, max_files_per_process);
1059 :
1060 : /*
1061 : * Take off the FDs reserved for system() etc.
1062 : */
1063 2116 : max_safe_fds -= NUM_RESERVED_FDS;
1064 :
1065 : /*
1066 : * Make sure we still have enough to get by.
1067 : */
1068 2116 : if (max_safe_fds < FD_MINFREE)
1069 0 : ereport(FATAL,
1070 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1071 : errmsg("insufficient file descriptors available to start server process"),
1072 : errdetail("System allows %d, server needs at least %d, %d files are already open.",
1073 : max_safe_fds + NUM_RESERVED_FDS,
1074 : FD_MINFREE + NUM_RESERVED_FDS,
1075 : already_open)));
1076 :
1077 2116 : elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1078 : max_safe_fds, usable_fds, already_open);
1079 2116 : }
1080 :
1081 : /*
1082 : * Open a file with BasicOpenFilePerm() and pass default file mode for the
1083 : * fileMode parameter.
1084 : */
1085 : int
1086 66474 : BasicOpenFile(const char *fileName, int fileFlags)
1087 : {
1088 66474 : return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1089 : }
1090 :
1091 : /*
1092 : * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1093 : *
1094 : * This is exported for use by places that really want a plain kernel FD,
1095 : * but need to be proof against running out of FDs. Once an FD has been
1096 : * successfully returned, it is the caller's responsibility to ensure that
1097 : * it will not be leaked on ereport()! Most users should *not* call this
1098 : * routine directly, but instead use the VFD abstraction level, which
1099 : * provides protection against descriptor leaks as well as management of
1100 : * files that need to be open for more than a short period of time.
1101 : *
1102 : * Ideally this should be the *only* direct call of open() in the backend.
1103 : * In practice, the postmaster calls open() directly, and there are some
1104 : * direct open() calls done early in backend startup. Those are OK since
1105 : * this module wouldn't have any open files to close at that point anyway.
1106 : */
1107 : int
1108 18640190 : BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1109 : {
1110 : int fd;
1111 :
1112 18640190 : tryAgain:
1113 : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1114 :
1115 : /*
1116 : * The value we defined to stand in for O_DIRECT when simulating it with
1117 : * F_NOCACHE had better not collide with any of the standard flags.
1118 : */
1119 : StaticAssertStmt((PG_O_DIRECT &
1120 : (O_APPEND |
1121 : O_CLOEXEC |
1122 : O_CREAT |
1123 : O_DSYNC |
1124 : O_EXCL |
1125 : O_RDWR |
1126 : O_RDONLY |
1127 : O_SYNC |
1128 : O_TRUNC |
1129 : O_WRONLY)) == 0,
1130 : "PG_O_DIRECT value collides with standard flag");
1131 : fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1132 : #else
1133 18640190 : fd = open(fileName, fileFlags, fileMode);
1134 : #endif
1135 :
1136 18640190 : if (fd >= 0)
1137 : {
1138 : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1139 : if (fileFlags & PG_O_DIRECT)
1140 : {
1141 : if (fcntl(fd, F_NOCACHE, 1) < 0)
1142 : {
1143 : int save_errno = errno;
1144 :
1145 : close(fd);
1146 : errno = save_errno;
1147 : return -1;
1148 : }
1149 : }
1150 : #endif
1151 :
1152 17696916 : return fd; /* success! */
1153 : }
1154 :
1155 943274 : if (errno == EMFILE || errno == ENFILE)
1156 : {
1157 0 : int save_errno = errno;
1158 :
1159 0 : ereport(LOG,
1160 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1161 : errmsg("out of file descriptors: %m; release and retry")));
1162 0 : errno = 0;
1163 0 : if (ReleaseLruFile())
1164 0 : goto tryAgain;
1165 0 : errno = save_errno;
1166 : }
1167 :
1168 943274 : return -1; /* failure */
1169 : }
1170 :
1171 : /*
1172 : * AcquireExternalFD - attempt to reserve an external file descriptor
1173 : *
1174 : * This should be used by callers that need to hold a file descriptor open
1175 : * over more than a short interval, but cannot use any of the other facilities
1176 : * provided by this module.
1177 : *
1178 : * The difference between this and the underlying ReserveExternalFD function
1179 : * is that this will report failure (by setting errno and returning false)
1180 : * if "too many" external FDs are already reserved. This should be used in
1181 : * any code where the total number of FDs to be reserved is not predictable
1182 : * and small.
1183 : */
1184 : bool
1185 311056 : AcquireExternalFD(void)
1186 : {
1187 : /*
1188 : * We don't want more than max_safe_fds / 3 FDs to be consumed for
1189 : * "external" FDs.
1190 : */
1191 311056 : if (numExternalFDs < max_safe_fds / 3)
1192 : {
1193 311056 : ReserveExternalFD();
1194 311056 : return true;
1195 : }
1196 0 : errno = EMFILE;
1197 0 : return false;
1198 : }
1199 :
1200 : /*
1201 : * ReserveExternalFD - report external consumption of a file descriptor
1202 : *
1203 : * This should be used by callers that need to hold a file descriptor open
1204 : * over more than a short interval, but cannot use any of the other facilities
1205 : * provided by this module. This just tracks the use of the FD and closes
1206 : * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1207 : *
1208 : * Call this directly only in code where failure to reserve the FD would be
1209 : * fatal; for example, the WAL-writing code does so, since the alternative is
1210 : * session failure. Also, it's very unwise to do so in code that could
1211 : * consume more than one FD per process.
1212 : *
1213 : * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1214 : * available, it doesn't matter too much whether this is called before or
1215 : * after actually opening the FD; but doing so beforehand reduces the risk of
1216 : * an EMFILE failure if not everybody played nice. In any case, it's solely
1217 : * caller's responsibility to keep the external-FD count in sync with reality.
1218 : */
1219 : void
1220 461520 : ReserveExternalFD(void)
1221 : {
1222 : /*
1223 : * Release VFDs if needed to stay safe. Because we do this before
1224 : * incrementing numExternalFDs, the final state will be as desired, i.e.,
1225 : * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1226 : */
1227 461520 : ReleaseLruFiles();
1228 :
1229 461520 : numExternalFDs++;
1230 461520 : }
1231 :
1232 : /*
1233 : * ReleaseExternalFD - report release of an external file descriptor
1234 : *
1235 : * This is guaranteed not to change errno, so it can be used in failure paths.
1236 : */
1237 : void
1238 424144 : ReleaseExternalFD(void)
1239 : {
1240 : Assert(numExternalFDs > 0);
1241 424144 : numExternalFDs--;
1242 424144 : }
1243 :
1244 :
#if defined(FDDEBUG)

/*
 * Debugging aid: log the LRU ring as one line, walking the lruLessRecently
 * links from the header entry until the walk arrives back at slot 0.
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	int			cur = VfdCache[0].lruLessRecently;
	size_t		len;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);
	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;
		len = strlen(buf);
		snprintf(buf + len, sizeof(buf) - len, "%d ", cur);
	}
	len = strlen(buf);
	snprintf(buf + len, sizeof(buf) - len, "LEAST");
	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1265 :
1266 : static void
1267 2644358 : Delete(File file)
1268 : {
1269 : Vfd *vfdP;
1270 :
1271 : Assert(file != 0);
1272 :
1273 : DO_DB(elog(LOG, "Delete %d (%s)",
1274 : file, VfdCache[file].fileName));
1275 : DO_DB(_dump_lru());
1276 :
1277 2644358 : vfdP = &VfdCache[file];
1278 :
1279 2644358 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1280 2644358 : VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1281 :
1282 : DO_DB(_dump_lru());
1283 2644358 : }
1284 :
1285 : static void
1286 8046 : LruDelete(File file)
1287 : {
1288 : Vfd *vfdP;
1289 :
1290 : Assert(file != 0);
1291 :
1292 : DO_DB(elog(LOG, "LruDelete %d (%s)",
1293 : file, VfdCache[file].fileName));
1294 :
1295 8046 : vfdP = &VfdCache[file];
1296 :
1297 8046 : pgaio_closing_fd(vfdP->fd);
1298 :
1299 : /*
1300 : * Close the file. We aren't expecting this to fail; if it does, better
1301 : * to leak the FD than to mess up our internal state.
1302 : */
1303 8046 : if (close(vfdP->fd) != 0)
1304 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1305 : "could not close file \"%s\": %m", vfdP->fileName);
1306 8046 : vfdP->fd = VFD_CLOSED;
1307 8046 : --nfile;
1308 :
1309 : /* delete the vfd record from the LRU ring */
1310 8046 : Delete(file);
1311 8046 : }
1312 :
1313 : static void
1314 3738936 : Insert(File file)
1315 : {
1316 : Vfd *vfdP;
1317 :
1318 : Assert(file != 0);
1319 :
1320 : DO_DB(elog(LOG, "Insert %d (%s)",
1321 : file, VfdCache[file].fileName));
1322 : DO_DB(_dump_lru());
1323 :
1324 3738936 : vfdP = &VfdCache[file];
1325 :
1326 3738936 : vfdP->lruMoreRecently = 0;
1327 3738936 : vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1328 3738936 : VfdCache[0].lruLessRecently = file;
1329 3738936 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1330 :
1331 : DO_DB(_dump_lru());
1332 3738936 : }
1333 :
1334 : /* returns 0 on success, -1 on re-open failure (with errno set) */
1335 : static int
1336 112 : LruInsert(File file)
1337 : {
1338 : Vfd *vfdP;
1339 :
1340 : Assert(file != 0);
1341 :
1342 : DO_DB(elog(LOG, "LruInsert %d (%s)",
1343 : file, VfdCache[file].fileName));
1344 :
1345 112 : vfdP = &VfdCache[file];
1346 :
1347 112 : if (FileIsNotOpen(file))
1348 : {
1349 : /* Close excess kernel FDs. */
1350 112 : ReleaseLruFiles();
1351 :
1352 : /*
1353 : * The open could still fail for lack of file descriptors, eg due to
1354 : * overall system file table being full. So, be prepared to release
1355 : * another FD if necessary...
1356 : */
1357 112 : vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1358 : vfdP->fileMode);
1359 112 : if (vfdP->fd < 0)
1360 : {
1361 : DO_DB(elog(LOG, "re-open failed: %m"));
1362 0 : return -1;
1363 : }
1364 : else
1365 : {
1366 112 : ++nfile;
1367 : }
1368 : }
1369 :
1370 : /*
1371 : * put it at the head of the Lru ring
1372 : */
1373 :
1374 112 : Insert(file);
1375 :
1376 112 : return 0;
1377 : }
1378 :
1379 : /*
1380 : * Release one kernel FD by closing the least-recently-used VFD.
1381 : */
1382 : static bool
1383 7800 : ReleaseLruFile(void)
1384 : {
1385 : DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1386 :
1387 7800 : if (nfile > 0)
1388 : {
1389 : /*
1390 : * There are opened files and so there should be at least one used vfd
1391 : * in the ring.
1392 : */
1393 : Assert(VfdCache[0].lruMoreRecently != 0);
1394 7800 : LruDelete(VfdCache[0].lruMoreRecently);
1395 7800 : return true; /* freed a file */
1396 : }
1397 0 : return false; /* no files available to free */
1398 : }
1399 :
1400 : /*
1401 : * Release kernel FDs as needed to get under the max_safe_fds limit.
1402 : * After calling this, it's OK to try to open another file.
1403 : */
1404 : static void
1405 19293836 : ReleaseLruFiles(void)
1406 : {
1407 19301636 : while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
1408 : {
1409 7800 : if (!ReleaseLruFile())
1410 0 : break;
1411 : }
1412 19293836 : }
1413 :
1414 : static File
1415 3147412 : AllocateVfd(void)
1416 : {
1417 : Index i;
1418 : File file;
1419 :
1420 : DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1421 :
1422 : Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1423 :
1424 3147412 : if (VfdCache[0].nextFree == 0)
1425 : {
1426 : /*
1427 : * The free list is empty so it is time to increase the size of the
1428 : * array. We choose to double it each time this happens. However,
1429 : * there's not much point in starting *real* small.
1430 : */
1431 54368 : Size newCacheSize = SizeVfdCache * 2;
1432 : Vfd *newVfdCache;
1433 :
1434 54368 : if (newCacheSize < 32)
1435 36644 : newCacheSize = 32;
1436 :
1437 : /*
1438 : * Be careful not to clobber VfdCache ptr if realloc fails.
1439 : */
1440 54368 : newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1441 54368 : if (newVfdCache == NULL)
1442 0 : ereport(ERROR,
1443 : (errcode(ERRCODE_OUT_OF_MEMORY),
1444 : errmsg("out of memory")));
1445 54368 : VfdCache = newVfdCache;
1446 :
1447 : /*
1448 : * Initialize the new entries and link them into the free list.
1449 : */
1450 2730812 : for (i = SizeVfdCache; i < newCacheSize; i++)
1451 : {
1452 21411552 : MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
1453 2676444 : VfdCache[i].nextFree = i + 1;
1454 2676444 : VfdCache[i].fd = VFD_CLOSED;
1455 : }
1456 54368 : VfdCache[newCacheSize - 1].nextFree = 0;
1457 54368 : VfdCache[0].nextFree = SizeVfdCache;
1458 :
1459 : /*
1460 : * Record the new size
1461 : */
1462 54368 : SizeVfdCache = newCacheSize;
1463 : }
1464 :
1465 3147412 : file = VfdCache[0].nextFree;
1466 :
1467 3147412 : VfdCache[0].nextFree = VfdCache[file].nextFree;
1468 :
1469 3147412 : return file;
1470 : }
1471 :
1472 : static void
1473 2047010 : FreeVfd(File file)
1474 : {
1475 2047010 : Vfd *vfdP = &VfdCache[file];
1476 :
1477 : DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1478 : file, vfdP->fileName ? vfdP->fileName : ""));
1479 :
1480 2047010 : if (vfdP->fileName != NULL)
1481 : {
1482 1116854 : free(vfdP->fileName);
1483 1116854 : vfdP->fileName = NULL;
1484 : }
1485 2047010 : vfdP->fdstate = 0x0;
1486 :
1487 2047010 : vfdP->nextFree = VfdCache[0].nextFree;
1488 2047010 : VfdCache[0].nextFree = file;
1489 2047010 : }
1490 :
1491 : /* returns 0 on success, -1 on re-open failure (with errno set) */
1492 : static int
1493 5992370 : FileAccess(File file)
1494 : {
1495 : int returnValue;
1496 :
1497 : DO_DB(elog(LOG, "FileAccess %d (%s)",
1498 : file, VfdCache[file].fileName));
1499 :
1500 : /*
1501 : * Is the file open? If not, open it and put it at the head of the LRU
1502 : * ring (possibly closing the least recently used file to get an FD).
1503 : */
1504 :
1505 5992370 : if (FileIsNotOpen(file))
1506 : {
1507 112 : returnValue = LruInsert(file);
1508 112 : if (returnValue != 0)
1509 0 : return returnValue;
1510 : }
1511 5992258 : else if (VfdCache[0].lruLessRecently != file)
1512 : {
1513 : /*
1514 : * We now know that the file is open and that it is not the last one
1515 : * accessed, so we need to move it to the head of the Lru ring.
1516 : */
1517 :
1518 1521568 : Delete(file);
1519 1521568 : Insert(file);
1520 : }
1521 :
1522 5992370 : return 0;
1523 : }
1524 :
1525 : /*
1526 : * Called whenever a temporary file is deleted to report its size.
1527 : */
1528 : static void
1529 4424 : ReportTemporaryFileUsage(const char *path, off_t size)
1530 : {
1531 4424 : pgstat_report_tempfile(size);
1532 :
1533 4424 : if (log_temp_files >= 0)
1534 : {
1535 1460 : if ((size / 1024) >= log_temp_files)
1536 224 : ereport(LOG,
1537 : (errmsg("temporary file: path \"%s\", size %lu",
1538 : path, (unsigned long) size)));
1539 : }
1540 4424 : }
1541 :
1542 : /*
1543 : * Called to register a temporary file for automatic close.
1544 : * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1545 : * before the file was opened.
1546 : */
1547 : static void
1548 7350 : RegisterTemporaryFile(File file)
1549 : {
1550 7350 : ResourceOwnerRememberFile(CurrentResourceOwner, file);
1551 7350 : VfdCache[file].resowner = CurrentResourceOwner;
1552 :
1553 : /* Backup mechanism for closing at end of xact. */
1554 7350 : VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1555 7350 : have_xact_temporary_files = true;
1556 7350 : }
1557 :
/*
 * Called when we get a shared invalidation message on some relation.
 *
 * If the file's kernel FD is currently open, it is closed via LruDelete().
 * (Compiled only when NOT_USED is defined.)
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;					/* nothing to close */

	LruDelete(file);
}
#endif
1570 :
1571 : /*
1572 : * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1573 : * fileMode parameter.
1574 : */
1575 : File
1576 3147412 : PathNameOpenFile(const char *fileName, int fileFlags)
1577 : {
1578 3147412 : return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1579 : }
1580 :
1581 : /*
1582 : * open a file in an arbitrary directory
1583 : *
1584 : * NB: if the passed pathname is relative (which it usually is),
1585 : * it will be interpreted relative to the process' working directory
1586 : * (which should always be $PGDATA when this code is running).
1587 : */
1588 : File
1589 3147412 : PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1590 : {
1591 : char *fnamecopy;
1592 : File file;
1593 : Vfd *vfdP;
1594 :
1595 : DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1596 : fileName, fileFlags, fileMode));
1597 :
1598 : /*
1599 : * We need a malloc'd copy of the file name; fail cleanly if no room.
1600 : */
1601 3147412 : fnamecopy = strdup(fileName);
1602 3147412 : if (fnamecopy == NULL)
1603 0 : ereport(ERROR,
1604 : (errcode(ERRCODE_OUT_OF_MEMORY),
1605 : errmsg("out of memory")));
1606 :
1607 3147412 : file = AllocateVfd();
1608 3147412 : vfdP = &VfdCache[file];
1609 :
1610 : /* Close excess kernel FDs. */
1611 3147412 : ReleaseLruFiles();
1612 :
1613 : /*
1614 : * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1615 : * client shouldn't be expected to know which kernel descriptors are
1616 : * currently open, so it wouldn't make sense for them to be inherited by
1617 : * executed subprograms.
1618 : */
1619 3147412 : fileFlags |= O_CLOEXEC;
1620 :
1621 3147412 : vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1622 :
1623 3147412 : if (vfdP->fd < 0)
1624 : {
1625 930156 : int save_errno = errno;
1626 :
1627 930156 : FreeVfd(file);
1628 930156 : free(fnamecopy);
1629 930156 : errno = save_errno;
1630 930156 : return -1;
1631 : }
1632 2217256 : ++nfile;
1633 : DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1634 : vfdP->fd));
1635 :
1636 2217256 : vfdP->fileName = fnamecopy;
1637 : /* Saved flags are adjusted to be OK for re-opening file */
1638 2217256 : vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1639 2217256 : vfdP->fileMode = fileMode;
1640 2217256 : vfdP->fileSize = 0;
1641 2217256 : vfdP->fdstate = 0x0;
1642 2217256 : vfdP->resowner = NULL;
1643 :
1644 2217256 : Insert(file);
1645 :
1646 2217256 : return file;
1647 : }
1648 :
1649 : /*
1650 : * Create directory 'directory'. If necessary, create 'basedir', which must
1651 : * be the directory above it. This is designed for creating the top-level
1652 : * temporary directory on demand before creating a directory underneath it.
1653 : * Do nothing if the directory already exists.
1654 : *
1655 : * Directories created within the top-level temporary directory should begin
1656 : * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1657 : * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1658 : * that do not need any particular prefix.
1659 : */
1660 : void
1661 352 : PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1662 : {
1663 352 : if (MakePGDirectory(directory) < 0)
1664 : {
1665 34 : if (errno == EEXIST)
1666 12 : return;
1667 :
1668 : /*
1669 : * Failed. Try to create basedir first in case it's missing. Tolerate
1670 : * EEXIST to close a race against another process following the same
1671 : * algorithm.
1672 : */
1673 22 : if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1674 0 : ereport(ERROR,
1675 : (errcode_for_file_access(),
1676 : errmsg("cannot create temporary directory \"%s\": %m",
1677 : basedir)));
1678 :
1679 : /* Try again. */
1680 22 : if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1681 0 : ereport(ERROR,
1682 : (errcode_for_file_access(),
1683 : errmsg("cannot create temporary subdirectory \"%s\": %m",
1684 : directory)));
1685 : }
1686 : }
1687 :
1688 : /*
1689 : * Delete a directory and everything in it, if it exists.
1690 : */
1691 : void
1692 418 : PathNameDeleteTemporaryDir(const char *dirname)
1693 : {
1694 : struct stat statbuf;
1695 :
1696 : /* Silently ignore missing directory. */
1697 418 : if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1698 80 : return;
1699 :
1700 : /*
1701 : * Currently, walkdir doesn't offer a way for our passed in function to
1702 : * maintain state. Perhaps it should, so that we could tell the caller
1703 : * whether this operation succeeded or failed. Since this operation is
1704 : * used in a cleanup path, we wouldn't actually behave differently: we'll
1705 : * just log failures.
1706 : */
1707 338 : walkdir(dirname, unlink_if_exists_fname, false, LOG);
1708 : }
1709 :
1710 : /*
1711 : * Open a temporary file that will disappear when we close it.
1712 : *
1713 : * This routine takes care of generating an appropriate tempfile name.
1714 : * There's no need to pass in fileFlags or fileMode either, since only
1715 : * one setting makes any sense for a temp file.
1716 : *
1717 : * Unless interXact is true, the file is remembered by CurrentResourceOwner
1718 : * to ensure it's closed and deleted when it's no longer needed, typically at
1719 : * the end-of-transaction. In most cases, you don't want temporary files to
1720 : * outlive the transaction that created them, so this should be false -- but
1721 : * if you need "somewhat" temporary storage, this might be useful. In either
1722 : * case, the file is removed when the File is explicitly closed.
1723 : */
1724 : File
1725 2414 : OpenTemporaryFile(bool interXact)
1726 : {
1727 2414 : File file = 0;
1728 :
1729 : Assert(temporary_files_allowed); /* check temp file access is up */
1730 :
1731 : /*
1732 : * Make sure the current resource owner has space for this File before we
1733 : * open it, if we'll be registering it below.
1734 : */
1735 2414 : if (!interXact)
1736 2414 : ResourceOwnerEnlarge(CurrentResourceOwner);
1737 :
1738 : /*
1739 : * If some temp tablespace(s) have been given to us, try to use the next
1740 : * one. If a given tablespace can't be found, we silently fall back to
1741 : * the database's default tablespace.
1742 : *
1743 : * BUT: if the temp file is slated to outlive the current transaction,
1744 : * force it into the database's default tablespace, so that it will not
1745 : * pose a threat to possible tablespace drop attempts.
1746 : */
1747 2414 : if (numTempTableSpaces > 0 && !interXact)
1748 : {
1749 2 : Oid tblspcOid = GetNextTempTableSpace();
1750 :
1751 2 : if (OidIsValid(tblspcOid))
1752 2 : file = OpenTemporaryFileInTablespace(tblspcOid, false);
1753 : }
1754 :
1755 : /*
1756 : * If not, or if tablespace is bad, create in database's default
1757 : * tablespace. MyDatabaseTableSpace should normally be set before we get
1758 : * here, but just in case it isn't, fall back to pg_default tablespace.
1759 : */
1760 2414 : if (file <= 0)
1761 2412 : file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1762 : MyDatabaseTableSpace :
1763 : DEFAULTTABLESPACE_OID,
1764 : true);
1765 :
1766 : /* Mark it for deletion at close and temporary file size limit */
1767 2414 : VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1768 :
1769 : /* Register it with the current resource owner */
1770 2414 : if (!interXact)
1771 2414 : RegisterTemporaryFile(file);
1772 :
1773 2414 : return file;
1774 : }
1775 :
1776 : /*
1777 : * Return the path of the temp directory in a given tablespace.
1778 : */
1779 : void
1780 14736 : TempTablespacePath(char *path, Oid tablespace)
1781 : {
1782 : /*
1783 : * Identify the tempfile directory for this tablespace.
1784 : *
1785 : * If someone tries to specify pg_global, use pg_default instead.
1786 : */
1787 14736 : if (tablespace == InvalidOid ||
1788 2 : tablespace == DEFAULTTABLESPACE_OID ||
1789 : tablespace == GLOBALTABLESPACE_OID)
1790 14734 : snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1791 : else
1792 : {
1793 : /* All other tablespaces are accessed via symlinks */
1794 2 : snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1795 : PG_TBLSPC_DIR, tablespace, TABLESPACE_VERSION_DIRECTORY,
1796 : PG_TEMP_FILES_DIR);
1797 : }
1798 14736 : }
1799 :
1800 : /*
1801 : * Open a temporary file in a specific tablespace.
1802 : * Subroutine for OpenTemporaryFile, which see for details.
1803 : */
1804 : static File
1805 2414 : OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1806 : {
1807 : char tempdirpath[MAXPGPATH];
1808 : char tempfilepath[MAXPGPATH];
1809 : File file;
1810 :
1811 2414 : TempTablespacePath(tempdirpath, tblspcOid);
1812 :
1813 : /*
1814 : * Generate a tempfile name that should be unique within the current
1815 : * database instance.
1816 : */
1817 2414 : snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1818 : tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1819 :
1820 : /*
1821 : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1822 : * temp file that can be reused.
1823 : */
1824 2414 : file = PathNameOpenFile(tempfilepath,
1825 : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1826 2414 : if (file <= 0)
1827 : {
1828 : /*
1829 : * We might need to create the tablespace's tempfile directory, if no
1830 : * one has yet done so.
1831 : *
1832 : * Don't check for an error from MakePGDirectory; it could fail if
1833 : * someone else just did the same thing. If it doesn't work then
1834 : * we'll bomb out on the second create attempt, instead.
1835 : */
1836 186 : (void) MakePGDirectory(tempdirpath);
1837 :
1838 186 : file = PathNameOpenFile(tempfilepath,
1839 : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1840 186 : if (file <= 0 && rejectError)
1841 0 : elog(ERROR, "could not create temporary file \"%s\": %m",
1842 : tempfilepath);
1843 : }
1844 :
1845 2414 : return file;
1846 : }
1847 :
1848 :
1849 : /*
1850 : * Create a new file. The directory containing it must already exist. Files
1851 : * created this way are subject to temp_file_limit and are automatically
1852 : * closed at end of transaction, but are not automatically deleted on close
1853 : * because they are intended to be shared between cooperating backends.
1854 : *
1855 : * If the file is inside the top-level temporary directory, its name should
1856 : * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1857 : * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1858 : * inside a directory created with PathNameCreateTemporaryDir(), in which case
1859 : * the prefix isn't needed.
1860 : */
1861 : File
1862 2362 : PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1863 : {
1864 : File file;
1865 :
1866 : Assert(temporary_files_allowed); /* check temp file access is up */
1867 :
1868 2362 : ResourceOwnerEnlarge(CurrentResourceOwner);
1869 :
1870 : /*
1871 : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1872 : * temp file that can be reused.
1873 : */
1874 2362 : file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1875 2362 : if (file <= 0)
1876 : {
1877 352 : if (error_on_failure)
1878 0 : ereport(ERROR,
1879 : (errcode_for_file_access(),
1880 : errmsg("could not create temporary file \"%s\": %m",
1881 : path)));
1882 : else
1883 352 : return file;
1884 : }
1885 :
1886 : /* Mark it for temp_file_limit accounting. */
1887 2010 : VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1888 :
1889 : /* Register it for automatic close. */
1890 2010 : RegisterTemporaryFile(file);
1891 :
1892 2010 : return file;
1893 : }
1894 :
1895 : /*
1896 : * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1897 : * another backend. Files opened this way don't count against the
1898 : * temp_file_limit of the caller, are automatically closed at the end of the
1899 : * transaction but are not deleted on close.
1900 : */
1901 : File
1902 6398 : PathNameOpenTemporaryFile(const char *path, int mode)
1903 : {
1904 : File file;
1905 :
1906 : Assert(temporary_files_allowed); /* check temp file access is up */
1907 :
1908 6398 : ResourceOwnerEnlarge(CurrentResourceOwner);
1909 :
1910 6398 : file = PathNameOpenFile(path, mode | PG_BINARY);
1911 :
1912 : /* If no such file, then we don't raise an error. */
1913 6398 : if (file <= 0 && errno != ENOENT)
1914 0 : ereport(ERROR,
1915 : (errcode_for_file_access(),
1916 : errmsg("could not open temporary file \"%s\": %m",
1917 : path)));
1918 :
1919 6398 : if (file > 0)
1920 : {
1921 : /* Register it for automatic close. */
1922 2926 : RegisterTemporaryFile(file);
1923 : }
1924 :
1925 6398 : return file;
1926 : }
1927 :
1928 : /*
1929 : * Delete a file by pathname. Return true if the file existed, false if
1930 : * didn't.
1931 : */
1932 : bool
1933 4724 : PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1934 : {
1935 : struct stat filestats;
1936 : int stat_errno;
1937 :
1938 : /* Get the final size for pgstat reporting. */
1939 4724 : if (stat(path, &filestats) != 0)
1940 2714 : stat_errno = errno;
1941 : else
1942 2010 : stat_errno = 0;
1943 :
1944 : /*
1945 : * Unlike FileClose's automatic file deletion code, we tolerate
1946 : * non-existence to support BufFileDeleteFileSet which doesn't know how
1947 : * many segments it has to delete until it runs out.
1948 : */
1949 4724 : if (stat_errno == ENOENT)
1950 2714 : return false;
1951 :
1952 2010 : if (unlink(path) < 0)
1953 : {
1954 0 : if (errno != ENOENT)
1955 0 : ereport(error_on_failure ? ERROR : LOG,
1956 : (errcode_for_file_access(),
1957 : errmsg("could not unlink temporary file \"%s\": %m",
1958 : path)));
1959 0 : return false;
1960 : }
1961 :
1962 2010 : if (stat_errno == 0)
1963 2010 : ReportTemporaryFileUsage(path, filestats.st_size);
1964 : else
1965 : {
1966 0 : errno = stat_errno;
1967 0 : ereport(LOG,
1968 : (errcode_for_file_access(),
1969 : errmsg("could not stat file \"%s\": %m", path)));
1970 : }
1971 :
1972 2010 : return true;
1973 : }
1974 :
1975 : /*
1976 : * close a file when done with it
1977 : */
1978 : void
1979 1116854 : FileClose(File file)
1980 : {
1981 : Vfd *vfdP;
1982 :
1983 : Assert(FileIsValid(file));
1984 :
1985 : DO_DB(elog(LOG, "FileClose: %d (%s)",
1986 : file, VfdCache[file].fileName));
1987 :
1988 1116854 : vfdP = &VfdCache[file];
1989 :
1990 1116854 : if (!FileIsNotOpen(file))
1991 : {
1992 1114744 : pgaio_closing_fd(vfdP->fd);
1993 :
1994 : /* close the file */
1995 1114744 : if (close(vfdP->fd) != 0)
1996 : {
1997 : /*
1998 : * We may need to panic on failure to close non-temporary files;
1999 : * see LruDelete.
2000 : */
2001 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
2002 : "could not close file \"%s\": %m", vfdP->fileName);
2003 : }
2004 :
2005 1114744 : --nfile;
2006 1114744 : vfdP->fd = VFD_CLOSED;
2007 :
2008 : /* remove the file from the lru ring */
2009 1114744 : Delete(file);
2010 : }
2011 :
2012 1116854 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2013 : {
2014 : /* Subtract its size from current usage (do first in case of error) */
2015 4424 : temporary_files_size -= vfdP->fileSize;
2016 4424 : vfdP->fileSize = 0;
2017 : }
2018 :
2019 : /*
2020 : * Delete the file if it was temporary, and make a log entry if wanted
2021 : */
2022 1116854 : if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2023 : {
2024 : struct stat filestats;
2025 : int stat_errno;
2026 :
2027 : /*
2028 : * If we get an error, as could happen within the ereport/elog calls,
2029 : * we'll come right back here during transaction abort. Reset the
2030 : * flag to ensure that we can't get into an infinite loop. This code
2031 : * is arranged to ensure that the worst-case consequence is failing to
2032 : * emit log message(s), not failing to attempt the unlink.
2033 : */
2034 2414 : vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2035 :
2036 :
2037 : /* first try the stat() */
2038 2414 : if (stat(vfdP->fileName, &filestats))
2039 0 : stat_errno = errno;
2040 : else
2041 2414 : stat_errno = 0;
2042 :
2043 : /* in any case do the unlink */
2044 2414 : if (unlink(vfdP->fileName))
2045 0 : ereport(LOG,
2046 : (errcode_for_file_access(),
2047 : errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2048 :
2049 : /* and last report the stat results */
2050 2414 : if (stat_errno == 0)
2051 2414 : ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2052 : else
2053 : {
2054 0 : errno = stat_errno;
2055 0 : ereport(LOG,
2056 : (errcode_for_file_access(),
2057 : errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2058 : }
2059 : }
2060 :
2061 : /* Unregister it from the resource owner */
2062 1116854 : if (vfdP->resowner)
2063 7342 : ResourceOwnerForgetFile(vfdP->resowner, file);
2064 :
2065 : /*
2066 : * Return the Vfd slot to the free list
2067 : */
2068 1116854 : FreeVfd(file);
2069 1116854 : }
2070 :
2071 : /*
2072 : * FilePrefetch - initiate asynchronous read of a given range of the file.
2073 : *
2074 : * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2075 : *
2076 : * posix_fadvise() is the simplest standardized interface that accomplishes
2077 : * this.
2078 : */
2079 : int
2080 16914 : FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
2081 : {
2082 : Assert(FileIsValid(file));
2083 :
2084 : DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2085 : file, VfdCache[file].fileName,
2086 : (int64) offset, (int64) amount));
2087 :
2088 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2089 : {
2090 : int returnCode;
2091 :
2092 16914 : returnCode = FileAccess(file);
2093 16914 : if (returnCode < 0)
2094 0 : return returnCode;
2095 :
2096 16914 : retry:
2097 16914 : pgstat_report_wait_start(wait_event_info);
2098 16914 : returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2099 : POSIX_FADV_WILLNEED);
2100 16914 : pgstat_report_wait_end();
2101 :
2102 16914 : if (returnCode == EINTR)
2103 0 : goto retry;
2104 :
2105 16914 : return returnCode;
2106 : }
2107 : #elif defined(__darwin__)
2108 : {
2109 : struct radvisory
2110 : {
2111 : off_t ra_offset; /* offset into the file */
2112 : int ra_count; /* size of the read */
2113 : } ra;
2114 : int returnCode;
2115 :
2116 : returnCode = FileAccess(file);
2117 : if (returnCode < 0)
2118 : return returnCode;
2119 :
2120 : ra.ra_offset = offset;
2121 : ra.ra_count = amount;
2122 : pgstat_report_wait_start(wait_event_info);
2123 : returnCode = fcntl(VfdCache[file].fd, F_RDADVISE, &ra);
2124 : pgstat_report_wait_end();
2125 : if (returnCode != -1)
2126 : return 0;
2127 : else
2128 : return errno;
2129 : }
2130 : #else
2131 : return 0;
2132 : #endif
2133 : }
2134 :
2135 : void
2136 0 : FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2137 : {
2138 : int returnCode;
2139 :
2140 : Assert(FileIsValid(file));
2141 :
2142 : DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2143 : file, VfdCache[file].fileName,
2144 : (int64) offset, (int64) nbytes));
2145 :
2146 0 : if (nbytes <= 0)
2147 0 : return;
2148 :
2149 0 : if (VfdCache[file].fileFlags & PG_O_DIRECT)
2150 0 : return;
2151 :
2152 0 : returnCode = FileAccess(file);
2153 0 : if (returnCode < 0)
2154 0 : return;
2155 :
2156 0 : pgstat_report_wait_start(wait_event_info);
2157 0 : pg_flush_data(VfdCache[file].fd, offset, nbytes);
2158 0 : pgstat_report_wait_end();
2159 : }
2160 :
2161 : ssize_t
2162 810460 : FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2163 : uint32 wait_event_info)
2164 : {
2165 : ssize_t returnCode;
2166 : Vfd *vfdP;
2167 :
2168 : Assert(FileIsValid(file));
2169 :
2170 : DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2171 : file, VfdCache[file].fileName,
2172 : (int64) offset,
2173 : iovcnt));
2174 :
2175 810460 : returnCode = FileAccess(file);
2176 810460 : if (returnCode < 0)
2177 0 : return returnCode;
2178 :
2179 810460 : vfdP = &VfdCache[file];
2180 :
2181 810460 : retry:
2182 810460 : pgstat_report_wait_start(wait_event_info);
2183 810460 : returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2184 810460 : pgstat_report_wait_end();
2185 :
2186 810460 : if (returnCode < 0)
2187 : {
2188 : /*
2189 : * Windows may run out of kernel buffers and return "Insufficient
2190 : * system resources" error. Wait a bit and retry to solve it.
2191 : *
2192 : * It is rumored that EINTR is also possible on some Unix filesystems,
2193 : * in which case immediate retry is indicated.
2194 : */
2195 : #ifdef WIN32
2196 : DWORD error = GetLastError();
2197 :
2198 : switch (error)
2199 : {
2200 : case ERROR_NO_SYSTEM_RESOURCES:
2201 : pg_usleep(1000L);
2202 : errno = EINTR;
2203 : break;
2204 : default:
2205 : _dosmaperr(error);
2206 : break;
2207 : }
2208 : #endif
2209 : /* OK to retry if interrupted */
2210 0 : if (errno == EINTR)
2211 0 : goto retry;
2212 : }
2213 :
2214 810460 : return returnCode;
2215 : }
2216 :
2217 : int
2218 2407282 : FileStartReadV(PgAioHandle *ioh, File file,
2219 : int iovcnt, off_t offset,
2220 : uint32 wait_event_info)
2221 : {
2222 : int returnCode;
2223 : Vfd *vfdP;
2224 :
2225 : Assert(FileIsValid(file));
2226 :
2227 : DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
2228 : file, VfdCache[file].fileName,
2229 : (int64) offset,
2230 : iovcnt));
2231 :
2232 2407282 : returnCode = FileAccess(file);
2233 2407282 : if (returnCode < 0)
2234 0 : return returnCode;
2235 :
2236 2407282 : vfdP = &VfdCache[file];
2237 :
2238 2407282 : pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
2239 :
2240 2407282 : return 0;
2241 : }
2242 :
2243 : ssize_t
2244 1476066 : FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2245 : uint32 wait_event_info)
2246 : {
2247 : ssize_t returnCode;
2248 : Vfd *vfdP;
2249 :
2250 : Assert(FileIsValid(file));
2251 :
2252 : DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2253 : file, VfdCache[file].fileName,
2254 : (int64) offset,
2255 : iovcnt));
2256 :
2257 1476066 : returnCode = FileAccess(file);
2258 1476066 : if (returnCode < 0)
2259 0 : return returnCode;
2260 :
2261 1476066 : vfdP = &VfdCache[file];
2262 :
2263 : /*
2264 : * If enforcing temp_file_limit and it's a temp file, check to see if the
2265 : * write would overrun temp_file_limit, and throw error if so. Note: it's
2266 : * really a modularity violation to throw error here; we should set errno
2267 : * and return -1. However, there's no way to report a suitable error
2268 : * message if we do that. All current callers would just throw error
2269 : * immediately anyway, so this is safe at present.
2270 : */
2271 1476066 : if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2272 : {
2273 0 : off_t past_write = offset;
2274 :
2275 0 : for (int i = 0; i < iovcnt; ++i)
2276 0 : past_write += iov[i].iov_len;
2277 :
2278 0 : if (past_write > vfdP->fileSize)
2279 : {
2280 0 : uint64 newTotal = temporary_files_size;
2281 :
2282 0 : newTotal += past_write - vfdP->fileSize;
2283 0 : if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2284 0 : ereport(ERROR,
2285 : (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2286 : errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2287 : temp_file_limit)));
2288 : }
2289 : }
2290 :
2291 1476066 : retry:
2292 1476066 : pgstat_report_wait_start(wait_event_info);
2293 1476066 : returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2294 1476066 : pgstat_report_wait_end();
2295 :
2296 1476066 : if (returnCode >= 0)
2297 : {
2298 : /*
2299 : * Some callers expect short writes to set errno, and traditionally we
2300 : * have assumed that they imply disk space shortage. We don't want to
2301 : * waste CPU cycles adding up the total size here, so we'll just set
2302 : * it for all successful writes in case such a caller determines that
2303 : * the write was short and ereports "%m".
2304 : */
2305 1476066 : errno = ENOSPC;
2306 :
2307 : /*
2308 : * Maintain fileSize and temporary_files_size if it's a temp file.
2309 : */
2310 1476066 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2311 : {
2312 107166 : off_t past_write = offset + returnCode;
2313 :
2314 107166 : if (past_write > vfdP->fileSize)
2315 : {
2316 73826 : temporary_files_size += past_write - vfdP->fileSize;
2317 73826 : vfdP->fileSize = past_write;
2318 : }
2319 : }
2320 : }
2321 : else
2322 : {
2323 : /*
2324 : * See comments in FileReadV()
2325 : */
2326 : #ifdef WIN32
2327 : DWORD error = GetLastError();
2328 :
2329 : switch (error)
2330 : {
2331 : case ERROR_NO_SYSTEM_RESOURCES:
2332 : pg_usleep(1000L);
2333 : errno = EINTR;
2334 : break;
2335 : default:
2336 : _dosmaperr(error);
2337 : break;
2338 : }
2339 : #endif
2340 : /* OK to retry if interrupted */
2341 0 : if (errno == EINTR)
2342 0 : goto retry;
2343 : }
2344 :
2345 1476066 : return returnCode;
2346 : }
2347 :
2348 : int
2349 1728 : FileSync(File file, uint32 wait_event_info)
2350 : {
2351 : int returnCode;
2352 :
2353 : Assert(FileIsValid(file));
2354 :
2355 : DO_DB(elog(LOG, "FileSync: %d (%s)",
2356 : file, VfdCache[file].fileName));
2357 :
2358 1728 : returnCode = FileAccess(file);
2359 1728 : if (returnCode < 0)
2360 0 : return returnCode;
2361 :
2362 1728 : pgstat_report_wait_start(wait_event_info);
2363 1728 : returnCode = pg_fsync(VfdCache[file].fd);
2364 1728 : pgstat_report_wait_end();
2365 :
2366 1728 : return returnCode;
2367 : }
2368 :
2369 : /*
2370 : * Zero a region of the file.
2371 : *
2372 : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2373 : * appropriate error.
2374 : */
2375 : int
2376 426718 : FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
2377 : {
2378 : int returnCode;
2379 : ssize_t written;
2380 :
2381 : Assert(FileIsValid(file));
2382 :
2383 : DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2384 : file, VfdCache[file].fileName,
2385 : (int64) offset, (int64) amount));
2386 :
2387 426718 : returnCode = FileAccess(file);
2388 426718 : if (returnCode < 0)
2389 0 : return returnCode;
2390 :
2391 426718 : pgstat_report_wait_start(wait_event_info);
2392 426718 : written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2393 426718 : pgstat_report_wait_end();
2394 :
2395 426718 : if (written < 0)
2396 0 : return -1;
2397 426718 : else if (written != amount)
2398 : {
2399 : /* if errno is unset, assume problem is no disk space */
2400 0 : if (errno == 0)
2401 0 : errno = ENOSPC;
2402 0 : return -1;
2403 : }
2404 :
2405 426718 : return 0;
2406 : }
2407 :
2408 : /*
2409 : * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2410 : * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2411 : * use FileZero() instead.
2412 : *
2413 : * Note that at least glibc() implements posix_fallocate() in userspace if not
2414 : * implemented by the filesystem. That's not the case for all environments
2415 : * though.
2416 : *
2417 : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2418 : * appropriate error.
2419 : */
2420 : int
2421 1330 : FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
2422 : {
2423 : #ifdef HAVE_POSIX_FALLOCATE
2424 : int returnCode;
2425 :
2426 : Assert(FileIsValid(file));
2427 :
2428 : DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2429 : file, VfdCache[file].fileName,
2430 : (int64) offset, (int64) amount));
2431 :
2432 1330 : returnCode = FileAccess(file);
2433 1330 : if (returnCode < 0)
2434 0 : return -1;
2435 :
2436 1330 : retry:
2437 1330 : pgstat_report_wait_start(wait_event_info);
2438 1330 : returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2439 1330 : pgstat_report_wait_end();
2440 :
2441 1330 : if (returnCode == 0)
2442 1330 : return 0;
2443 0 : else if (returnCode == EINTR)
2444 0 : goto retry;
2445 :
2446 : /* for compatibility with %m printing etc */
2447 0 : errno = returnCode;
2448 :
2449 : /*
2450 : * Return in cases of a "real" failure, if fallocate is not supported,
2451 : * fall through to the FileZero() backed implementation.
2452 : */
2453 0 : if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
2454 0 : return -1;
2455 : #endif
2456 :
2457 0 : return FileZero(file, offset, amount, wait_event_info);
2458 : }
2459 :
2460 : off_t
2461 5097272 : FileSize(File file)
2462 : {
2463 : Assert(FileIsValid(file));
2464 :
2465 : DO_DB(elog(LOG, "FileSize %d (%s)",
2466 : file, VfdCache[file].fileName));
2467 :
2468 5097272 : if (FileIsNotOpen(file))
2469 : {
2470 34 : if (FileAccess(file) < 0)
2471 0 : return (off_t) -1;
2472 : }
2473 :
2474 5097272 : return lseek(VfdCache[file].fd, 0, SEEK_END);
2475 : }
2476 :
2477 : int
2478 1086 : FileTruncate(File file, off_t offset, uint32 wait_event_info)
2479 : {
2480 : int returnCode;
2481 :
2482 : Assert(FileIsValid(file));
2483 :
2484 : DO_DB(elog(LOG, "FileTruncate %d (%s)",
2485 : file, VfdCache[file].fileName));
2486 :
2487 1086 : returnCode = FileAccess(file);
2488 1086 : if (returnCode < 0)
2489 0 : return returnCode;
2490 :
2491 1086 : pgstat_report_wait_start(wait_event_info);
2492 1086 : returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2493 1086 : pgstat_report_wait_end();
2494 :
2495 1086 : if (returnCode == 0 && VfdCache[file].fileSize > offset)
2496 : {
2497 : /* adjust our state for truncation of a temp file */
2498 : Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2499 0 : temporary_files_size -= VfdCache[file].fileSize - offset;
2500 0 : VfdCache[file].fileSize = offset;
2501 : }
2502 :
2503 1086 : return returnCode;
2504 : }
2505 :
2506 : /*
2507 : * Return the pathname associated with an open file.
2508 : *
2509 : * The returned string points to an internal buffer, which is valid until
2510 : * the file is closed.
2511 : */
2512 : char *
2513 44 : FilePathName(File file)
2514 : {
2515 : Assert(FileIsValid(file));
2516 :
2517 44 : return VfdCache[file].fileName;
2518 : }
2519 :
2520 : /*
2521 : * Return the raw file descriptor of an opened file.
2522 : *
2523 : * The returned file descriptor will be valid until the file is closed, but
2524 : * there are a lot of things that can make that happen. So the caller should
2525 : * be careful not to do much of anything else before it finishes using the
2526 : * returned file descriptor.
2527 : */
2528 : int
2529 850752 : FileGetRawDesc(File file)
2530 : {
2531 : int returnCode;
2532 :
2533 850752 : returnCode = FileAccess(file);
2534 850752 : if (returnCode < 0)
2535 0 : return returnCode;
2536 :
2537 : Assert(FileIsValid(file));
2538 850752 : return VfdCache[file].fd;
2539 : }
2540 :
2541 : /*
2542 : * FileGetRawFlags - returns the file flags on open(2)
2543 : */
2544 : int
2545 0 : FileGetRawFlags(File file)
2546 : {
2547 : Assert(FileIsValid(file));
2548 0 : return VfdCache[file].fileFlags;
2549 : }
2550 :
2551 : /*
2552 : * FileGetRawMode - returns the mode bitmask passed to open(2)
2553 : */
2554 : mode_t
2555 0 : FileGetRawMode(File file)
2556 : {
2557 : Assert(FileIsValid(file));
2558 0 : return VfdCache[file].fileMode;
2559 : }
2560 :
2561 : /*
2562 : * Make room for another allocatedDescs[] array entry if needed and possible.
2563 : * Returns true if an array element is available.
2564 : */
2565 : static bool
2566 15684792 : reserveAllocatedDesc(void)
2567 : {
2568 : AllocateDesc *newDescs;
2569 : int newMax;
2570 :
2571 : /* Quick out if array already has a free slot. */
2572 15684792 : if (numAllocatedDescs < maxAllocatedDescs)
2573 15682640 : return true;
2574 :
2575 : /*
2576 : * If the array hasn't yet been created in the current process, initialize
2577 : * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2578 : * we will ever need, anyway. We don't want to look at max_safe_fds
2579 : * immediately because set_max_safe_fds() may not have run yet.
2580 : */
2581 2152 : if (allocatedDescs == NULL)
2582 : {
2583 2152 : newMax = FD_MINFREE / 3;
2584 2152 : newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2585 : /* Out of memory already? Treat as fatal error. */
2586 2152 : if (newDescs == NULL)
2587 0 : ereport(ERROR,
2588 : (errcode(ERRCODE_OUT_OF_MEMORY),
2589 : errmsg("out of memory")));
2590 2152 : allocatedDescs = newDescs;
2591 2152 : maxAllocatedDescs = newMax;
2592 2152 : return true;
2593 : }
2594 :
2595 : /*
2596 : * Consider enlarging the array beyond the initial allocation used above.
2597 : * By the time this happens, max_safe_fds should be known accurately.
2598 : *
2599 : * We mustn't let allocated descriptors hog all the available FDs, and in
2600 : * practice we'd better leave a reasonable number of FDs for VFD use. So
2601 : * set the maximum to max_safe_fds / 3. (This should certainly be at
2602 : * least as large as the initial size, FD_MINFREE / 3, so we aren't
2603 : * tightening the restriction here.) Recall that "external" FDs are
2604 : * allowed to consume another third of max_safe_fds.
2605 : */
2606 0 : newMax = max_safe_fds / 3;
2607 0 : if (newMax > maxAllocatedDescs)
2608 : {
2609 0 : newDescs = (AllocateDesc *) realloc(allocatedDescs,
2610 : newMax * sizeof(AllocateDesc));
2611 : /* Treat out-of-memory as a non-fatal error. */
2612 0 : if (newDescs == NULL)
2613 0 : return false;
2614 0 : allocatedDescs = newDescs;
2615 0 : maxAllocatedDescs = newMax;
2616 0 : return true;
2617 : }
2618 :
2619 : /* Can't enlarge allocatedDescs[] any more. */
2620 0 : return false;
2621 : }
2622 :
2623 : /*
2624 : * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2625 : * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2626 : * necessary to open the file. When done, call FreeFile rather than fclose.
2627 : *
2628 : * Note that files that will be open for any significant length of time
2629 : * should NOT be handled this way, since they cannot share kernel file
2630 : * descriptors with other files; there is grave risk of running out of FDs
2631 : * if anyone locks down too many FDs. Most callers of this routine are
2632 : * simply reading a config file that they will read and close immediately.
2633 : *
2634 : * fd.c will automatically close all files opened with AllocateFile at
2635 : * transaction commit or abort; this prevents FD leakage if a routine
2636 : * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2637 : *
2638 : * Ideally this should be the *only* direct call of fopen() in the backend.
2639 : */
2640 : FILE *
2641 167984 : AllocateFile(const char *name, const char *mode)
2642 : {
2643 : FILE *file;
2644 :
2645 : DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2646 : numAllocatedDescs, name));
2647 :
2648 : /* Can we allocate another non-virtual FD? */
2649 167984 : if (!reserveAllocatedDesc())
2650 0 : ereport(ERROR,
2651 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2652 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2653 : maxAllocatedDescs, name)));
2654 :
2655 : /* Close excess kernel FDs. */
2656 167984 : ReleaseLruFiles();
2657 :
2658 167984 : TryAgain:
2659 167984 : if ((file = fopen(name, mode)) != NULL)
2660 : {
2661 154290 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2662 :
2663 154290 : desc->kind = AllocateDescFile;
2664 154290 : desc->desc.file = file;
2665 154290 : desc->create_subid = GetCurrentSubTransactionId();
2666 154290 : numAllocatedDescs++;
2667 154290 : return desc->desc.file;
2668 : }
2669 :
2670 13694 : if (errno == EMFILE || errno == ENFILE)
2671 : {
2672 0 : int save_errno = errno;
2673 :
2674 0 : ereport(LOG,
2675 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2676 : errmsg("out of file descriptors: %m; release and retry")));
2677 0 : errno = 0;
2678 0 : if (ReleaseLruFile())
2679 0 : goto TryAgain;
2680 0 : errno = save_errno;
2681 : }
2682 :
2683 13694 : return NULL;
2684 : }
2685 :
2686 : /*
2687 : * Open a file with OpenTransientFilePerm() and pass default file mode for
2688 : * the fileMode parameter.
2689 : */
2690 : int
2691 15425964 : OpenTransientFile(const char *fileName, int fileFlags)
2692 : {
2693 15425964 : return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2694 : }
2695 :
2696 : /*
2697 : * Like AllocateFile, but returns an unbuffered fd like open(2)
2698 : */
2699 : int
2700 15425976 : OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2701 : {
2702 : int fd;
2703 :
2704 : DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2705 : numAllocatedDescs, fileName));
2706 :
2707 : /* Can we allocate another non-virtual FD? */
2708 15425976 : if (!reserveAllocatedDesc())
2709 0 : ereport(ERROR,
2710 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2711 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2712 : maxAllocatedDescs, fileName)));
2713 :
2714 : /* Close excess kernel FDs. */
2715 15425976 : ReleaseLruFiles();
2716 :
2717 15425976 : fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2718 :
2719 15425976 : if (fd >= 0)
2720 : {
2721 15416202 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2722 :
2723 15416202 : desc->kind = AllocateDescRawFD;
2724 15416202 : desc->desc.fd = fd;
2725 15416202 : desc->create_subid = GetCurrentSubTransactionId();
2726 15416202 : numAllocatedDescs++;
2727 :
2728 15416202 : return fd;
2729 : }
2730 :
2731 9774 : return -1; /* failure */
2732 : }
2733 :
2734 : /*
2735 : * Routines that want to initiate a pipe stream should use OpenPipeStream
2736 : * rather than plain popen(). This lets fd.c deal with freeing FDs if
2737 : * necessary. When done, call ClosePipeStream rather than pclose.
2738 : *
2739 : * This function also ensures that the popen'd program is run with default
2740 : * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2741 : * uses. This ensures desirable response to, eg, closing a read pipe early.
2742 : */
2743 : FILE *
2744 116 : OpenPipeStream(const char *command, const char *mode)
2745 : {
2746 : FILE *file;
2747 : int save_errno;
2748 :
2749 : DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2750 : numAllocatedDescs, command));
2751 :
2752 : /* Can we allocate another non-virtual FD? */
2753 116 : if (!reserveAllocatedDesc())
2754 0 : ereport(ERROR,
2755 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2756 : errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2757 : maxAllocatedDescs, command)));
2758 :
2759 : /* Close excess kernel FDs. */
2760 116 : ReleaseLruFiles();
2761 :
2762 116 : TryAgain:
2763 116 : fflush(NULL);
2764 116 : pqsignal(SIGPIPE, SIG_DFL);
2765 116 : errno = 0;
2766 116 : file = popen(command, mode);
2767 116 : save_errno = errno;
2768 116 : pqsignal(SIGPIPE, SIG_IGN);
2769 116 : errno = save_errno;
2770 116 : if (file != NULL)
2771 : {
2772 116 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2773 :
2774 116 : desc->kind = AllocateDescPipe;
2775 116 : desc->desc.file = file;
2776 116 : desc->create_subid = GetCurrentSubTransactionId();
2777 116 : numAllocatedDescs++;
2778 116 : return desc->desc.file;
2779 : }
2780 :
2781 0 : if (errno == EMFILE || errno == ENFILE)
2782 : {
2783 0 : ereport(LOG,
2784 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2785 : errmsg("out of file descriptors: %m; release and retry")));
2786 0 : if (ReleaseLruFile())
2787 0 : goto TryAgain;
2788 0 : errno = save_errno;
2789 : }
2790 :
2791 0 : return NULL;
2792 : }
2793 :
2794 : /*
2795 : * Free an AllocateDesc of any type.
2796 : *
2797 : * The argument *must* point into the allocatedDescs[] array.
2798 : */
2799 : static int
2800 15659596 : FreeDesc(AllocateDesc *desc)
2801 : {
2802 : int result;
2803 :
2804 : /* Close the underlying object */
2805 15659596 : switch (desc->kind)
2806 : {
2807 154290 : case AllocateDescFile:
2808 154290 : result = fclose(desc->desc.file);
2809 154290 : break;
2810 116 : case AllocateDescPipe:
2811 116 : result = pclose(desc->desc.file);
2812 116 : break;
2813 88988 : case AllocateDescDir:
2814 88988 : result = closedir(desc->desc.dir);
2815 88988 : break;
2816 15416202 : case AllocateDescRawFD:
2817 15416202 : pgaio_closing_fd(desc->desc.fd);
2818 15416202 : result = close(desc->desc.fd);
2819 15416202 : break;
2820 0 : default:
2821 0 : elog(ERROR, "AllocateDesc kind not recognized");
2822 : result = 0; /* keep compiler quiet */
2823 : break;
2824 : }
2825 :
2826 : /* Compact storage in the allocatedDescs array */
2827 15659596 : numAllocatedDescs--;
2828 15659596 : *desc = allocatedDescs[numAllocatedDescs];
2829 :
2830 15659596 : return result;
2831 : }
2832 :
2833 : /*
2834 : * Close a file returned by AllocateFile.
2835 : *
2836 : * Note we do not check fclose's return value --- it is up to the caller
2837 : * to handle close errors.
2838 : */
2839 : int
2840 154258 : FreeFile(FILE *file)
2841 : {
2842 : int i;
2843 :
2844 : DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2845 :
2846 : /* Remove file from list of allocated files, if it's present */
2847 154260 : for (i = numAllocatedDescs; --i >= 0;)
2848 : {
2849 154260 : AllocateDesc *desc = &allocatedDescs[i];
2850 :
2851 154260 : if (desc->kind == AllocateDescFile && desc->desc.file == file)
2852 154258 : return FreeDesc(desc);
2853 : }
2854 :
2855 : /* Only get here if someone passes us a file not in allocatedDescs */
2856 0 : elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2857 :
2858 0 : return fclose(file);
2859 : }
2860 :
2861 : /*
2862 : * Close a file returned by OpenTransientFile.
2863 : *
2864 : * Note we do not check close's return value --- it is up to the caller
2865 : * to handle close errors.
2866 : */
2867 : int
2868 15416200 : CloseTransientFile(int fd)
2869 : {
2870 : int i;
2871 :
2872 : DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2873 :
2874 : /* Remove fd from list of allocated files, if it's present */
2875 15416218 : for (i = numAllocatedDescs; --i >= 0;)
2876 : {
2877 15416218 : AllocateDesc *desc = &allocatedDescs[i];
2878 :
2879 15416218 : if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2880 15416200 : return FreeDesc(desc);
2881 : }
2882 :
2883 : /* Only get here if someone passes us a file not in allocatedDescs */
2884 0 : elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2885 :
2886 0 : pgaio_closing_fd(fd);
2887 :
2888 0 : return close(fd);
2889 : }
2890 :
2891 : /*
2892 : * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2893 : * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2894 : * necessary to open the directory, and with closing it after an elog.
2895 : * When done, call FreeDir rather than closedir.
2896 : *
2897 : * Returns NULL, with errno set, on failure. Note that failure detection
2898 : * is commonly left to the following call of ReadDir or ReadDirExtended;
2899 : * see the comments for ReadDir.
2900 : *
2901 : * Ideally this should be the *only* direct call of opendir() in the backend.
2902 : */
2903 : DIR *
2904 90716 : AllocateDir(const char *dirname)
2905 : {
2906 : DIR *dir;
2907 :
2908 : DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2909 : numAllocatedDescs, dirname));
2910 :
2911 : /* Can we allocate another non-virtual FD? */
2912 90716 : if (!reserveAllocatedDesc())
2913 0 : ereport(ERROR,
2914 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2915 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2916 : maxAllocatedDescs, dirname)));
2917 :
2918 : /* Close excess kernel FDs. */
2919 90716 : ReleaseLruFiles();
2920 :
2921 90716 : TryAgain:
2922 90716 : if ((dir = opendir(dirname)) != NULL)
2923 : {
2924 88988 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2925 :
2926 88988 : desc->kind = AllocateDescDir;
2927 88988 : desc->desc.dir = dir;
2928 88988 : desc->create_subid = GetCurrentSubTransactionId();
2929 88988 : numAllocatedDescs++;
2930 88988 : return desc->desc.dir;
2931 : }
2932 :
2933 1728 : if (errno == EMFILE || errno == ENFILE)
2934 : {
2935 0 : int save_errno = errno;
2936 :
2937 0 : ereport(LOG,
2938 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2939 : errmsg("out of file descriptors: %m; release and retry")));
2940 0 : errno = 0;
2941 0 : if (ReleaseLruFile())
2942 0 : goto TryAgain;
2943 0 : errno = save_errno;
2944 : }
2945 :
2946 1728 : return NULL;
2947 : }
2948 :
2949 : /*
2950 : * Read a directory opened with AllocateDir, ereport'ing any error.
2951 : *
2952 : * This is easier to use than raw readdir() since it takes care of some
2953 : * otherwise rather tedious and error-prone manipulation of errno. Also,
2954 : * if you are happy with a generic error message for AllocateDir failure,
2955 : * you can just do
2956 : *
2957 : * dir = AllocateDir(path);
2958 : * while ((dirent = ReadDir(dir, path)) != NULL)
2959 : * process dirent;
2960 : * FreeDir(dir);
2961 : *
2962 : * since a NULL dir parameter is taken as indicating AllocateDir failed.
2963 : * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2964 : * use this shortcut.)
2965 : *
2966 : * The pathname passed to AllocateDir must be passed to this routine too,
2967 : * but it is only used for error reporting.
2968 : */
2969 : struct dirent *
2970 5066802 : ReadDir(DIR *dir, const char *dirname)
2971 : {
2972 5066802 : return ReadDirExtended(dir, dirname, ERROR);
2973 : }
2974 :
2975 : /*
2976 : * Alternate version of ReadDir that allows caller to specify the elevel
2977 : * for any error report (whether it's reporting an initial failure of
2978 : * AllocateDir or a subsequent directory read failure).
2979 : *
2980 : * If elevel < ERROR, returns NULL after any error. With the normal coding
2981 : * pattern, this will result in falling out of the loop immediately as
2982 : * though the directory contained no (more) entries.
2983 : */
2984 : struct dirent *
2985 8214856 : ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2986 : {
2987 : struct dirent *dent;
2988 :
2989 : /* Give a generic message for AllocateDir failure, if caller didn't */
2990 8214856 : if (dir == NULL)
2991 : {
2992 6 : ereport(elevel,
2993 : (errcode_for_file_access(),
2994 : errmsg("could not open directory \"%s\": %m",
2995 : dirname)));
2996 0 : return NULL;
2997 : }
2998 :
2999 8214850 : errno = 0;
3000 8214850 : if ((dent = readdir(dir)) != NULL)
3001 8148192 : return dent;
3002 :
3003 66658 : if (errno)
3004 0 : ereport(elevel,
3005 : (errcode_for_file_access(),
3006 : errmsg("could not read directory \"%s\": %m",
3007 : dirname)));
3008 66658 : return NULL;
3009 : }
3010 :
3011 : /*
3012 : * Close a directory opened with AllocateDir.
3013 : *
3014 : * Returns closedir's return value (with errno set if it's not 0).
3015 : * Note we do not check the return value --- it is up to the caller
3016 : * to handle close errors if wanted.
3017 : *
3018 : * Does nothing if dir == NULL; we assume that directory open failure was
3019 : * already reported if desired.
3020 : */
3021 : int
3022 88742 : FreeDir(DIR *dir)
3023 : {
3024 : int i;
3025 :
3026 : /* Nothing to do if AllocateDir failed */
3027 88742 : if (dir == NULL)
3028 0 : return 0;
3029 :
3030 : DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
3031 :
3032 : /* Remove dir from list of allocated dirs, if it's present */
3033 88742 : for (i = numAllocatedDescs; --i >= 0;)
3034 : {
3035 88742 : AllocateDesc *desc = &allocatedDescs[i];
3036 :
3037 88742 : if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3038 88742 : return FreeDesc(desc);
3039 : }
3040 :
3041 : /* Only get here if someone passes us a dir not in allocatedDescs */
3042 0 : elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3043 :
3044 0 : return closedir(dir);
3045 : }
3046 :
3047 :
3048 : /*
3049 : * Close a pipe stream returned by OpenPipeStream.
3050 : */
3051 : int
3052 116 : ClosePipeStream(FILE *file)
3053 : {
3054 : int i;
3055 :
3056 : DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3057 :
3058 : /* Remove file from list of allocated files, if it's present */
3059 116 : for (i = numAllocatedDescs; --i >= 0;)
3060 : {
3061 116 : AllocateDesc *desc = &allocatedDescs[i];
3062 :
3063 116 : if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3064 116 : return FreeDesc(desc);
3065 : }
3066 :
3067 : /* Only get here if someone passes us a file not in allocatedDescs */
3068 0 : elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3069 :
3070 0 : return pclose(file);
3071 : }
3072 :
3073 : /*
3074 : * closeAllVfds
3075 : *
3076 : * Force all VFDs into the physically-closed state, so that the fewest
3077 : * possible number of kernel file descriptors are in use. There is no
3078 : * change in the logical state of the VFDs.
3079 : */
3080 : void
3081 52 : closeAllVfds(void)
3082 : {
3083 : Index i;
3084 :
3085 52 : if (SizeVfdCache > 0)
3086 : {
3087 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3088 1664 : for (i = 1; i < SizeVfdCache; i++)
3089 : {
3090 1612 : if (!FileIsNotOpen(i))
3091 246 : LruDelete(i);
3092 : }
3093 : }
3094 52 : }
3095 :
3096 :
3097 : /*
3098 : * SetTempTablespaces
3099 : *
3100 : * Define a list (actually an array) of OIDs of tablespaces to use for
3101 : * temporary files. This list will be used until end of transaction,
3102 : * unless this function is called again before then. It is caller's
3103 : * responsibility that the passed-in array has adequate lifespan (typically
3104 : * it'd be allocated in TopTransactionContext).
3105 : *
3106 : * Some entries of the array may be InvalidOid, indicating that the current
3107 : * database's default tablespace should be used.
3108 : */
3109 : void
3110 6112 : SetTempTablespaces(Oid *tableSpaces, int numSpaces)
3111 : {
3112 : Assert(numSpaces >= 0);
3113 6112 : tempTableSpaces = tableSpaces;
3114 6112 : numTempTableSpaces = numSpaces;
3115 :
3116 : /*
3117 : * Select a random starting point in the list. This is to minimize
3118 : * conflicts between backends that are most likely sharing the same list
3119 : * of temp tablespaces. Note that if we create multiple temp files in the
3120 : * same transaction, we'll advance circularly through the list --- this
3121 : * ensures that large temporary sort files are nicely spread across all
3122 : * available tablespaces.
3123 : */
3124 6112 : if (numSpaces > 1)
3125 0 : nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state,
3126 0 : 0, numSpaces - 1);
3127 : else
3128 6112 : nextTempTableSpace = 0;
3129 6112 : }
3130 :
3131 : /*
3132 : * TempTablespacesAreSet
3133 : *
3134 : * Returns true if SetTempTablespaces has been called in current transaction.
3135 : * (This is just so that tablespaces.c doesn't need its own per-transaction
3136 : * state.)
3137 : */
3138 : bool
3139 8024 : TempTablespacesAreSet(void)
3140 : {
3141 8024 : return (numTempTableSpaces >= 0);
3142 : }
3143 :
3144 : /*
3145 : * GetTempTablespaces
3146 : *
3147 : * Populate an array with the OIDs of the tablespaces that should be used for
3148 : * temporary files. (Some entries may be InvalidOid, indicating that the
3149 : * current database's default tablespace should be used.) At most numSpaces
3150 : * entries will be filled.
3151 : * Returns the number of OIDs that were copied into the output array.
3152 : */
3153 : int
3154 370 : GetTempTablespaces(Oid *tableSpaces, int numSpaces)
3155 : {
3156 : int i;
3157 :
3158 : Assert(TempTablespacesAreSet());
3159 370 : for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3160 0 : tableSpaces[i] = tempTableSpaces[i];
3161 :
3162 370 : return i;
3163 : }
3164 :
3165 : /*
3166 : * GetNextTempTableSpace
3167 : *
3168 : * Select the next temp tablespace to use. A result of InvalidOid means
3169 : * to use the current database's default tablespace.
3170 : */
3171 : Oid
3172 4212 : GetNextTempTableSpace(void)
3173 : {
3174 4212 : if (numTempTableSpaces > 0)
3175 : {
3176 : /* Advance nextTempTableSpace counter with wraparound */
3177 2 : if (++nextTempTableSpace >= numTempTableSpaces)
3178 2 : nextTempTableSpace = 0;
3179 2 : return tempTableSpaces[nextTempTableSpace];
3180 : }
3181 4210 : return InvalidOid;
3182 : }
3183 :
3184 :
3185 : /*
3186 : * AtEOSubXact_Files
3187 : *
3188 : * Take care of subtransaction commit/abort. At abort, we close temp files
3189 : * that the subtransaction may have opened. At commit, we reassign the
3190 : * files that were opened to the parent subtransaction.
3191 : */
3192 : void
3193 20084 : AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
3194 : SubTransactionId parentSubid)
3195 : {
3196 : Index i;
3197 :
3198 20084 : for (i = 0; i < numAllocatedDescs; i++)
3199 : {
3200 0 : if (allocatedDescs[i].create_subid == mySubid)
3201 : {
3202 0 : if (isCommit)
3203 0 : allocatedDescs[i].create_subid = parentSubid;
3204 : else
3205 : {
3206 : /* have to recheck the item after FreeDesc (ugly) */
3207 0 : FreeDesc(&allocatedDescs[i--]);
3208 : }
3209 : }
3210 : }
3211 20084 : }
3212 :
3213 : /*
3214 : * AtEOXact_Files
3215 : *
3216 : * This routine is called during transaction commit or abort. All still-open
3217 : * per-transaction temporary file VFDs are closed, which also causes the
3218 : * underlying files to be deleted (although they should've been closed already
3219 : * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3220 : * closed. We also forget any transaction-local temp tablespace list.
3221 : *
3222 : * The isCommit flag is used only to decide whether to emit warnings about
3223 : * unclosed files.
3224 : */
3225 : void
3226 1122122 : AtEOXact_Files(bool isCommit)
3227 : {
3228 1122122 : CleanupTempFiles(isCommit, false);
3229 1122122 : tempTableSpaces = NULL;
3230 1122122 : numTempTableSpaces = -1;
3231 1122122 : }
3232 :
3233 : /*
3234 : * BeforeShmemExit_Files
3235 : *
3236 : * before_shmem_exit hook to clean up temp files during backend shutdown.
3237 : * Here, we want to clean up *all* temp files including interXact ones.
3238 : */
3239 : static void
3240 42720 : BeforeShmemExit_Files(int code, Datum arg)
3241 : {
3242 42720 : CleanupTempFiles(false, true);
3243 :
3244 : /* prevent further temp files from being created */
3245 : #ifdef USE_ASSERT_CHECKING
3246 : temporary_files_allowed = false;
3247 : #endif
3248 42720 : }
3249 :
3250 : /*
3251 : * Close temporary files and delete their underlying files.
3252 : *
3253 : * isCommit: if true, this is normal transaction commit, and we don't
3254 : * expect any remaining files; warn if there are some.
3255 : *
3256 : * isProcExit: if true, this is being called as the backend process is
3257 : * exiting. If that's the case, we should remove all temporary files; if
3258 : * that's not the case, we are being called for transaction commit/abort
3259 : * and should only remove transaction-local temp files. In either case,
3260 : * also clean up "allocated" stdio files, dirs and fds.
3261 : */
3262 : static void
3263 1164842 : CleanupTempFiles(bool isCommit, bool isProcExit)
3264 : {
3265 : Index i;
3266 :
3267 : /*
3268 : * Careful here: at proc_exit we need extra cleanup, not just
3269 : * xact_temporary files.
3270 : */
3271 1164842 : if (isProcExit || have_xact_temporary_files)
3272 : {
3273 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3274 2792764 : for (i = 1; i < SizeVfdCache; i++)
3275 : {
3276 2748522 : unsigned short fdstate = VfdCache[i].fdstate;
3277 :
3278 2748522 : if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3279 8 : VfdCache[i].fileName != NULL)
3280 : {
3281 : /*
3282 : * If we're in the process of exiting a backend process, close
3283 : * all temporary files. Otherwise, only close temporary files
3284 : * local to the current transaction. They should be closed by
3285 : * the ResourceOwner mechanism already, so this is just a
3286 : * debugging cross-check.
3287 : */
3288 8 : if (isProcExit)
3289 8 : FileClose(i);
3290 0 : else if (fdstate & FD_CLOSE_AT_EOXACT)
3291 : {
3292 0 : elog(WARNING,
3293 : "temporary file %s not closed at end-of-transaction",
3294 : VfdCache[i].fileName);
3295 0 : FileClose(i);
3296 : }
3297 : }
3298 : }
3299 :
3300 44242 : have_xact_temporary_files = false;
3301 : }
3302 :
3303 : /* Complain if any allocated files remain open at commit. */
3304 1164842 : if (isCommit && numAllocatedDescs > 0)
3305 0 : elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3306 : numAllocatedDescs);
3307 :
3308 : /* Clean up "allocated" stdio files, dirs and fds. */
3309 1165122 : while (numAllocatedDescs > 0)
3310 280 : FreeDesc(&allocatedDescs[0]);
3311 1164842 : }
3312 :
3313 :
3314 : /*
3315 : * Remove temporary and temporary relation files left over from a prior
3316 : * postmaster session
3317 : *
3318 : * This should be called during postmaster startup. It will forcibly
3319 : * remove any leftover files created by OpenTemporaryFile and any leftover
3320 : * temporary relation files created by mdcreate.
3321 : *
3322 : * During post-backend-crash restart cycle, this routine is called when
3323 : * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3324 : * queries are using temp files could result in useless storage usage that can
3325 : * only be reclaimed by a service restart. The argument against enabling it is
3326 : * that someone might want to examine the temporary files for debugging
3327 : * purposes. This does however mean that OpenTemporaryFile had better allow for
3328 : * collision with an existing temp file name.
3329 : *
3330 : * NOTE: this function and its subroutines generally report syscall failures
3331 : * with ereport(LOG) and keep going. Removing temp files is not so critical
3332 : * that we should fail to start the database when we can't do it.
3333 : */
3334 : void
3335 1706 : RemovePgTempFiles(void)
3336 : {
3337 : char temp_path[MAXPGPATH + sizeof(PG_TBLSPC_DIR) + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3338 : DIR *spc_dir;
3339 : struct dirent *spc_de;
3340 :
3341 : /*
3342 : * First process temp files in pg_default ($PGDATA/base)
3343 : */
3344 1706 : snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3345 1706 : RemovePgTempFilesInDir(temp_path, true, false);
3346 1706 : RemovePgTempRelationFiles("base");
3347 :
3348 : /*
3349 : * Cycle through temp directories for all non-default tablespaces.
3350 : */
3351 1706 : spc_dir = AllocateDir(PG_TBLSPC_DIR);
3352 :
3353 5238 : while ((spc_de = ReadDirExtended(spc_dir, PG_TBLSPC_DIR, LOG)) != NULL)
3354 : {
3355 3532 : if (strcmp(spc_de->d_name, ".") == 0 ||
3356 1826 : strcmp(spc_de->d_name, "..") == 0)
3357 3412 : continue;
3358 :
3359 120 : snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3360 120 : PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY,
3361 : PG_TEMP_FILES_DIR);
3362 120 : RemovePgTempFilesInDir(temp_path, true, false);
3363 :
3364 120 : snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3365 120 : PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
3366 120 : RemovePgTempRelationFiles(temp_path);
3367 : }
3368 :
3369 1706 : FreeDir(spc_dir);
3370 :
3371 : /*
3372 : * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3373 : * DataDir as well. However, that is *not* cleaned here because doing so
3374 : * would create a race condition. It's done separately, earlier in
3375 : * postmaster startup.
3376 : */
3377 1706 : }
3378 :
3379 : /*
3380 : * Process one pgsql_tmp directory for RemovePgTempFiles.
3381 : *
3382 : * If missing_ok is true, it's all right for the named directory to not exist.
3383 : * Any other problem results in a LOG message. (missing_ok should be true at
3384 : * the top level, since pgsql_tmp directories are not created until needed.)
3385 : *
3386 : * At the top level, this should be called with unlink_all = false, so that
3387 : * only files matching the temporary name prefix will be unlinked. When
3388 : * recursing it will be called with unlink_all = true to unlink everything
3389 : * under a top-level temporary directory.
3390 : *
3391 : * (These two flags could be replaced by one, but it seems clearer to keep
3392 : * them separate.)
3393 : */
3394 : void
3395 1828 : RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3396 : {
3397 : DIR *temp_dir;
3398 : struct dirent *temp_de;
3399 : char rm_path[MAXPGPATH * 2];
3400 :
3401 1828 : temp_dir = AllocateDir(tmpdirname);
3402 :
3403 1828 : if (temp_dir == NULL && errno == ENOENT && missing_ok)
3404 1696 : return;
3405 :
3406 402 : while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3407 : {
3408 270 : if (strcmp(temp_de->d_name, ".") == 0 ||
3409 138 : strcmp(temp_de->d_name, "..") == 0)
3410 264 : continue;
3411 :
3412 6 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
3413 6 : tmpdirname, temp_de->d_name);
3414 :
3415 6 : if (unlink_all ||
3416 6 : strncmp(temp_de->d_name,
3417 : PG_TEMP_FILE_PREFIX,
3418 : strlen(PG_TEMP_FILE_PREFIX)) == 0)
3419 6 : {
3420 6 : PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
3421 :
3422 6 : if (type == PGFILETYPE_ERROR)
3423 0 : continue;
3424 6 : else if (type == PGFILETYPE_DIR)
3425 : {
3426 : /* recursively remove contents, then directory itself */
3427 2 : RemovePgTempFilesInDir(rm_path, false, true);
3428 :
3429 2 : if (rmdir(rm_path) < 0)
3430 0 : ereport(LOG,
3431 : (errcode_for_file_access(),
3432 : errmsg("could not remove directory \"%s\": %m",
3433 : rm_path)));
3434 : }
3435 : else
3436 : {
3437 4 : if (unlink(rm_path) < 0)
3438 0 : ereport(LOG,
3439 : (errcode_for_file_access(),
3440 : errmsg("could not remove file \"%s\": %m",
3441 : rm_path)));
3442 : }
3443 : }
3444 : else
3445 0 : ereport(LOG,
3446 : (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3447 : rm_path)));
3448 : }
3449 :
3450 132 : FreeDir(temp_dir);
3451 : }
3452 :
3453 : /* Process one tablespace directory, look for per-DB subdirectories */
3454 : static void
3455 1826 : RemovePgTempRelationFiles(const char *tsdirname)
3456 : {
3457 : DIR *ts_dir;
3458 : struct dirent *de;
3459 : char dbspace_path[MAXPGPATH * 2];
3460 :
3461 1826 : ts_dir = AllocateDir(tsdirname);
3462 :
3463 11392 : while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3464 : {
3465 : /*
3466 : * We're only interested in the per-database directories, which have
3467 : * numeric names. Note that this code will also (properly) ignore "."
3468 : * and "..".
3469 : */
3470 9566 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3471 3782 : continue;
3472 :
3473 5784 : snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3474 5784 : tsdirname, de->d_name);
3475 5784 : RemovePgTempRelationFilesInDbspace(dbspace_path);
3476 : }
3477 :
3478 1826 : FreeDir(ts_dir);
3479 1826 : }
3480 :
3481 : /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3482 : static void
3483 5784 : RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3484 : {
3485 : DIR *dbspace_dir;
3486 : struct dirent *de;
3487 : char rm_path[MAXPGPATH * 2];
3488 :
3489 5784 : dbspace_dir = AllocateDir(dbspacedirname);
3490 :
3491 1760314 : while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3492 : {
3493 1754530 : if (!looks_like_temp_rel_name(de->d_name))
3494 1754512 : continue;
3495 :
3496 18 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
3497 18 : dbspacedirname, de->d_name);
3498 :
3499 18 : if (unlink(rm_path) < 0)
3500 0 : ereport(LOG,
3501 : (errcode_for_file_access(),
3502 : errmsg("could not remove file \"%s\": %m",
3503 : rm_path)));
3504 : }
3505 :
3506 5784 : FreeDir(dbspace_dir);
3507 5784 : }
3508 :
/*
 * looks_like_temp_rel_name -- does "name" have the form of a temporary
 * relation file name?
 *
 * The accepted shape is
 *		t<digits>_<digits>
 * optionally followed by _<forkname>, and then optionally by .<digits>
 * (a segment number).  Nothing else may follow.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *run_start;

	/* Leading "t" */
	if (*cur != 't')
		return false;
	cur++;

	/* First group of digits (at least one), terminated by an underscore */
	run_start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run_start || *cur != '_')
		return false;
	cur++;

	/* Second group of digits (at least one) */
	run_start = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run_start)
		return false;

	/* Optional "_forkname"; forkname_chars gives the length it matched */
	if (*cur == '_')
	{
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		cur += forklen + 1;
	}

	/* Optional ".segment": a dot followed by at least one digit */
	if (*cur == '.')
	{
		const char *seg_end = cur + 1;

		while (isdigit((unsigned char) *seg_end))
			seg_end++;
		if (seg_end == cur + 1)
			return false;
		cur = seg_end;
	}

	/* The whole name must have been consumed */
	return *cur == '\0';
}
3557 :
#ifdef HAVE_SYNCFS
/*
 * do_syncfs -- call syncfs() on a descriptor opened for "path"
 *
 * Startup progress is reported first, then path is opened read-only, passed
 * to syncfs(), and closed again.
 *
 * A failure to open, sync or close is reported at LOG level and is not
 * treated as fatal.  If the open fails, nothing further is attempted.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
							 path);

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd < 0)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
		return;
	}

	if (syncfs(fd) < 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not synchronize file system for file \"%s\": %m", path)));

	/*
	 * Report a close failure rather than dropping it silently, as the other
	 * sync helpers in this file (pre_sync_fname, fsync_fname_ext) do.
	 */
	if (CloseTransientFile(fd) != 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", path)));
}
#endif
3582 :
3583 : /*
3584 : * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3585 : * all potential filesystem, depending on recovery_init_sync_method setting.
3586 : *
3587 : * We fsync regular files and directories wherever they are, but we
3588 : * follow symlinks only for pg_wal and immediately under pg_tblspc.
3589 : * Other symlinks are presumed to point at files we're not responsible
3590 : * for fsyncing, and might not have privileges to write at all.
3591 : *
3592 : * Errors are logged but not considered fatal; that's because this is used
3593 : * only during database startup, to deal with the possibility that there are
3594 : * issued-but-unsynced writes pending against the data directory. We want to
3595 : * ensure that such writes reach disk before anything that's done in the new
3596 : * run. However, aborting on error would result in failure to start for
3597 : * harmless cases such as read-only files in the data directory, and that's
3598 : * not good either.
3599 : *
3600 : * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3601 : * rewriting all changes again during recovery.
3602 : *
3603 : * Note we assume we're chdir'd into PGDATA to begin with.
3604 : */
3605 : void
3606 348 : SyncDataDirectory(void)
3607 : {
3608 : bool xlog_is_symlink;
3609 :
3610 : /* We can skip this whole thing if fsync is disabled. */
3611 348 : if (!enableFsync)
3612 348 : return;
3613 :
3614 : /*
3615 : * If pg_wal is a symlink, we'll need to recurse into it separately,
3616 : * because the first walkdir below will ignore it.
3617 : */
3618 0 : xlog_is_symlink = false;
3619 :
3620 : {
3621 : struct stat st;
3622 :
3623 0 : if (lstat("pg_wal", &st) < 0)
3624 0 : ereport(LOG,
3625 : (errcode_for_file_access(),
3626 : errmsg("could not stat file \"%s\": %m",
3627 : "pg_wal")));
3628 0 : else if (S_ISLNK(st.st_mode))
3629 0 : xlog_is_symlink = true;
3630 : }
3631 :
3632 : #ifdef HAVE_SYNCFS
3633 0 : if (recovery_init_sync_method == DATA_DIR_SYNC_METHOD_SYNCFS)
3634 : {
3635 : DIR *dir;
3636 : struct dirent *de;
3637 :
3638 : /*
3639 : * On Linux, we don't have to open every single file one by one. We
3640 : * can use syncfs() to sync whole filesystems. We only expect
3641 : * filesystem boundaries to exist where we tolerate symlinks, namely
3642 : * pg_wal and the tablespaces, so we call syncfs() for each of those
3643 : * directories.
3644 : */
3645 :
3646 : /* Prepare to report progress syncing the data directory via syncfs. */
3647 0 : begin_startup_progress_phase();
3648 :
3649 : /* Sync the top level pgdata directory. */
3650 0 : do_syncfs(".");
3651 : /* If any tablespaces are configured, sync each of those. */
3652 0 : dir = AllocateDir(PG_TBLSPC_DIR);
3653 0 : while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3654 : {
3655 : char path[MAXPGPATH];
3656 :
3657 0 : if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3658 0 : continue;
3659 :
3660 0 : snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3661 0 : do_syncfs(path);
3662 : }
3663 0 : FreeDir(dir);
3664 : /* If pg_wal is a symlink, process that too. */
3665 0 : if (xlog_is_symlink)
3666 0 : do_syncfs("pg_wal");
3667 0 : return;
3668 : }
3669 : #endif /* !HAVE_SYNCFS */
3670 :
3671 : #ifdef PG_FLUSH_DATA_WORKS
3672 : /* Prepare to report progress of the pre-fsync phase. */
3673 0 : begin_startup_progress_phase();
3674 :
3675 : /*
3676 : * If possible, hint to the kernel that we're soon going to fsync the data
3677 : * directory and its contents. Errors in this step are even less
3678 : * interesting than normal, so log them only at DEBUG1.
3679 : */
3680 0 : walkdir(".", pre_sync_fname, false, DEBUG1);
3681 0 : if (xlog_is_symlink)
3682 0 : walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3683 0 : walkdir(PG_TBLSPC_DIR, pre_sync_fname, true, DEBUG1);
3684 : #endif
3685 :
3686 : /* Prepare to report progress syncing the data directory via fsync. */
3687 0 : begin_startup_progress_phase();
3688 :
3689 : /*
3690 : * Now we do the fsync()s in the same order.
3691 : *
3692 : * The main call ignores symlinks, so in addition to specially processing
3693 : * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3694 : * process_symlinks = true. Note that if there are any plain directories
3695 : * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3696 : * so we don't worry about optimizing it.
3697 : */
3698 0 : walkdir(".", datadir_fsync_fname, false, LOG);
3699 0 : if (xlog_is_symlink)
3700 0 : walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3701 0 : walkdir(PG_TBLSPC_DIR, datadir_fsync_fname, true, LOG);
3702 : }
3703 :
3704 : /*
3705 : * walkdir: recursively walk a directory, applying the action to each
3706 : * regular file and directory (including the named directory itself).
3707 : *
3708 : * If process_symlinks is true, the action and recursion are also applied
3709 : * to regular files and directories that are pointed to by symlinks in the
3710 : * given directory; otherwise symlinks are ignored. Symlinks are always
3711 : * ignored in subdirectories, ie we intentionally don't pass down the
3712 : * process_symlinks flag to recursive calls.
3713 : *
3714 : * Errors are reported at level elevel, which might be ERROR or less.
3715 : *
3716 : * See also walkdir in file_utils.c, which is a frontend version of this
3717 : * logic.
3718 : */
3719 : static void
3720 338 : walkdir(const char *path,
3721 : void (*action) (const char *fname, bool isdir, int elevel),
3722 : bool process_symlinks,
3723 : int elevel)
3724 : {
3725 : DIR *dir;
3726 : struct dirent *de;
3727 :
3728 338 : dir = AllocateDir(path);
3729 :
3730 2946 : while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3731 : {
3732 : char subpath[MAXPGPATH * 2];
3733 :
3734 2608 : CHECK_FOR_INTERRUPTS();
3735 :
3736 2608 : if (strcmp(de->d_name, ".") == 0 ||
3737 2270 : strcmp(de->d_name, "..") == 0)
3738 676 : continue;
3739 :
3740 1932 : snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3741 :
3742 1932 : switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3743 : {
3744 1932 : case PGFILETYPE_REG:
3745 1932 : (*action) (subpath, false, elevel);
3746 1932 : break;
3747 0 : case PGFILETYPE_DIR:
3748 0 : walkdir(subpath, action, false, elevel);
3749 0 : break;
3750 0 : default:
3751 :
3752 : /*
3753 : * Errors are already reported directly by get_dirent_type(),
3754 : * and any remaining symlinks and unknown file types are
3755 : * ignored.
3756 : */
3757 0 : break;
3758 : }
3759 : }
3760 :
3761 338 : FreeDir(dir); /* we ignore any error here */
3762 :
3763 : /*
3764 : * It's important to fsync the destination directory itself as individual
3765 : * file fsyncs don't guarantee that the directory entry for the file is
3766 : * synced. However, skip this if AllocateDir failed; the action function
3767 : * might not be robust against that.
3768 : */
3769 338 : if (dir)
3770 338 : (*action) (path, true, elevel);
3771 338 : }
3772 :
3773 :
/*
 * pre_sync_fname -- hint to the OS that this file is about to be
 * fsync()'d.
 *
 * Failure to open a file because it is unreadable is ignored; other
 * errors are logged at the level the caller specifies.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
	if (fd < 0)
	{
		/* Permission problems are skipped quietly; the rest is reported */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is acceptable since this is
	 * merely a hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3819 :
/*
 * datadir_fsync_fname -- walkdir action that fsyncs one file or directory
 * of the data directory, reporting startup progress as it goes.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/*
	 * Errors about unreadable files should be ignored silently; tell
	 * fsync_fname_ext() so by passing ignore_perm = true.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3832 :
/*
 * unlink_if_exists_fname -- remove one file or directory.
 *
 * A directory is removed with rmdir(); a failure other than ENOENT is
 * reported at elevel.  A non-directory is passed to
 * PathNameDeleteTemporaryFile.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	/* A directory that is already gone is not an error */
	if (rmdir(fname) != 0 && errno != ENOENT)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove directory \"%s\": %m", fname)));
}
3849 :
3850 : /*
3851 : * fsync_fname_ext -- Try to fsync a file or directory
3852 : *
3853 : * If ignore_perm is true, ignore errors upon trying to open unreadable
3854 : * files. Logs other errors at a caller-specified level.
3855 : *
3856 : * Returns 0 if the operation succeeded, -1 otherwise.
3857 : */
3858 : int
3859 81250 : fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3860 : {
3861 : int fd;
3862 : int flags;
3863 : int returncode;
3864 :
3865 : /*
3866 : * Some OSs require directories to be opened read-only whereas other
3867 : * systems don't allow us to fsync files opened read-only; so we need both
3868 : * cases here. Using O_RDWR will cause us to fail to fsync files that are
3869 : * not writable by our userid, but we assume that's OK.
3870 : */
3871 81250 : flags = PG_BINARY;
3872 81250 : if (!isdir)
3873 30432 : flags |= O_RDWR;
3874 : else
3875 50818 : flags |= O_RDONLY;
3876 :
3877 81250 : fd = OpenTransientFile(fname, flags);
3878 :
3879 : /*
3880 : * Some OSs don't allow us to open directories at all (Windows returns
3881 : * EACCES), just ignore the error in that case. If desired also silently
3882 : * ignoring errors about unreadable files. Log others.
3883 : */
3884 81250 : if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3885 0 : return 0;
3886 81250 : else if (fd < 0 && ignore_perm && errno == EACCES)
3887 0 : return 0;
3888 81250 : else if (fd < 0)
3889 : {
3890 0 : ereport(elevel,
3891 : (errcode_for_file_access(),
3892 : errmsg("could not open file \"%s\": %m", fname)));
3893 0 : return -1;
3894 : }
3895 :
3896 81250 : returncode = pg_fsync(fd);
3897 :
3898 : /*
3899 : * Some OSes don't allow us to fsync directories at all, so we can ignore
3900 : * those errors. Anything else needs to be logged.
3901 : */
3902 81250 : if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3903 : {
3904 : int save_errno;
3905 :
3906 : /* close file upon error, might not be in transaction context */
3907 0 : save_errno = errno;
3908 0 : (void) CloseTransientFile(fd);
3909 0 : errno = save_errno;
3910 :
3911 0 : ereport(elevel,
3912 : (errcode_for_file_access(),
3913 : errmsg("could not fsync file \"%s\": %m", fname)));
3914 0 : return -1;
3915 : }
3916 :
3917 81250 : if (CloseTransientFile(fd) != 0)
3918 : {
3919 0 : ereport(elevel,
3920 : (errcode_for_file_access(),
3921 : errmsg("could not close file \"%s\": %m", fname)));
3922 0 : return -1;
3923 : }
3924 :
3925 81250 : return 0;
3926 : }
3927 :
3928 : /*
3929 : * fsync_parent_path -- fsync the parent path of a file or directory
3930 : *
3931 : * This is aimed at making file operations persistent on disk in case of
3932 : * an OS crash or power failure.
3933 : */
3934 : static int
3935 15112 : fsync_parent_path(const char *fname, int elevel)
3936 : {
3937 : char parentpath[MAXPGPATH];
3938 :
3939 15112 : strlcpy(parentpath, fname, MAXPGPATH);
3940 15112 : get_parent_directory(parentpath);
3941 :
3942 : /*
3943 : * get_parent_directory() returns an empty string if the input argument is
3944 : * just a file name (see comments in path.c), so handle that as being the
3945 : * current directory.
3946 : */
3947 15112 : if (strlen(parentpath) == 0)
3948 394 : strlcpy(parentpath, ".", MAXPGPATH);
3949 :
3950 15112 : if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3951 0 : return -1;
3952 :
3953 15112 : return 0;
3954 : }
3955 :
3956 : /*
3957 : * Create a PostgreSQL data sub-directory
3958 : *
3959 : * The data directory itself, and most of its sub-directories, are created at
3960 : * initdb time, but we do have some occasions when we create directories in
3961 : * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3962 : * make sure that those directories are created consistently. Today, that means
3963 : * making sure that the created directory has the correct permissions, which is
3964 : * what pg_dir_create_mode tracks for us.
3965 : *
3966 : * Note that we also set the umask() based on what we understand the correct
3967 : * permissions to be (see file_perm.c).
3968 : *
3969 : * For permissions other than the default, mkdir() can be used directly, but
3970 : * be sure to consider carefully such cases -- a sub-directory with incorrect
3971 : * permissions in a PostgreSQL data directory could cause backups and other
3972 : * processes to fail.
3973 : */
3974 : int
3975 2888 : MakePGDirectory(const char *directoryName)
3976 : {
3977 2888 : return mkdir(directoryName, pg_dir_create_mode);
3978 : }
3979 :
3980 : /*
3981 : * Return the passed-in error level, or PANIC if data_sync_retry is off.
3982 : *
3983 : * Failure to fsync any data file is cause for immediate panic, unless
3984 : * data_sync_retry is enabled. Data may have been written to the operating
3985 : * system and removed from our buffer pool already, and if we are running on
3986 : * an operating system that forgets dirty data on write-back failure, there
3987 : * may be only one copy of the data remaining: in the WAL. A later attempt to
3988 : * fsync again might falsely report success. Therefore we must not allow any
3989 : * further checkpoints to be attempted. data_sync_retry can in theory be
3990 : * enabled on systems known not to drop dirty buffered data on write-back
3991 : * failure (with the likely outcome that checkpoints will continue to fail
3992 : * until the underlying problem is fixed).
3993 : *
3994 : * Any code that reports a failure from fsync() or related functions should
3995 : * filter the error level with this function.
3996 : */
3997 : int
3998 40930 : data_sync_elevel(int elevel)
3999 : {
4000 40930 : return data_sync_retry ? elevel : PANIC;
4001 : }
4002 :
4003 : bool
4004 2204 : check_debug_io_direct(char **newval, void **extra, GucSource source)
4005 : {
4006 2204 : bool result = true;
4007 : int flags;
4008 :
4009 : #if PG_O_DIRECT == 0
4010 : if (strcmp(*newval, "") != 0)
4011 : {
4012 : GUC_check_errdetail("\"%s\" is not supported on this platform.",
4013 : "debug_io_direct");
4014 : result = false;
4015 : }
4016 : flags = 0;
4017 : #else
4018 : List *elemlist;
4019 : ListCell *l;
4020 : char *rawstring;
4021 :
4022 : /* Need a modifiable copy of string */
4023 2204 : rawstring = pstrdup(*newval);
4024 :
4025 2204 : if (!SplitGUCList(rawstring, ',', &elemlist))
4026 : {
4027 0 : GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
4028 : "debug_io_direct");
4029 0 : pfree(rawstring);
4030 0 : list_free(elemlist);
4031 0 : return false;
4032 : }
4033 :
4034 2204 : flags = 0;
4035 2216 : foreach(l, elemlist)
4036 : {
4037 12 : char *item = (char *) lfirst(l);
4038 :
4039 12 : if (pg_strcasecmp(item, "data") == 0)
4040 4 : flags |= IO_DIRECT_DATA;
4041 8 : else if (pg_strcasecmp(item, "wal") == 0)
4042 4 : flags |= IO_DIRECT_WAL;
4043 4 : else if (pg_strcasecmp(item, "wal_init") == 0)
4044 4 : flags |= IO_DIRECT_WAL_INIT;
4045 : else
4046 : {
4047 0 : GUC_check_errdetail("Invalid option \"%s\".", item);
4048 0 : result = false;
4049 0 : break;
4050 : }
4051 : }
4052 :
4053 : /*
4054 : * It's possible to configure block sizes smaller than our assumed I/O
4055 : * alignment size, which could result in invalid I/O requests.
4056 : */
4057 : #if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4058 : if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4059 : {
4060 : GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4061 : "debug_io_direct", "XLOG_BLCKSZ");
4062 : result = false;
4063 : }
4064 : #endif
4065 : #if BLCKSZ < PG_IO_ALIGN_SIZE
4066 : if (result && (flags & IO_DIRECT_DATA))
4067 : {
4068 : GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
4069 : "debug_io_direct", "BLCKSZ");
4070 : result = false;
4071 : }
4072 : #endif
4073 :
4074 2204 : pfree(rawstring);
4075 2204 : list_free(elemlist);
4076 : #endif
4077 :
4078 2204 : if (!result)
4079 0 : return result;
4080 :
4081 : /* Save the flags in *extra, for use by assign_debug_io_direct */
4082 2204 : *extra = guc_malloc(LOG, sizeof(int));
4083 2204 : if (!*extra)
4084 0 : return false;
4085 2204 : *((int *) *extra) = flags;
4086 :
4087 2204 : return result;
4088 : }
4089 :
4090 : void
4091 2204 : assign_debug_io_direct(const char *newval, void *extra)
4092 : {
4093 2204 : int *flags = (int *) extra;
4094 :
4095 2204 : io_direct_flags = *flags;
4096 2204 : }
4097 :
4098 : /* ResourceOwner callbacks */
4099 :
4100 : static void
4101 8 : ResOwnerReleaseFile(Datum res)
4102 : {
4103 8 : File file = (File) DatumGetInt32(res);
4104 : Vfd *vfdP;
4105 :
4106 : Assert(FileIsValid(file));
4107 :
4108 8 : vfdP = &VfdCache[file];
4109 8 : vfdP->resowner = NULL;
4110 :
4111 8 : FileClose(file);
4112 8 : }
4113 :
4114 : static char *
4115 0 : ResOwnerPrintFile(Datum res)
4116 : {
4117 0 : return psprintf("File %d", DatumGetInt32(res));
4118 : }
|