Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * fd.c
4 : * Virtual file descriptor code.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/storage/file/fd.c
11 : *
12 : * NOTES:
13 : *
14 : * This code manages a cache of 'virtual' file descriptors (VFDs).
15 : * The server opens many file descriptors for a variety of reasons,
16 : * including base tables, scratch files (e.g., sort and hash spool
17 : * files), and random calls to C library routines like system(3); it
18 : * is quite easy to exceed system limits on the number of open files a
19 : * single process can have. (This is around 1024 on many modern
20 : * operating systems, but may be lower on others.)
21 : *
22 : * VFDs are managed as an LRU pool, with actual OS file descriptors
23 : * being opened and closed as needed. Obviously, if a routine is
24 : * opened using these interfaces, all subsequent operations must also
25 : * be through these interfaces (the File type is not a real file
26 : * descriptor).
27 : *
28 : * For this scheme to work, most (if not all) routines throughout the
29 : * server should use these interfaces instead of calling the C library
30 : * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 : * may find ourselves short of real file descriptors anyway.
32 : *
33 : * INTERFACE ROUTINES
34 : *
35 : * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 : * A File opened with OpenTemporaryFile is automatically deleted when the
37 : * File is closed, either explicitly or implicitly at end of transaction or
38 : * process exit. PathNameOpenFile is intended for files that are held open
39 : * for a long time, like relation files. It is the caller's responsibility
40 : * to close them, there is no automatic mechanism in fd.c for that.
41 : *
42 : * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 : * temporary files that have names so that they can be shared between
44 : * backends. Such files are automatically closed and count against the
45 : * temporary file limit of the backend that creates them, but unlike anonymous
46 : * files they are not automatically deleted. See sharedfileset.c for a shared
47 : * ownership mechanism that provides automatic cleanup for shared files when
48 : * the last of a group of backends detaches.
49 : *
50 : * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 : * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 : * They behave like the corresponding native functions, except that the handle
53 : * is registered with the current subtransaction, and will be automatically
54 : * closed at abort. These are intended mainly for short operations like
55 : * reading a configuration file; there is a limit on the number of files that
56 : * can be opened using these functions at any one time.
57 : *
58 : * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 : * release file descriptors in use by the virtual file descriptors if
60 : * necessary. There is no automatic cleanup of file descriptors returned by
61 : * BasicOpenFile, it is solely the caller's responsibility to close the file
62 : * descriptor by calling close(2).
63 : *
64 : * If a non-virtual file descriptor needs to be held open for any length of
65 : * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 : * (and eventually ReleaseExternalFD), so that we can take it into account
67 : * while deciding how many VFDs can be open. This applies to FDs obtained
68 : * with BasicOpenFile as well as those obtained without use of any fd.c API.
69 : *
70 : *-------------------------------------------------------------------------
71 : */
72 :
73 : #include "postgres.h"
74 :
75 : #include <dirent.h>
76 : #include <sys/file.h>
77 : #include <sys/param.h>
78 : #include <sys/resource.h> /* for getrlimit */
79 : #include <sys/stat.h>
80 : #include <sys/types.h>
81 : #ifndef WIN32
82 : #include <sys/mman.h>
83 : #endif
84 : #include <limits.h>
85 : #include <unistd.h>
86 : #include <fcntl.h>
87 :
88 : #include "access/xact.h"
89 : #include "access/xlog.h"
90 : #include "catalog/pg_tablespace.h"
91 : #include "common/file_perm.h"
92 : #include "common/file_utils.h"
93 : #include "common/pg_prng.h"
94 : #include "miscadmin.h"
95 : #include "pgstat.h"
96 : #include "postmaster/startup.h"
97 : #include "storage/aio.h"
98 : #include "storage/fd.h"
99 : #include "storage/ipc.h"
100 : #include "utils/guc.h"
101 : #include "utils/guc_hooks.h"
102 : #include "utils/resowner.h"
103 : #include "utils/varlena.h"
104 :
105 : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
106 : #if defined(HAVE_SYNC_FILE_RANGE)
107 : #define PG_FLUSH_DATA_WORKS 1
108 : #elif !defined(WIN32) && defined(MS_ASYNC)
109 : #define PG_FLUSH_DATA_WORKS 1
110 : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
111 : #define PG_FLUSH_DATA_WORKS 1
112 : #endif
113 :
114 : /*
115 : * We must leave some file descriptors free for system(), the dynamic loader,
116 : * and other code that tries to open files without consulting fd.c. This
117 : * is the number left free. (While we try fairly hard to prevent EMFILE
118 : * errors, there's never any guarantee that we won't get ENFILE due to
119 : * other processes chewing up FDs. So it's a bad idea to try to open files
120 : * without consulting fd.c. Nonetheless we cannot control all code.)
121 : *
122 : * Because this is just a fixed setting, we are effectively assuming that
123 : * no such code will leave FDs open over the long term; otherwise the slop
124 : * is likely to be insufficient. Note in particular that we expect that
125 : * loading a shared library does not result in any permanent increase in
126 : * the number of open files. (This appears to be true on most if not
127 : * all platforms as of Feb 2004.)
128 : */
129 : #define NUM_RESERVED_FDS 10
130 :
131 : /*
132 : * If we have fewer than this many usable FDs after allowing for the reserved
133 : * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
134 : * much less than that. Note that this value ensures numExternalFDs can be
135 : * at least 16; as of this writing, the contrib/postgres_fdw regression tests
136 : * will not pass unless that can grow to at least 14.)
137 : */
138 : #define FD_MINFREE 48
139 :
140 : /*
141 : * A number of platforms allow individual processes to open many more files
142 : * than they can really support when *many* processes do the same thing.
143 : * This GUC parameter lets the DBA limit max_safe_fds to something less than
144 : * what the postmaster's initial probe suggests will work.
145 : */
146 : int max_files_per_process = 1000;
147 :
148 : /*
149 : * Maximum number of file descriptors to open for operations that fd.c knows
150 : * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
151 : * to a conservative value, and remains that way indefinitely in bootstrap or
152 : * standalone-backend cases. In normal postmaster operation, the postmaster
153 : * calls set_max_safe_fds() late in initialization to update the value, and
154 : * that value is then inherited by forked subprocesses.
155 : *
156 : * Note: the value of max_files_per_process is taken into account while
157 : * setting this variable, and so need not be tested separately.
158 : */
159 : int max_safe_fds = FD_MINFREE; /* default if not changed */
160 :
161 : /* Whether it is safe to continue running after fsync() fails. */
162 : bool data_sync_retry = false;
163 :
164 : /* How SyncDataDirectory() should do its job. */
165 : int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
166 :
167 : /* Which kinds of files should be opened with PG_O_DIRECT. */
168 : int io_direct_flags;
169 :
170 : /* Debugging.... */
171 :
172 : #ifdef FDDEBUG
173 : #define DO_DB(A) \
174 : do { \
175 : int _do_db_save_errno = errno; \
176 : A; \
177 : errno = _do_db_save_errno; \
178 : } while (0)
179 : #else
180 : #define DO_DB(A) \
181 : ((void) 0)
182 : #endif
183 :
184 : #define VFD_CLOSED (-1)
185 :
186 : #define FileIsValid(file) \
187 : ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
188 :
189 : #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
190 :
191 : /* these are the assigned bits in fdstate below: */
192 : #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
193 : #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
194 : #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
195 :
196 : typedef struct vfd
197 : {
198 : int fd; /* current FD, or VFD_CLOSED if none */
199 : unsigned short fdstate; /* bitflags for VFD's state */
200 : ResourceOwner resowner; /* owner, for automatic cleanup */
201 : File nextFree; /* link to next free VFD, if in freelist */
202 : File lruMoreRecently; /* doubly linked recency-of-use list */
203 : File lruLessRecently;
204 : off_t fileSize; /* current size of file (0 if not temporary) */
205 : char *fileName; /* name of file, or NULL for unused VFD */
206 : /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
207 : int fileFlags; /* open(2) flags for (re)opening the file */
208 : mode_t fileMode; /* mode to pass to open(2) */
209 : } Vfd;
210 :
211 : /*
212 : * Virtual File Descriptor array pointer and size. This grows as
213 : * needed. 'File' values are indexes into this array.
214 : * Note that VfdCache[0] is not a usable VFD, just a list header.
215 : */
216 : static Vfd *VfdCache;
217 : static Size SizeVfdCache = 0;
218 :
219 : /*
220 : * Number of file descriptors known to be in use by VFD entries.
221 : */
222 : static int nfile = 0;
223 :
224 : /*
225 : * Flag to tell whether it's worth scanning VfdCache looking for temp files
226 : * to close
227 : */
228 : static bool have_xact_temporary_files = false;
229 :
230 : /*
231 : * Tracks the total size of all temporary files. Note: when temp_file_limit
232 : * is being enforced, this cannot overflow since the limit cannot be more
233 : * than INT_MAX kilobytes. When not enforcing, it could theoretically
234 : * overflow, but we don't care.
235 : */
236 : static uint64 temporary_files_size = 0;
237 :
238 : /* Temporary file access initialized and not yet shut down? */
239 : #ifdef USE_ASSERT_CHECKING
240 : static bool temporary_files_allowed = false;
241 : #endif
242 :
243 : /*
244 : * List of OS handles opened with AllocateFile, AllocateDir and
245 : * OpenTransientFile.
246 : */
247 : typedef enum
248 : {
249 : AllocateDescFile,
250 : AllocateDescPipe,
251 : AllocateDescDir,
252 : AllocateDescRawFD,
253 : } AllocateDescKind;
254 :
255 : typedef struct
256 : {
257 : AllocateDescKind kind;
258 : SubTransactionId create_subid;
259 : union
260 : {
261 : FILE *file;
262 : DIR *dir;
263 : int fd;
264 : } desc;
265 : } AllocateDesc;
266 :
267 : static int numAllocatedDescs = 0;
268 : static int maxAllocatedDescs = 0;
269 : static AllocateDesc *allocatedDescs = NULL;
270 :
271 : /*
272 : * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
273 : */
274 : static int numExternalFDs = 0;
275 :
276 : /*
277 : * Number of temporary files opened during the current session;
278 : * this is used in generation of tempfile names.
279 : */
280 : static long tempFileCounter = 0;
281 :
282 : /*
283 : * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
284 : * indicating that the current database's default tablespace should be used.)
285 : * When numTempTableSpaces is -1, this has not been set in the current
286 : * transaction.
287 : */
288 : static Oid *tempTableSpaces = NULL;
289 : static int numTempTableSpaces = -1;
290 : static int nextTempTableSpace = 0;
291 :
292 :
293 : /*--------------------
294 : *
295 : * Private Routines
296 : *
297 : * Delete - delete a file from the Lru ring
298 : * LruDelete - remove a file from the Lru ring and close its FD
299 : * Insert - put a file at the front of the Lru ring
300 : * LruInsert - put a file at the front of the Lru ring and open it
301 : * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
302 : * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
303 : * AllocateVfd - grab a free (or new) file record (from VfdCache)
304 : * FreeVfd - free a file record
305 : *
306 : * The Least Recently Used ring is a doubly linked list that begins and
307 : * ends on element zero. Element zero is special -- it doesn't represent
308 : * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
309 : * anchor that shows us the beginning/end of the ring.
310 : * Only VFD elements that are currently really open (have an FD assigned) are
311 : * in the Lru ring. Elements that are "virtually" open can be recognized
312 : * by having a non-null fileName field.
313 : *
314 : * example:
315 : *
316 : * /--less----\ /---------\
317 : * v \ v \
318 : * #0 --more---> LeastRecentlyUsed --more-\ \
319 : * ^\ | |
320 : * \\less--> MostRecentlyUsedFile <---/ |
321 : * \more---/ \--less--/
322 : *
323 : *--------------------
324 : */
325 : static void Delete(File file);
326 : static void LruDelete(File file);
327 : static void Insert(File file);
328 : static int LruInsert(File file);
329 : static bool ReleaseLruFile(void);
330 : static void ReleaseLruFiles(void);
331 : static File AllocateVfd(void);
332 : static void FreeVfd(File file);
333 :
334 : static int FileAccess(File file);
335 : static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
336 : static bool reserveAllocatedDesc(void);
337 : static int FreeDesc(AllocateDesc *desc);
338 :
339 : static void BeforeShmemExit_Files(int code, Datum arg);
340 : static void CleanupTempFiles(bool isCommit, bool isProcExit);
341 : static void RemovePgTempRelationFiles(const char *tsdirname);
342 : static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
343 :
344 : static void walkdir(const char *path,
345 : void (*action) (const char *fname, bool isdir, int elevel),
346 : bool process_symlinks,
347 : int elevel);
348 : #ifdef PG_FLUSH_DATA_WORKS
349 : static void pre_sync_fname(const char *fname, bool isdir, int elevel);
350 : #endif
351 : static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
352 : static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
353 :
354 : static int fsync_parent_path(const char *fname, int elevel);
355 :
356 :
357 : /* ResourceOwner callbacks to hold virtual file descriptors */
358 : static void ResOwnerReleaseFile(Datum res);
359 : static char *ResOwnerPrintFile(Datum res);
360 :
361 : static const ResourceOwnerDesc file_resowner_desc =
362 : {
363 : .name = "File",
364 : .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
365 : .release_priority = RELEASE_PRIO_FILES,
366 : .ReleaseResource = ResOwnerReleaseFile,
367 : .DebugPrint = ResOwnerPrintFile
368 : };
369 :
370 : /* Convenience wrappers over ResourceOwnerRemember/Forget */
371 : static inline void
372 6436 : ResourceOwnerRememberFile(ResourceOwner owner, File file)
373 : {
374 6436 : ResourceOwnerRemember(owner, Int32GetDatum(file), &file_resowner_desc);
375 6436 : }
376 : static inline void
377 6426 : ResourceOwnerForgetFile(ResourceOwner owner, File file)
378 : {
379 6426 : ResourceOwnerForget(owner, Int32GetDatum(file), &file_resowner_desc);
380 6426 : }
381 :
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Dispatches to pg_fsync_writethrough() when that is compiled in and
 * wal_sync_method asks for it, and to pg_fsync_no_writethrough() otherwise.
 * Returns whatever the chosen routine returns.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat statbuf;

	/*
	 * fsync() implementations differ in which access modes they require of
	 * the descriptor they are handed, and the requirement is not the same
	 * for directories as for plain files.  To keep callers portable even
	 * when the current platform is lenient, verify here that plain files
	 * were opened for writing (O_RDWR or O_WRONLY) and that directories
	 * were not.
	 *
	 * An fstat() failure is ignored; the fsync() that follows will deal with
	 * a bad descriptor.  The check lives here, rather than next to the
	 * fsync() call itself, so that it still runs when fsync is disabled.
	 */
	if (fstat(fd, &statbuf) == 0)
	{
		/*
		 * O_RDONLY is historically 0, so it cannot be tested for directly;
		 * look only at whether either write bit is present.
		 */
		int			write_bits = fcntl(fd, F_GETFL) & (O_RDWR | O_WRONLY);

		if (S_ISDIR(statbuf.st_mode))
			Assert(write_bits == 0);
		else
			Assert(write_bits != 0);
	}
	errno = 0;
#endif

	/* the wal_sync_method test is compiled out when it cannot matter */
#if defined(HAVE_FSYNC_WRITETHROUGH)
	if (wal_sync_method == WAL_SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
434 :
435 :
436 : /*
437 : * pg_fsync_no_writethrough --- same as fsync except does nothing if
438 : * enableFsync is off
439 : */
440 : int
441 112478 : pg_fsync_no_writethrough(int fd)
442 : {
443 : int rc;
444 :
445 112478 : if (!enableFsync)
446 112478 : return 0;
447 :
448 0 : retry:
449 0 : rc = fsync(fd);
450 :
451 0 : if (rc == -1 && errno == EINTR)
452 0 : goto retry;
453 :
454 0 : return rc;
455 : }
456 :
457 : /*
458 : * pg_fsync_writethrough
459 : */
460 : int
461 0 : pg_fsync_writethrough(int fd)
462 : {
463 0 : if (enableFsync)
464 : {
465 : #if defined(F_FULLFSYNC)
466 : return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
467 : #else
468 0 : errno = ENOSYS;
469 0 : return -1;
470 : #endif
471 : }
472 : else
473 0 : return 0;
474 : }
475 :
476 : /*
477 : * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
478 : */
479 : int
480 0 : pg_fdatasync(int fd)
481 : {
482 : int rc;
483 :
484 0 : if (!enableFsync)
485 0 : return 0;
486 :
487 0 : retry:
488 0 : rc = fdatasync(fd);
489 :
490 0 : if (rc == -1 && errno == EINTR)
491 0 : goto retry;
492 :
493 0 : return rc;
494 : }
495 :
496 : /*
497 : * pg_file_exists -- check that a file exists.
498 : *
499 : * This requires an absolute path to the file. Returns true if the file is
500 : * not a directory, false otherwise.
501 : */
502 : bool
503 35354 : pg_file_exists(const char *name)
504 : {
505 : struct stat st;
506 :
507 : Assert(name != NULL);
508 :
509 35354 : if (stat(name, &st) == 0)
510 18752 : return !S_ISDIR(st.st_mode);
511 16602 : else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
512 0 : ereport(ERROR,
513 : (errcode_for_file_access(),
514 : errmsg("could not access file \"%s\": %m", name)));
515 :
516 16602 : return false;
517 : }
518 :
519 : /*
520 : * pg_flush_data --- advise OS that the described dirty data should be flushed
521 : *
522 : * offset of 0 with nbytes 0 means that the entire file should be flushed
523 : */
524 : void
525 65646 : pg_flush_data(int fd, off_t offset, off_t nbytes)
526 : {
527 : /*
528 : * Right now file flushing is primarily used to avoid making later
529 : * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
530 : * if fsyncs are disabled - that's a decision we might want to make
531 : * configurable at some point.
532 : */
533 65646 : if (!enableFsync)
534 65646 : return;
535 :
536 : /*
537 : * We compile all alternatives that are supported on the current platform,
538 : * to find portability problems more easily.
539 : */
540 : #if defined(HAVE_SYNC_FILE_RANGE)
541 : {
542 : int rc;
543 : static bool not_implemented_by_kernel = false;
544 :
545 0 : if (not_implemented_by_kernel)
546 0 : return;
547 :
548 0 : retry:
549 :
550 : /*
551 : * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
552 : * tells the OS that writeback for the specified blocks should be
553 : * started, but that we don't want to wait for completion. Note that
554 : * this call might block if too much dirty data exists in the range.
555 : * This is the preferable method on OSs supporting it, as it works
556 : * reliably when available (contrast to msync()) and doesn't flush out
557 : * clean data (like FADV_DONTNEED).
558 : */
559 0 : rc = sync_file_range(fd, offset, nbytes,
560 : SYNC_FILE_RANGE_WRITE);
561 0 : if (rc != 0)
562 : {
563 : int elevel;
564 :
565 0 : if (rc == EINTR)
566 0 : goto retry;
567 :
568 : /*
569 : * For systems that don't have an implementation of
570 : * sync_file_range() such as Windows WSL, generate only one
571 : * warning and then suppress all further attempts by this process.
572 : */
573 0 : if (errno == ENOSYS)
574 : {
575 0 : elevel = WARNING;
576 0 : not_implemented_by_kernel = true;
577 : }
578 : else
579 0 : elevel = data_sync_elevel(WARNING);
580 :
581 0 : ereport(elevel,
582 : (errcode_for_file_access(),
583 : errmsg("could not flush dirty data: %m")));
584 : }
585 :
586 0 : return;
587 : }
588 : #endif
589 : #if !defined(WIN32) && defined(MS_ASYNC)
590 : {
591 : void *p;
592 : static int pagesize = 0;
593 :
594 : /*
595 : * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
596 : * writeback. On linux it only does so if MS_SYNC is specified, but
597 : * then it does the writeback synchronously. Luckily all common linux
598 : * systems have sync_file_range(). This is preferable over
599 : * FADV_DONTNEED because it doesn't flush out clean data.
600 : *
601 : * We map the file (mmap()), tell the kernel to sync back the contents
602 : * (msync()), and then remove the mapping again (munmap()).
603 : */
604 :
605 : /* mmap() needs actual length if we want to map whole file */
606 : if (offset == 0 && nbytes == 0)
607 : {
608 : nbytes = lseek(fd, 0, SEEK_END);
609 : if (nbytes < 0)
610 : {
611 : ereport(WARNING,
612 : (errcode_for_file_access(),
613 : errmsg("could not determine dirty data size: %m")));
614 : return;
615 : }
616 : }
617 :
618 : /*
619 : * Some platforms reject partial-page mmap() attempts. To deal with
620 : * that, just truncate the request to a page boundary. If any extra
621 : * bytes don't get flushed, well, it's only a hint anyway.
622 : */
623 :
624 : /* fetch pagesize only once */
625 : if (pagesize == 0)
626 : pagesize = sysconf(_SC_PAGESIZE);
627 :
628 : /* align length to pagesize, dropping any fractional page */
629 : if (pagesize > 0)
630 : nbytes = (nbytes / pagesize) * pagesize;
631 :
632 : /* fractional-page request is a no-op */
633 : if (nbytes <= 0)
634 : return;
635 :
636 : /*
637 : * mmap could well fail, particularly on 32-bit platforms where there
638 : * may simply not be enough address space. If so, silently fall
639 : * through to the next implementation.
640 : */
641 : if (nbytes <= (off_t) SSIZE_MAX)
642 : p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
643 : else
644 : p = MAP_FAILED;
645 :
646 : if (p != MAP_FAILED)
647 : {
648 : int rc;
649 :
650 : rc = msync(p, (size_t) nbytes, MS_ASYNC);
651 : if (rc != 0)
652 : {
653 : ereport(data_sync_elevel(WARNING),
654 : (errcode_for_file_access(),
655 : errmsg("could not flush dirty data: %m")));
656 : /* NB: need to fall through to munmap()! */
657 : }
658 :
659 : rc = munmap(p, (size_t) nbytes);
660 : if (rc != 0)
661 : {
662 : /* FATAL error because mapping would remain */
663 : ereport(FATAL,
664 : (errcode_for_file_access(),
665 : errmsg("could not munmap() while flushing data: %m")));
666 : }
667 :
668 : return;
669 : }
670 : }
671 : #endif
672 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
673 : {
674 : int rc;
675 :
676 : /*
677 : * Signal the kernel that the passed in range should not be cached
678 : * anymore. This has the, desired, side effect of writing out dirty
679 : * data, and the, undesired, side effect of likely discarding useful
680 : * clean cached blocks. For the latter reason this is the least
681 : * preferable method.
682 : */
683 :
684 : rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
685 :
686 : if (rc != 0)
687 : {
688 : /* don't error out, this is just a performance optimization */
689 : ereport(WARNING,
690 : (errcode_for_file_access(),
691 : errmsg("could not flush dirty data: %m")));
692 : }
693 :
694 : return;
695 : }
696 : #endif
697 : }
698 :
/*
 * Truncate an open file to a given length.
 *
 * Returns the result of ftruncate(), repeating the call for as long as it
 * fails with EINTR.
 */
static int
pg_ftruncate(int fd, off_t length)
{
	int			result;

	do
	{
		result = ftruncate(fd, length);
	} while (result == -1 && errno == EINTR);

	return result;
}
715 :
/*
 * Truncate a file to a given length by name.
 *
 * On WIN32 builds the file is opened with OpenTransientFile(), truncated
 * through pg_ftruncate() and closed again, with errno from the truncation
 * preserved across the close; -1 is returned if the open fails.  Elsewhere
 * truncate() is called directly and repeated while it fails with EINTR.
 *
 * Returns the result of the truncation call.
 */
int
pg_truncate(const char *path, off_t length)
{
	int			result;
#ifdef WIN32
	int			fd = OpenTransientFile(path, O_RDWR | PG_BINARY);

	if (fd < 0)
		result = -1;
	else
	{
		int			truncate_errno;

		result = pg_ftruncate(fd, length);

		/* don't let the close clobber the truncation's errno */
		truncate_errno = errno;
		CloseTransientFile(fd);
		errno = truncate_errno;
	}
#else

	do
	{
		result = truncate(path, length);
	} while (result == -1 && errno == EINTR);
#endif

	return result;
}
748 :
749 : /*
750 : * fsync_fname -- fsync a file or directory, handling errors properly
751 : *
752 : * Try to fsync a file or directory. When doing the latter, ignore errors that
753 : * indicate the OS just doesn't allow/require fsyncing directories.
754 : */
755 : void
756 36294 : fsync_fname(const char *fname, bool isdir)
757 : {
758 36294 : fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
759 36294 : }
760 :
761 : /*
762 : * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
763 : *
764 : * This routine ensures that, after returning, the effect of renaming file
765 : * persists in case of a crash. A crash while this routine is running will
766 : * leave you with either the pre-existing or the moved file in place of the
767 : * new file; no mixed state or truncated files are possible.
768 : *
769 : * It does so by using fsync on the old filename and the possibly existing
770 : * target filename before the rename, and the target file and directory after.
771 : *
772 : * Note that rename() cannot be used across arbitrary directories, as they
773 : * might not be on the same filesystem. Therefore this routine does not
774 : * support renaming across directories.
775 : *
776 : * Log errors with the caller specified severity.
777 : *
778 : * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
779 : * valid upon return.
780 : */
781 : int
782 10160 : durable_rename(const char *oldfile, const char *newfile, int elevel)
783 : {
784 : int fd;
785 :
786 : /*
787 : * First fsync the old and target path (if it exists), to ensure that they
788 : * are properly persistent on disk. Syncing the target file is not
789 : * strictly necessary, but it makes it easier to reason about crashes;
790 : * because it's then guaranteed that either source or target file exists
791 : * after a crash.
792 : */
793 10160 : if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
794 0 : return -1;
795 :
796 10160 : fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
797 10160 : if (fd < 0)
798 : {
799 7138 : if (errno != ENOENT)
800 : {
801 0 : ereport(elevel,
802 : (errcode_for_file_access(),
803 : errmsg("could not open file \"%s\": %m", newfile)));
804 0 : return -1;
805 : }
806 : }
807 : else
808 : {
809 3022 : if (pg_fsync(fd) != 0)
810 : {
811 : int save_errno;
812 :
813 : /* close file upon error, might not be in transaction context */
814 0 : save_errno = errno;
815 0 : CloseTransientFile(fd);
816 0 : errno = save_errno;
817 :
818 0 : ereport(elevel,
819 : (errcode_for_file_access(),
820 : errmsg("could not fsync file \"%s\": %m", newfile)));
821 0 : return -1;
822 : }
823 :
824 3022 : if (CloseTransientFile(fd) != 0)
825 : {
826 0 : ereport(elevel,
827 : (errcode_for_file_access(),
828 : errmsg("could not close file \"%s\": %m", newfile)));
829 0 : return -1;
830 : }
831 : }
832 :
833 : /* Time to do the real deal... */
834 10160 : if (rename(oldfile, newfile) < 0)
835 : {
836 0 : ereport(elevel,
837 : (errcode_for_file_access(),
838 : errmsg("could not rename file \"%s\" to \"%s\": %m",
839 : oldfile, newfile)));
840 0 : return -1;
841 : }
842 :
843 : /*
844 : * To guarantee renaming the file is persistent, fsync the file with its
845 : * new name, and its containing directory.
846 : */
847 10160 : if (fsync_fname_ext(newfile, false, false, elevel) != 0)
848 0 : return -1;
849 :
850 10160 : if (fsync_parent_path(newfile, elevel) != 0)
851 0 : return -1;
852 :
853 10160 : return 0;
854 : }
855 :
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * After this returns successfully, the removal survives a crash; a crash
 * while it is running does not leave a mixed state.  This is achieved by
 * fsyncing the file's parent directory once the unlink itself has been done.
 *
 * Problems are logged at the caller-supplied 'elevel'.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	int			sync_rc;

	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/* the removal is only durable once the parent directory is synced */
	sync_rc = fsync_parent_path(fname, elevel);

	return (sync_rc != 0) ? -1 : 0;
}
892 :
893 : /*
894 : * InitFileAccess --- initialize this module during backend startup
895 : *
896 : * This is called during either normal or standalone backend start.
897 : * It is *not* called in the postmaster.
898 : *
899 : * Note that this does not initialize temporary file access, that is
900 : * separately initialized via InitTemporaryFileAccess().
901 : */
902 : void
903 42280 : InitFileAccess(void)
904 : {
905 : Assert(SizeVfdCache == 0); /* call me only once */
906 :
907 : /* initialize cache header entry */
908 42280 : VfdCache = (Vfd *) malloc(sizeof(Vfd));
909 42280 : if (VfdCache == NULL)
910 0 : ereport(FATAL,
911 : (errcode(ERRCODE_OUT_OF_MEMORY),
912 : errmsg("out of memory")));
913 :
914 338240 : MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
915 42280 : VfdCache->fd = VFD_CLOSED;
916 :
917 42280 : SizeVfdCache = 1;
918 42280 : }
919 :
920 : /*
921 : * InitTemporaryFileAccess --- initialize temporary file access during startup
922 : *
923 : * This is called during either normal or standalone backend start.
924 : * It is *not* called in the postmaster.
925 : *
926 : * This is separate from InitFileAccess() because temporary file cleanup can
927 : * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
928 : * our reporting has to happen before that. Low level file access should be
929 : * available for longer, hence the separate initialization / shutdown of
930 : * temporary file handling.
931 : */
932 : void
933 42280 : InitTemporaryFileAccess(void)
934 : {
935 : Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
936 : Assert(!temporary_files_allowed); /* call me only once */
937 :
938 : /*
939 : * Register before-shmem-exit hook to ensure temp files are dropped while
940 : * we can still report stats.
941 : */
942 42280 : before_shmem_exit(BeforeShmemExit_Files, 0);
943 :
944 : #ifdef USE_ASSERT_CHECKING
945 : temporary_files_allowed = true;
946 : #endif
947 42280 : }
948 :
949 : /*
950 : * count_usable_fds --- count how many FDs the system will let us open,
951 : * and estimate how many are already open.
952 : *
953 : * We stop counting if usable_fds reaches max_to_probe. Note: a small
954 : * value of max_to_probe might result in an underestimate of already_open;
955 : * we must fill in any "gaps" in the set of used FDs before the calculation
956 : * of already_open will give the right answer. In practice, max_to_probe
957 : * of a couple of dozen should be enough to ensure good results.
958 : *
959 : * We assume stderr (FD 2) is available for dup'ing. While the calling
960 : * script could theoretically close that, it would be a really bad idea,
961 : * since then one risks loss of error messages from, e.g., libc.
962 : */
963 : static void
964 2022 : count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
965 : {
966 : int *fd;
967 : int size;
968 2022 : int used = 0;
969 2022 : int highestfd = 0;
970 : int j;
971 :
972 : #ifdef HAVE_GETRLIMIT
973 : struct rlimit rlim;
974 : int getrlimit_status;
975 : #endif
976 :
977 2022 : size = 1024;
978 2022 : fd = (int *) palloc(size * sizeof(int));
979 :
980 : #ifdef HAVE_GETRLIMIT
981 2022 : getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
982 2022 : if (getrlimit_status != 0)
983 0 : ereport(WARNING, (errmsg("getrlimit failed: %m")));
984 : #endif /* HAVE_GETRLIMIT */
985 :
986 : /* dup until failure or probe limit reached */
987 : for (;;)
988 2019978 : {
989 : int thisfd;
990 :
991 : #ifdef HAVE_GETRLIMIT
992 :
993 : /*
994 : * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
995 : * some platforms
996 : */
997 2022000 : if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
998 0 : break;
999 : #endif
1000 :
1001 2022000 : thisfd = dup(2);
1002 2022000 : if (thisfd < 0)
1003 : {
1004 : /* Expect EMFILE or ENFILE, else it's fishy */
1005 0 : if (errno != EMFILE && errno != ENFILE)
1006 0 : elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1007 0 : break;
1008 : }
1009 :
1010 2022000 : if (used >= size)
1011 : {
1012 0 : size *= 2;
1013 0 : fd = (int *) repalloc(fd, size * sizeof(int));
1014 : }
1015 2022000 : fd[used++] = thisfd;
1016 :
1017 2022000 : if (highestfd < thisfd)
1018 2022000 : highestfd = thisfd;
1019 :
1020 2022000 : if (used >= max_to_probe)
1021 2022 : break;
1022 : }
1023 :
1024 : /* release the files we opened */
1025 2024022 : for (j = 0; j < used; j++)
1026 2022000 : close(fd[j]);
1027 :
1028 2022 : pfree(fd);
1029 :
1030 : /*
1031 : * Return results. usable_fds is just the number of successful dups. We
1032 : * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1033 : * number) and so already_open is highestfd+1 - usable_fds.
1034 : */
1035 2022 : *usable_fds = used;
1036 2022 : *already_open = highestfd + 1 - used;
1037 2022 : }
1038 :
1039 : /*
1040 : * set_max_safe_fds
1041 : * Determine number of file descriptors that fd.c is allowed to use
1042 : */
1043 : void
1044 2022 : set_max_safe_fds(void)
1045 : {
1046 : int usable_fds;
1047 : int already_open;
1048 :
1049 : /*----------
1050 : * We want to set max_safe_fds to
1051 : * MIN(usable_fds, max_files_per_process)
1052 : * less the slop factor for files that are opened without consulting
1053 : * fd.c. This ensures that we won't allow to open more than
1054 : * max_files_per_process, or the experimentally-determined EMFILE limit,
1055 : * additional files.
1056 : *----------
1057 : */
1058 2022 : count_usable_fds(max_files_per_process,
1059 : &usable_fds, &already_open);
1060 :
1061 2022 : max_safe_fds = Min(usable_fds, max_files_per_process);
1062 :
1063 : /*
1064 : * Take off the FDs reserved for system() etc.
1065 : */
1066 2022 : max_safe_fds -= NUM_RESERVED_FDS;
1067 :
1068 : /*
1069 : * Make sure we still have enough to get by.
1070 : */
1071 2022 : if (max_safe_fds < FD_MINFREE)
1072 0 : ereport(FATAL,
1073 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1074 : errmsg("insufficient file descriptors available to start server process"),
1075 : errdetail("System allows %d, server needs at least %d, %d files are already open.",
1076 : max_safe_fds + NUM_RESERVED_FDS,
1077 : FD_MINFREE + NUM_RESERVED_FDS,
1078 : already_open)));
1079 :
1080 2022 : elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1081 : max_safe_fds, usable_fds, already_open);
1082 2022 : }
1083 :
1084 : /*
1085 : * Open a file with BasicOpenFilePerm() and pass default file mode for the
1086 : * fileMode parameter.
1087 : */
1088 : int
1089 60664 : BasicOpenFile(const char *fileName, int fileFlags)
1090 : {
1091 60664 : return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1092 : }
1093 :
1094 : /*
1095 : * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1096 : *
1097 : * This is exported for use by places that really want a plain kernel FD,
1098 : * but need to be proof against running out of FDs. Once an FD has been
1099 : * successfully returned, it is the caller's responsibility to ensure that
1100 : * it will not be leaked on ereport()! Most users should *not* call this
1101 : * routine directly, but instead use the VFD abstraction level, which
1102 : * provides protection against descriptor leaks as well as management of
1103 : * files that need to be open for more than a short period of time.
1104 : *
1105 : * Ideally this should be the *only* direct call of open() in the backend.
1106 : * In practice, the postmaster calls open() directly, and there are some
1107 : * direct open() calls done early in backend startup. Those are OK since
1108 : * this module wouldn't have any open files to close at that point anyway.
1109 : */
1110 : int
1111 18078234 : BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1112 : {
1113 : int fd;
1114 :
1115 18078234 : tryAgain:
1116 : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1117 :
1118 : /*
1119 : * The value we defined to stand in for O_DIRECT when simulating it with
1120 : * F_NOCACHE had better not collide with any of the standard flags.
1121 : */
1122 : StaticAssertStmt((PG_O_DIRECT &
1123 : (O_APPEND |
1124 : O_CLOEXEC |
1125 : O_CREAT |
1126 : O_DSYNC |
1127 : O_EXCL |
1128 : O_RDWR |
1129 : O_RDONLY |
1130 : O_SYNC |
1131 : O_TRUNC |
1132 : O_WRONLY)) == 0,
1133 : "PG_O_DIRECT value collides with standard flag");
1134 : fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1135 : #else
1136 18078234 : fd = open(fileName, fileFlags, fileMode);
1137 : #endif
1138 :
1139 18078234 : if (fd >= 0)
1140 : {
1141 : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1142 : if (fileFlags & PG_O_DIRECT)
1143 : {
1144 : if (fcntl(fd, F_NOCACHE, 1) < 0)
1145 : {
1146 : int save_errno = errno;
1147 :
1148 : close(fd);
1149 : errno = save_errno;
1150 : return -1;
1151 : }
1152 : }
1153 : #endif
1154 :
1155 17276458 : return fd; /* success! */
1156 : }
1157 :
1158 801776 : if (errno == EMFILE || errno == ENFILE)
1159 : {
1160 0 : int save_errno = errno;
1161 :
1162 0 : ereport(LOG,
1163 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1164 : errmsg("out of file descriptors: %m; release and retry")));
1165 0 : errno = 0;
1166 0 : if (ReleaseLruFile())
1167 0 : goto tryAgain;
1168 0 : errno = save_errno;
1169 : }
1170 :
1171 801776 : return -1; /* failure */
1172 : }
1173 :
1174 : /*
1175 : * AcquireExternalFD - attempt to reserve an external file descriptor
1176 : *
1177 : * This should be used by callers that need to hold a file descriptor open
1178 : * over more than a short interval, but cannot use any of the other facilities
1179 : * provided by this module.
1180 : *
1181 : * The difference between this and the underlying ReserveExternalFD function
1182 : * is that this will report failure (by setting errno and returning false)
1183 : * if "too many" external FDs are already reserved. This should be used in
1184 : * any code where the total number of FDs to be reserved is not predictable
1185 : * and small.
1186 : */
1187 : bool
1188 197570 : AcquireExternalFD(void)
1189 : {
1190 : /*
1191 : * We don't want more than max_safe_fds / 3 FDs to be consumed for
1192 : * "external" FDs.
1193 : */
1194 197570 : if (numExternalFDs < max_safe_fds / 3)
1195 : {
1196 197570 : ReserveExternalFD();
1197 197570 : return true;
1198 : }
1199 0 : errno = EMFILE;
1200 0 : return false;
1201 : }
1202 :
1203 : /*
1204 : * ReserveExternalFD - report external consumption of a file descriptor
1205 : *
1206 : * This should be used by callers that need to hold a file descriptor open
1207 : * over more than a short interval, but cannot use any of the other facilities
1208 : * provided by this module. This just tracks the use of the FD and closes
1209 : * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1210 : *
1211 : * Call this directly only in code where failure to reserve the FD would be
1212 : * fatal; for example, the WAL-writing code does so, since the alternative is
1213 : * session failure. Also, it's very unwise to do so in code that could
1214 : * consume more than one FD per process.
1215 : *
1216 : * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1217 : * available, it doesn't matter too much whether this is called before or
1218 : * after actually opening the FD; but doing so beforehand reduces the risk of
1219 : * an EMFILE failure if not everybody played nice. In any case, it's solely
1220 : * caller's responsibility to keep the external-FD count in sync with reality.
1221 : */
1222 : void
1223 343982 : ReserveExternalFD(void)
1224 : {
1225 : /*
1226 : * Release VFDs if needed to stay safe. Because we do this before
1227 : * incrementing numExternalFDs, the final state will be as desired, i.e.,
1228 : * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1229 : */
1230 343982 : ReleaseLruFiles();
1231 :
1232 343982 : numExternalFDs++;
1233 343982 : }
1234 :
1235 : /*
1236 : * ReleaseExternalFD - report release of an external file descriptor
1237 : *
1238 : * This is guaranteed not to change errno, so it can be used in failure paths.
1239 : */
1240 : void
1241 304868 : ReleaseExternalFD(void)
1242 : {
1243 : Assert(numExternalFDs > 0);
1244 304868 : numExternalFDs--;
1245 304868 : }
1246 :
1247 :
1248 : #if defined(FDDEBUG)
1249 :
1250 : static void
1251 : _dump_lru(void)
1252 : {
1253 : int mru = VfdCache[0].lruLessRecently;
1254 : Vfd *vfdP = &VfdCache[mru];
1255 : char buf[2048];
1256 :
1257 : snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1258 : while (mru != 0)
1259 : {
1260 : mru = vfdP->lruLessRecently;
1261 : vfdP = &VfdCache[mru];
1262 : snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1263 : }
1264 : snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1265 : elog(LOG, "%s", buf);
1266 : }
1267 : #endif /* FDDEBUG */
1268 :
1269 : static void
1270 2494766 : Delete(File file)
1271 : {
1272 : Vfd *vfdP;
1273 :
1274 : Assert(file != 0);
1275 :
1276 : DO_DB(elog(LOG, "Delete %d (%s)",
1277 : file, VfdCache[file].fileName));
1278 : DO_DB(_dump_lru());
1279 :
1280 2494766 : vfdP = &VfdCache[file];
1281 :
1282 2494766 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1283 2494766 : VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1284 :
1285 : DO_DB(_dump_lru());
1286 2494766 : }
1287 :
1288 : static void
1289 6854 : LruDelete(File file)
1290 : {
1291 : Vfd *vfdP;
1292 :
1293 : Assert(file != 0);
1294 :
1295 : DO_DB(elog(LOG, "LruDelete %d (%s)",
1296 : file, VfdCache[file].fileName));
1297 :
1298 6854 : vfdP = &VfdCache[file];
1299 :
1300 6854 : pgaio_closing_fd(vfdP->fd);
1301 :
1302 : /*
1303 : * Close the file. We aren't expecting this to fail; if it does, better
1304 : * to leak the FD than to mess up our internal state.
1305 : */
1306 6854 : if (close(vfdP->fd) != 0)
1307 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1308 : "could not close file \"%s\": %m", vfdP->fileName);
1309 6854 : vfdP->fd = VFD_CLOSED;
1310 6854 : --nfile;
1311 :
1312 : /* delete the vfd record from the LRU ring */
1313 6854 : Delete(file);
1314 6854 : }
1315 :
1316 : static void
1317 3290210 : Insert(File file)
1318 : {
1319 : Vfd *vfdP;
1320 :
1321 : Assert(file != 0);
1322 :
1323 : DO_DB(elog(LOG, "Insert %d (%s)",
1324 : file, VfdCache[file].fileName));
1325 : DO_DB(_dump_lru());
1326 :
1327 3290210 : vfdP = &VfdCache[file];
1328 :
1329 3290210 : vfdP->lruMoreRecently = 0;
1330 3290210 : vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1331 3290210 : VfdCache[0].lruLessRecently = file;
1332 3290210 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1333 :
1334 : DO_DB(_dump_lru());
1335 3290210 : }
1336 :
1337 : /* returns 0 on success, -1 on re-open failure (with errno set) */
1338 : static int
1339 126 : LruInsert(File file)
1340 : {
1341 : Vfd *vfdP;
1342 :
1343 : Assert(file != 0);
1344 :
1345 : DO_DB(elog(LOG, "LruInsert %d (%s)",
1346 : file, VfdCache[file].fileName));
1347 :
1348 126 : vfdP = &VfdCache[file];
1349 :
1350 126 : if (FileIsNotOpen(file))
1351 : {
1352 : /* Close excess kernel FDs. */
1353 126 : ReleaseLruFiles();
1354 :
1355 : /*
1356 : * The open could still fail for lack of file descriptors, eg due to
1357 : * overall system file table being full. So, be prepared to release
1358 : * another FD if necessary...
1359 : */
1360 126 : vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1361 : vfdP->fileMode);
1362 126 : if (vfdP->fd < 0)
1363 : {
1364 : DO_DB(elog(LOG, "re-open failed: %m"));
1365 0 : return -1;
1366 : }
1367 : else
1368 : {
1369 126 : ++nfile;
1370 : }
1371 : }
1372 :
1373 : /*
1374 : * put it at the head of the Lru ring
1375 : */
1376 :
1377 126 : Insert(file);
1378 :
1379 126 : return 0;
1380 : }
1381 :
1382 : /*
1383 : * Release one kernel FD by closing the least-recently-used VFD.
1384 : */
1385 : static bool
1386 6572 : ReleaseLruFile(void)
1387 : {
1388 : DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1389 :
1390 6572 : if (nfile > 0)
1391 : {
1392 : /*
1393 : * There are opened files and so there should be at least one used vfd
1394 : * in the ring.
1395 : */
1396 : Assert(VfdCache[0].lruMoreRecently != 0);
1397 6572 : LruDelete(VfdCache[0].lruMoreRecently);
1398 6572 : return true; /* freed a file */
1399 : }
1400 0 : return false; /* no files available to free */
1401 : }
1402 :
1403 : /*
1404 : * Release kernel FDs as needed to get under the max_safe_fds limit.
1405 : * After calling this, it's OK to try to open another file.
1406 : */
1407 : static void
1408 18606944 : ReleaseLruFiles(void)
1409 : {
1410 18613516 : while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
1411 : {
1412 6572 : if (!ReleaseLruFile())
1413 0 : break;
1414 : }
1415 18606944 : }
1416 :
1417 : static File
1418 2628302 : AllocateVfd(void)
1419 : {
1420 : Index i;
1421 : File file;
1422 :
1423 : DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1424 :
1425 : Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1426 :
1427 2628302 : if (VfdCache[0].nextFree == 0)
1428 : {
1429 : /*
1430 : * The free list is empty so it is time to increase the size of the
1431 : * array. We choose to double it each time this happens. However,
1432 : * there's not much point in starting *real* small.
1433 : */
1434 49324 : Size newCacheSize = SizeVfdCache * 2;
1435 : Vfd *newVfdCache;
1436 :
1437 49324 : if (newCacheSize < 32)
1438 36484 : newCacheSize = 32;
1439 :
1440 : /*
1441 : * Be careful not to clobber VfdCache ptr if realloc fails.
1442 : */
1443 49324 : newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1444 49324 : if (newVfdCache == NULL)
1445 0 : ereport(ERROR,
1446 : (errcode(ERRCODE_OUT_OF_MEMORY),
1447 : errmsg("out of memory")));
1448 49324 : VfdCache = newVfdCache;
1449 :
1450 : /*
1451 : * Initialize the new entries and link them into the free list.
1452 : */
1453 2346792 : for (i = SizeVfdCache; i < newCacheSize; i++)
1454 : {
1455 18379744 : MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
1456 2297468 : VfdCache[i].nextFree = i + 1;
1457 2297468 : VfdCache[i].fd = VFD_CLOSED;
1458 : }
1459 49324 : VfdCache[newCacheSize - 1].nextFree = 0;
1460 49324 : VfdCache[0].nextFree = SizeVfdCache;
1461 :
1462 : /*
1463 : * Record the new size
1464 : */
1465 49324 : SizeVfdCache = newCacheSize;
1466 : }
1467 :
1468 2628302 : file = VfdCache[0].nextFree;
1469 :
1470 2628302 : VfdCache[0].nextFree = VfdCache[file].nextFree;
1471 :
1472 2628302 : return file;
1473 : }
1474 :
1475 : static void
1476 1828174 : FreeVfd(File file)
1477 : {
1478 1828174 : Vfd *vfdP = &VfdCache[file];
1479 :
1480 : DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1481 : file, vfdP->fileName ? vfdP->fileName : ""));
1482 :
1483 1828174 : if (vfdP->fileName != NULL)
1484 : {
1485 1037266 : free(vfdP->fileName);
1486 1037266 : vfdP->fileName = NULL;
1487 : }
1488 1828174 : vfdP->fdstate = 0x0;
1489 :
1490 1828174 : vfdP->nextFree = VfdCache[0].nextFree;
1491 1828174 : VfdCache[0].nextFree = file;
1492 1828174 : }
1493 :
1494 : /* returns 0 on success, -1 on re-open failure (with errno set) */
1495 : static int
1496 5972416 : FileAccess(File file)
1497 : {
1498 : int returnValue;
1499 :
1500 : DO_DB(elog(LOG, "FileAccess %d (%s)",
1501 : file, VfdCache[file].fileName));
1502 :
1503 : /*
1504 : * Is the file open? If not, open it and put it at the head of the LRU
1505 : * ring (possibly closing the least recently used file to get an FD).
1506 : */
1507 :
1508 5972416 : if (FileIsNotOpen(file))
1509 : {
1510 126 : returnValue = LruInsert(file);
1511 126 : if (returnValue != 0)
1512 0 : return returnValue;
1513 : }
1514 5972290 : else if (VfdCache[0].lruLessRecently != file)
1515 : {
1516 : /*
1517 : * We now know that the file is open and that it is not the last one
1518 : * accessed, so we need to move it to the head of the Lru ring.
1519 : */
1520 :
1521 1452690 : Delete(file);
1522 1452690 : Insert(file);
1523 : }
1524 :
1525 5972416 : return 0;
1526 : }
1527 :
1528 : /*
1529 : * Called whenever a temporary file is deleted to report its size.
1530 : */
1531 : static void
1532 3786 : ReportTemporaryFileUsage(const char *path, off_t size)
1533 : {
1534 3786 : pgstat_report_tempfile(size);
1535 :
1536 3786 : if (log_temp_files >= 0)
1537 : {
1538 1162 : if ((size / 1024) >= log_temp_files)
1539 230 : ereport(LOG,
1540 : (errmsg("temporary file: path \"%s\", size %lu",
1541 : path, (unsigned long) size)));
1542 : }
1543 3786 : }
1544 :
1545 : /*
1546 : * Called to register a temporary file for automatic close.
1547 : * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1548 : * before the file was opened.
1549 : */
1550 : static void
1551 6436 : RegisterTemporaryFile(File file)
1552 : {
1553 6436 : ResourceOwnerRememberFile(CurrentResourceOwner, file);
1554 6436 : VfdCache[file].resowner = CurrentResourceOwner;
1555 :
1556 : /* Backup mechanism for closing at end of xact. */
1557 6436 : VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1558 6436 : have_xact_temporary_files = true;
1559 6436 : }
1560 :
1561 : /*
1562 : * Called when we get a shared invalidation message on some relation.
1563 : */
1564 : #ifdef NOT_USED
1565 : void
1566 : FileInvalidate(File file)
1567 : {
1568 : Assert(FileIsValid(file));
1569 : if (!FileIsNotOpen(file))
1570 : LruDelete(file);
1571 : }
1572 : #endif
1573 :
1574 : /*
1575 : * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1576 : * fileMode parameter.
1577 : */
1578 : File
1579 2628302 : PathNameOpenFile(const char *fileName, int fileFlags)
1580 : {
1581 2628302 : return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1582 : }
1583 :
1584 : /*
1585 : * open a file in an arbitrary directory
1586 : *
1587 : * NB: if the passed pathname is relative (which it usually is),
1588 : * it will be interpreted relative to the process' working directory
1589 : * (which should always be $PGDATA when this code is running).
1590 : */
1591 : File
1592 2628302 : PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1593 : {
1594 : char *fnamecopy;
1595 : File file;
1596 : Vfd *vfdP;
1597 :
1598 : DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1599 : fileName, fileFlags, fileMode));
1600 :
1601 : /*
1602 : * We need a malloc'd copy of the file name; fail cleanly if no room.
1603 : */
1604 2628302 : fnamecopy = strdup(fileName);
1605 2628302 : if (fnamecopy == NULL)
1606 0 : ereport(ERROR,
1607 : (errcode(ERRCODE_OUT_OF_MEMORY),
1608 : errmsg("out of memory")));
1609 :
1610 2628302 : file = AllocateVfd();
1611 2628302 : vfdP = &VfdCache[file];
1612 :
1613 : /* Close excess kernel FDs. */
1614 2628302 : ReleaseLruFiles();
1615 :
1616 : /*
1617 : * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1618 : * client shouldn't be expected to know which kernel descriptors are
1619 : * currently open, so it wouldn't make sense for them to be inherited by
1620 : * executed subprograms.
1621 : */
1622 2628302 : fileFlags |= O_CLOEXEC;
1623 :
1624 2628302 : vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1625 :
1626 2628302 : if (vfdP->fd < 0)
1627 : {
1628 790908 : int save_errno = errno;
1629 :
1630 790908 : FreeVfd(file);
1631 790908 : free(fnamecopy);
1632 790908 : errno = save_errno;
1633 790908 : return -1;
1634 : }
1635 1837394 : ++nfile;
1636 : DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1637 : vfdP->fd));
1638 :
1639 1837394 : vfdP->fileName = fnamecopy;
1640 : /* Saved flags are adjusted to be OK for re-opening file */
1641 1837394 : vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1642 1837394 : vfdP->fileMode = fileMode;
1643 1837394 : vfdP->fileSize = 0;
1644 1837394 : vfdP->fdstate = 0x0;
1645 1837394 : vfdP->resowner = NULL;
1646 :
1647 1837394 : Insert(file);
1648 :
1649 1837394 : return file;
1650 : }
1651 :
1652 : /*
1653 : * Create directory 'directory'. If necessary, create 'basedir', which must
1654 : * be the directory above it. This is designed for creating the top-level
1655 : * temporary directory on demand before creating a directory underneath it.
1656 : * Do nothing if the directory already exists.
1657 : *
1658 : * Directories created within the top-level temporary directory should begin
1659 : * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1660 : * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1661 : * that do not need any particular prefix.
1662 : */
1663 : void
1664 344 : PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1665 : {
1666 344 : if (MakePGDirectory(directory) < 0)
1667 : {
1668 26 : if (errno == EEXIST)
1669 6 : return;
1670 :
1671 : /*
1672 : * Failed. Try to create basedir first in case it's missing. Tolerate
1673 : * EEXIST to close a race against another process following the same
1674 : * algorithm.
1675 : */
1676 20 : if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1677 0 : ereport(ERROR,
1678 : (errcode_for_file_access(),
1679 : errmsg("cannot create temporary directory \"%s\": %m",
1680 : basedir)));
1681 :
1682 : /* Try again. */
1683 20 : if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1684 0 : ereport(ERROR,
1685 : (errcode_for_file_access(),
1686 : errmsg("cannot create temporary subdirectory \"%s\": %m",
1687 : directory)));
1688 : }
1689 : }
1690 :
1691 : /*
1692 : * Delete a directory and everything in it, if it exists.
1693 : */
1694 : void
1695 418 : PathNameDeleteTemporaryDir(const char *dirname)
1696 : {
1697 : struct stat statbuf;
1698 :
1699 : /* Silently ignore missing directory. */
1700 418 : if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1701 80 : return;
1702 :
1703 : /*
1704 : * Currently, walkdir doesn't offer a way for our passed in function to
1705 : * maintain state. Perhaps it should, so that we could tell the caller
1706 : * whether this operation succeeded or failed. Since this operation is
1707 : * used in a cleanup path, we wouldn't actually behave differently: we'll
1708 : * just log failures.
1709 : */
1710 338 : walkdir(dirname, unlink_if_exists_fname, false, LOG);
1711 : }
1712 :
1713 : /*
1714 : * Open a temporary file that will disappear when we close it.
1715 : *
1716 : * This routine takes care of generating an appropriate tempfile name.
1717 : * There's no need to pass in fileFlags or fileMode either, since only
1718 : * one setting makes any sense for a temp file.
1719 : *
1720 : * Unless interXact is true, the file is remembered by CurrentResourceOwner
1721 : * to ensure it's closed and deleted when it's no longer needed, typically at
1722 : * the end-of-transaction. In most cases, you don't want temporary files to
1723 : * outlive the transaction that created them, so this should be false -- but
1724 : * if you need "somewhat" temporary storage, this might be useful. In either
1725 : * case, the file is removed when the File is explicitly closed.
1726 : */
1727 : File
1728 2046 : OpenTemporaryFile(bool interXact)
1729 : {
1730 2046 : File file = 0;
1731 :
1732 : Assert(temporary_files_allowed); /* check temp file access is up */
1733 :
1734 : /*
1735 : * Make sure the current resource owner has space for this File before we
1736 : * open it, if we'll be registering it below.
1737 : */
1738 2046 : if (!interXact)
1739 2046 : ResourceOwnerEnlarge(CurrentResourceOwner);
1740 :
1741 : /*
1742 : * If some temp tablespace(s) have been given to us, try to use the next
1743 : * one. If a given tablespace can't be found, we silently fall back to
1744 : * the database's default tablespace.
1745 : *
1746 : * BUT: if the temp file is slated to outlive the current transaction,
1747 : * force it into the database's default tablespace, so that it will not
1748 : * pose a threat to possible tablespace drop attempts.
1749 : */
1750 2046 : if (numTempTableSpaces > 0 && !interXact)
1751 : {
1752 2 : Oid tblspcOid = GetNextTempTableSpace();
1753 :
1754 2 : if (OidIsValid(tblspcOid))
1755 2 : file = OpenTemporaryFileInTablespace(tblspcOid, false);
1756 : }
1757 :
1758 : /*
1759 : * If not, or if tablespace is bad, create in database's default
1760 : * tablespace. MyDatabaseTableSpace should normally be set before we get
1761 : * here, but just in case it isn't, fall back to pg_default tablespace.
1762 : */
1763 2046 : if (file <= 0)
1764 2044 : file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1765 : MyDatabaseTableSpace :
1766 : DEFAULTTABLESPACE_OID,
1767 : true);
1768 :
1769 : /* Mark it for deletion at close and temporary file size limit */
1770 2046 : VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1771 :
1772 : /* Register it with the current resource owner */
1773 2046 : if (!interXact)
1774 2046 : RegisterTemporaryFile(file);
1775 :
1776 2046 : return file;
1777 : }
1778 :
1779 : /*
1780 : * Return the path of the temp directory in a given tablespace.
1781 : */
1782 : void
1783 13260 : TempTablespacePath(char *path, Oid tablespace)
1784 : {
1785 : /*
1786 : * Identify the tempfile directory for this tablespace.
1787 : *
1788 : * If someone tries to specify pg_global, use pg_default instead.
1789 : */
1790 13260 : if (tablespace == InvalidOid ||
1791 2 : tablespace == DEFAULTTABLESPACE_OID ||
1792 : tablespace == GLOBALTABLESPACE_OID)
1793 13258 : snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1794 : else
1795 : {
1796 : /* All other tablespaces are accessed via symlinks */
1797 2 : snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1798 : PG_TBLSPC_DIR, tablespace, TABLESPACE_VERSION_DIRECTORY,
1799 : PG_TEMP_FILES_DIR);
1800 : }
1801 13260 : }
1802 :
1803 : /*
1804 : * Open a temporary file in a specific tablespace.
1805 : * Subroutine for OpenTemporaryFile, which see for details.
1806 : */
1807 : static File
1808 2046 : OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1809 : {
1810 : char tempdirpath[MAXPGPATH];
1811 : char tempfilepath[MAXPGPATH];
1812 : File file;
1813 :
1814 2046 : TempTablespacePath(tempdirpath, tblspcOid);
1815 :
1816 : /*
1817 : * Generate a tempfile name that should be unique within the current
1818 : * database instance.
1819 : */
1820 2046 : snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1821 : tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1822 :
1823 : /*
1824 : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1825 : * temp file that can be reused.
1826 : */
1827 2046 : file = PathNameOpenFile(tempfilepath,
1828 : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1829 2046 : if (file <= 0)
1830 : {
1831 : /*
1832 : * We might need to create the tablespace's tempfile directory, if no
1833 : * one has yet done so.
1834 : *
1835 : * Don't check for an error from MakePGDirectory; it could fail if
1836 : * someone else just did the same thing. If it doesn't work then
1837 : * we'll bomb out on the second create attempt, instead.
1838 : */
1839 182 : (void) MakePGDirectory(tempdirpath);
1840 :
1841 182 : file = PathNameOpenFile(tempfilepath,
1842 : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1843 182 : if (file <= 0 && rejectError)
1844 0 : elog(ERROR, "could not create temporary file \"%s\": %m",
1845 : tempfilepath);
1846 : }
1847 :
1848 2046 : return file;
1849 : }
1850 :
1851 :
1852 : /*
1853 : * Create a new file. The directory containing it must already exist. Files
1854 : * created this way are subject to temp_file_limit and are automatically
1855 : * closed at end of transaction, but are not automatically deleted on close
1856 : * because they are intended to be shared between cooperating backends.
1857 : *
1858 : * If the file is inside the top-level temporary directory, its name should
1859 : * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1860 : * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1861 : * inside a directory created with PathNameCreateTemporaryDir(), in which case
1862 : * the prefix isn't needed.
1863 : */
1864 : File
1865 2084 : PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1866 : {
1867 : File file;
1868 :
1869 : Assert(temporary_files_allowed); /* check temp file access is up */
1870 :
1871 2084 : ResourceOwnerEnlarge(CurrentResourceOwner);
1872 :
1873 : /*
1874 : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1875 : * temp file that can be reused.
1876 : */
1877 2084 : file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1878 2084 : if (file <= 0)
1879 : {
1880 344 : if (error_on_failure)
1881 0 : ereport(ERROR,
1882 : (errcode_for_file_access(),
1883 : errmsg("could not create temporary file \"%s\": %m",
1884 : path)));
1885 : else
1886 344 : return file;
1887 : }
1888 :
1889 : /* Mark it for temp_file_limit accounting. */
1890 1740 : VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1891 :
1892 : /* Register it for automatic close. */
1893 1740 : RegisterTemporaryFile(file);
1894 :
1895 1740 : return file;
1896 : }
1897 :
1898 : /*
1899 : * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1900 : * another backend. Files opened this way don't count against the
1901 : * temp_file_limit of the caller, are automatically closed at the end of the
1902 : * transaction but are not deleted on close.
1903 : */
1904 : File
1905 5846 : PathNameOpenTemporaryFile(const char *path, int mode)
1906 : {
1907 : File file;
1908 :
1909 : Assert(temporary_files_allowed); /* check temp file access is up */
1910 :
1911 5846 : ResourceOwnerEnlarge(CurrentResourceOwner);
1912 :
1913 5846 : file = PathNameOpenFile(path, mode | PG_BINARY);
1914 :
1915 : /* If no such file, then we don't raise an error. */
1916 5846 : if (file <= 0 && errno != ENOENT)
1917 0 : ereport(ERROR,
1918 : (errcode_for_file_access(),
1919 : errmsg("could not open temporary file \"%s\": %m",
1920 : path)));
1921 :
1922 5846 : if (file > 0)
1923 : {
1924 : /* Register it for automatic close. */
1925 2650 : RegisterTemporaryFile(file);
1926 : }
1927 :
1928 5846 : return file;
1929 : }
1930 :
1931 : /*
1932 : * Delete a file by pathname. Return true if the file existed, false if
1933 : * didn't.
1934 : */
1935 : bool
1936 4184 : PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1937 : {
1938 : struct stat filestats;
1939 : int stat_errno;
1940 :
1941 : /* Get the final size for pgstat reporting. */
1942 4184 : if (stat(path, &filestats) != 0)
1943 2444 : stat_errno = errno;
1944 : else
1945 1740 : stat_errno = 0;
1946 :
1947 : /*
1948 : * Unlike FileClose's automatic file deletion code, we tolerate
1949 : * non-existence to support BufFileDeleteFileSet which doesn't know how
1950 : * many segments it has to delete until it runs out.
1951 : */
1952 4184 : if (stat_errno == ENOENT)
1953 2444 : return false;
1954 :
1955 1740 : if (unlink(path) < 0)
1956 : {
1957 0 : if (errno != ENOENT)
1958 0 : ereport(error_on_failure ? ERROR : LOG,
1959 : (errcode_for_file_access(),
1960 : errmsg("could not unlink temporary file \"%s\": %m",
1961 : path)));
1962 0 : return false;
1963 : }
1964 :
1965 1740 : if (stat_errno == 0)
1966 1740 : ReportTemporaryFileUsage(path, filestats.st_size);
1967 : else
1968 : {
1969 0 : errno = stat_errno;
1970 0 : ereport(LOG,
1971 : (errcode_for_file_access(),
1972 : errmsg("could not stat file \"%s\": %m", path)));
1973 : }
1974 :
1975 1740 : return true;
1976 : }
1977 :
1978 : /*
1979 : * close a file when done with it
1980 : */
1981 : void
1982 1037266 : FileClose(File file)
1983 : {
1984 : Vfd *vfdP;
1985 :
1986 : Assert(FileIsValid(file));
1987 :
1988 : DO_DB(elog(LOG, "FileClose: %d (%s)",
1989 : file, VfdCache[file].fileName));
1990 :
1991 1037266 : vfdP = &VfdCache[file];
1992 :
1993 1037266 : if (!FileIsNotOpen(file))
1994 : {
1995 1035222 : pgaio_closing_fd(vfdP->fd);
1996 :
1997 : /* close the file */
1998 1035222 : if (close(vfdP->fd) != 0)
1999 : {
2000 : /*
2001 : * We may need to panic on failure to close non-temporary files;
2002 : * see LruDelete.
2003 : */
2004 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
2005 : "could not close file \"%s\": %m", vfdP->fileName);
2006 : }
2007 :
2008 1035222 : --nfile;
2009 1035222 : vfdP->fd = VFD_CLOSED;
2010 :
2011 : /* remove the file from the lru ring */
2012 1035222 : Delete(file);
2013 : }
2014 :
2015 1037266 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2016 : {
2017 : /* Subtract its size from current usage (do first in case of error) */
2018 3786 : temporary_files_size -= vfdP->fileSize;
2019 3786 : vfdP->fileSize = 0;
2020 : }
2021 :
2022 : /*
2023 : * Delete the file if it was temporary, and make a log entry if wanted
2024 : */
2025 1037266 : if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2026 : {
2027 : struct stat filestats;
2028 : int stat_errno;
2029 :
2030 : /*
2031 : * If we get an error, as could happen within the ereport/elog calls,
2032 : * we'll come right back here during transaction abort. Reset the
2033 : * flag to ensure that we can't get into an infinite loop. This code
2034 : * is arranged to ensure that the worst-case consequence is failing to
2035 : * emit log message(s), not failing to attempt the unlink.
2036 : */
2037 2046 : vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2038 :
2039 :
2040 : /* first try the stat() */
2041 2046 : if (stat(vfdP->fileName, &filestats))
2042 0 : stat_errno = errno;
2043 : else
2044 2046 : stat_errno = 0;
2045 :
2046 : /* in any case do the unlink */
2047 2046 : if (unlink(vfdP->fileName))
2048 0 : ereport(LOG,
2049 : (errcode_for_file_access(),
2050 : errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2051 :
2052 : /* and last report the stat results */
2053 2046 : if (stat_errno == 0)
2054 2046 : ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2055 : else
2056 : {
2057 0 : errno = stat_errno;
2058 0 : ereport(LOG,
2059 : (errcode_for_file_access(),
2060 : errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2061 : }
2062 : }
2063 :
2064 : /* Unregister it from the resource owner */
2065 1037266 : if (vfdP->resowner)
2066 6426 : ResourceOwnerForgetFile(vfdP->resowner, file);
2067 :
2068 : /*
2069 : * Return the Vfd slot to the free list
2070 : */
2071 1037266 : FreeVfd(file);
2072 1037266 : }
2073 :
2074 : /*
2075 : * FilePrefetch - initiate asynchronous read of a given range of the file.
2076 : *
2077 : * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2078 : *
2079 : * posix_fadvise() is the simplest standardized interface that accomplishes
2080 : * this.
2081 : */
2082 : int
2083 16742 : FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
2084 : {
2085 : Assert(FileIsValid(file));
2086 :
2087 : DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2088 : file, VfdCache[file].fileName,
2089 : (int64) offset, (int64) amount));
2090 :
2091 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2092 : {
2093 : int returnCode;
2094 :
2095 16742 : returnCode = FileAccess(file);
2096 16742 : if (returnCode < 0)
2097 0 : return returnCode;
2098 :
2099 16742 : retry:
2100 16742 : pgstat_report_wait_start(wait_event_info);
2101 16742 : returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2102 : POSIX_FADV_WILLNEED);
2103 16742 : pgstat_report_wait_end();
2104 :
2105 16742 : if (returnCode == EINTR)
2106 0 : goto retry;
2107 :
2108 16742 : return returnCode;
2109 : }
2110 : #elif defined(__darwin__)
2111 : {
2112 : struct radvisory
2113 : {
2114 : off_t ra_offset; /* offset into the file */
2115 : int ra_count; /* size of the read */
2116 : } ra;
2117 : int returnCode;
2118 :
2119 : returnCode = FileAccess(file);
2120 : if (returnCode < 0)
2121 : return returnCode;
2122 :
2123 : ra.ra_offset = offset;
2124 : ra.ra_count = amount;
2125 : pgstat_report_wait_start(wait_event_info);
2126 : returnCode = fcntl(VfdCache[file].fd, F_RDADVISE, &ra);
2127 : pgstat_report_wait_end();
2128 : if (returnCode != -1)
2129 : return 0;
2130 : else
2131 : return errno;
2132 : }
2133 : #else
2134 : return 0;
2135 : #endif
2136 : }
2137 :
2138 : void
2139 0 : FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2140 : {
2141 : int returnCode;
2142 :
2143 : Assert(FileIsValid(file));
2144 :
2145 : DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2146 : file, VfdCache[file].fileName,
2147 : (int64) offset, (int64) nbytes));
2148 :
2149 0 : if (nbytes <= 0)
2150 0 : return;
2151 :
2152 0 : if (VfdCache[file].fileFlags & PG_O_DIRECT)
2153 0 : return;
2154 :
2155 0 : returnCode = FileAccess(file);
2156 0 : if (returnCode < 0)
2157 0 : return;
2158 :
2159 0 : pgstat_report_wait_start(wait_event_info);
2160 0 : pg_flush_data(VfdCache[file].fd, offset, nbytes);
2161 0 : pgstat_report_wait_end();
2162 : }
2163 :
2164 : ssize_t
2165 807890 : FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2166 : uint32 wait_event_info)
2167 : {
2168 : ssize_t returnCode;
2169 : Vfd *vfdP;
2170 :
2171 : Assert(FileIsValid(file));
2172 :
2173 : DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2174 : file, VfdCache[file].fileName,
2175 : (int64) offset,
2176 : iovcnt));
2177 :
2178 807890 : returnCode = FileAccess(file);
2179 807890 : if (returnCode < 0)
2180 0 : return returnCode;
2181 :
2182 807890 : vfdP = &VfdCache[file];
2183 :
2184 807890 : retry:
2185 807890 : pgstat_report_wait_start(wait_event_info);
2186 807890 : returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2187 807890 : pgstat_report_wait_end();
2188 :
2189 807890 : if (returnCode < 0)
2190 : {
2191 : /*
2192 : * Windows may run out of kernel buffers and return "Insufficient
2193 : * system resources" error. Wait a bit and retry to solve it.
2194 : *
2195 : * It is rumored that EINTR is also possible on some Unix filesystems,
2196 : * in which case immediate retry is indicated.
2197 : */
2198 : #ifdef WIN32
2199 : DWORD error = GetLastError();
2200 :
2201 : switch (error)
2202 : {
2203 : case ERROR_NO_SYSTEM_RESOURCES:
2204 : pg_usleep(1000L);
2205 : errno = EINTR;
2206 : break;
2207 : default:
2208 : _dosmaperr(error);
2209 : break;
2210 : }
2211 : #endif
2212 : /* OK to retry if interrupted */
2213 0 : if (errno == EINTR)
2214 0 : goto retry;
2215 : }
2216 :
2217 807890 : return returnCode;
2218 : }
2219 :
2220 : int
2221 2413652 : FileStartReadV(PgAioHandle *ioh, File file,
2222 : int iovcnt, off_t offset,
2223 : uint32 wait_event_info)
2224 : {
2225 : int returnCode;
2226 : Vfd *vfdP;
2227 :
2228 : Assert(FileIsValid(file));
2229 :
2230 : DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
2231 : file, VfdCache[file].fileName,
2232 : (int64) offset,
2233 : iovcnt));
2234 :
2235 2413652 : returnCode = FileAccess(file);
2236 2413652 : if (returnCode < 0)
2237 0 : return returnCode;
2238 :
2239 2413652 : vfdP = &VfdCache[file];
2240 :
2241 2413652 : pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
2242 :
2243 2413652 : return 0;
2244 : }
2245 :
2246 : ssize_t
2247 1366636 : FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2248 : uint32 wait_event_info)
2249 : {
2250 : ssize_t returnCode;
2251 : Vfd *vfdP;
2252 :
2253 : Assert(FileIsValid(file));
2254 :
2255 : DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2256 : file, VfdCache[file].fileName,
2257 : (int64) offset,
2258 : iovcnt));
2259 :
2260 1366636 : returnCode = FileAccess(file);
2261 1366636 : if (returnCode < 0)
2262 0 : return returnCode;
2263 :
2264 1366636 : vfdP = &VfdCache[file];
2265 :
2266 : /*
2267 : * If enforcing temp_file_limit and it's a temp file, check to see if the
2268 : * write would overrun temp_file_limit, and throw error if so. Note: it's
2269 : * really a modularity violation to throw error here; we should set errno
2270 : * and return -1. However, there's no way to report a suitable error
2271 : * message if we do that. All current callers would just throw error
2272 : * immediately anyway, so this is safe at present.
2273 : */
2274 1366636 : if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2275 : {
2276 0 : off_t past_write = offset;
2277 :
2278 0 : for (int i = 0; i < iovcnt; ++i)
2279 0 : past_write += iov[i].iov_len;
2280 :
2281 0 : if (past_write > vfdP->fileSize)
2282 : {
2283 0 : uint64 newTotal = temporary_files_size;
2284 :
2285 0 : newTotal += past_write - vfdP->fileSize;
2286 0 : if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2287 0 : ereport(ERROR,
2288 : (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2289 : errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2290 : temp_file_limit)));
2291 : }
2292 : }
2293 :
2294 1366636 : retry:
2295 1366636 : pgstat_report_wait_start(wait_event_info);
2296 1366636 : returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2297 1366636 : pgstat_report_wait_end();
2298 :
2299 1366636 : if (returnCode >= 0)
2300 : {
2301 : /*
2302 : * Some callers expect short writes to set errno, and traditionally we
2303 : * have assumed that they imply disk space shortage. We don't want to
2304 : * waste CPU cycles adding up the total size here, so we'll just set
2305 : * it for all successful writes in case such a caller determines that
2306 : * the write was short and ereports "%m".
2307 : */
2308 1366636 : errno = ENOSPC;
2309 :
2310 : /*
2311 : * Maintain fileSize and temporary_files_size if it's a temp file.
2312 : */
2313 1366636 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2314 : {
2315 106238 : off_t past_write = offset + returnCode;
2316 :
2317 106238 : if (past_write > vfdP->fileSize)
2318 : {
2319 72898 : temporary_files_size += past_write - vfdP->fileSize;
2320 72898 : vfdP->fileSize = past_write;
2321 : }
2322 : }
2323 : }
2324 : else
2325 : {
2326 : /*
2327 : * See comments in FileReadV()
2328 : */
2329 : #ifdef WIN32
2330 : DWORD error = GetLastError();
2331 :
2332 : switch (error)
2333 : {
2334 : case ERROR_NO_SYSTEM_RESOURCES:
2335 : pg_usleep(1000L);
2336 : errno = EINTR;
2337 : break;
2338 : default:
2339 : _dosmaperr(error);
2340 : break;
2341 : }
2342 : #endif
2343 : /* OK to retry if interrupted */
2344 0 : if (errno == EINTR)
2345 0 : goto retry;
2346 : }
2347 :
2348 1366636 : return returnCode;
2349 : }
2350 :
2351 : int
2352 1122 : FileSync(File file, uint32 wait_event_info)
2353 : {
2354 : int returnCode;
2355 :
2356 : Assert(FileIsValid(file));
2357 :
2358 : DO_DB(elog(LOG, "FileSync: %d (%s)",
2359 : file, VfdCache[file].fileName));
2360 :
2361 1122 : returnCode = FileAccess(file);
2362 1122 : if (returnCode < 0)
2363 0 : return returnCode;
2364 :
2365 1122 : pgstat_report_wait_start(wait_event_info);
2366 1122 : returnCode = pg_fsync(VfdCache[file].fd);
2367 1122 : pgstat_report_wait_end();
2368 :
2369 1122 : return returnCode;
2370 : }
2371 :
2372 : /*
2373 : * Zero a region of the file.
2374 : *
2375 : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2376 : * appropriate error.
2377 : */
2378 : int
2379 406412 : FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
2380 : {
2381 : int returnCode;
2382 : ssize_t written;
2383 :
2384 : Assert(FileIsValid(file));
2385 :
2386 : DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2387 : file, VfdCache[file].fileName,
2388 : (int64) offset, (int64) amount));
2389 :
2390 406412 : returnCode = FileAccess(file);
2391 406412 : if (returnCode < 0)
2392 0 : return returnCode;
2393 :
2394 406412 : pgstat_report_wait_start(wait_event_info);
2395 406412 : written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2396 406412 : pgstat_report_wait_end();
2397 :
2398 406412 : if (written < 0)
2399 0 : return -1;
2400 406412 : else if (written != amount)
2401 : {
2402 : /* if errno is unset, assume problem is no disk space */
2403 0 : if (errno == 0)
2404 0 : errno = ENOSPC;
2405 0 : return -1;
2406 : }
2407 :
2408 406412 : return 0;
2409 : }
2410 :
2411 : /*
2412 : * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2413 : * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2414 : * use FileZero() instead.
2415 : *
2416 : * Note that at least glibc() implements posix_fallocate() in userspace if not
2417 : * implemented by the filesystem. That's not the case for all environments
2418 : * though.
2419 : *
2420 : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2421 : * appropriate error.
2422 : */
2423 : int
2424 1016 : FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
2425 : {
2426 : #ifdef HAVE_POSIX_FALLOCATE
2427 : int returnCode;
2428 :
2429 : Assert(FileIsValid(file));
2430 :
2431 : DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2432 : file, VfdCache[file].fileName,
2433 : (int64) offset, (int64) amount));
2434 :
2435 1016 : returnCode = FileAccess(file);
2436 1016 : if (returnCode < 0)
2437 0 : return -1;
2438 :
2439 1016 : retry:
2440 1016 : pgstat_report_wait_start(wait_event_info);
2441 1016 : returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2442 1016 : pgstat_report_wait_end();
2443 :
2444 1016 : if (returnCode == 0)
2445 1016 : return 0;
2446 0 : else if (returnCode == EINTR)
2447 0 : goto retry;
2448 :
2449 : /* for compatibility with %m printing etc */
2450 0 : errno = returnCode;
2451 :
2452 : /*
2453 : * Return in cases of a "real" failure, if fallocate is not supported,
2454 : * fall through to the FileZero() backed implementation.
2455 : */
2456 0 : if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
2457 0 : return -1;
2458 : #endif
2459 :
2460 0 : return FileZero(file, offset, amount, wait_event_info);
2461 : }
2462 :
2463 : off_t
2464 4276330 : FileSize(File file)
2465 : {
2466 : Assert(FileIsValid(file));
2467 :
2468 : DO_DB(elog(LOG, "FileSize %d (%s)",
2469 : file, VfdCache[file].fileName));
2470 :
2471 4276330 : if (FileIsNotOpen(file))
2472 : {
2473 42 : if (FileAccess(file) < 0)
2474 0 : return (off_t) -1;
2475 : }
2476 :
2477 4276330 : return lseek(VfdCache[file].fd, 0, SEEK_END);
2478 : }
2479 :
2480 : int
2481 1030 : FileTruncate(File file, off_t offset, uint32 wait_event_info)
2482 : {
2483 : int returnCode;
2484 :
2485 : Assert(FileIsValid(file));
2486 :
2487 : DO_DB(elog(LOG, "FileTruncate %d (%s)",
2488 : file, VfdCache[file].fileName));
2489 :
2490 1030 : returnCode = FileAccess(file);
2491 1030 : if (returnCode < 0)
2492 0 : return returnCode;
2493 :
2494 1030 : pgstat_report_wait_start(wait_event_info);
2495 1030 : returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2496 1030 : pgstat_report_wait_end();
2497 :
2498 1030 : if (returnCode == 0 && VfdCache[file].fileSize > offset)
2499 : {
2500 : /* adjust our state for truncation of a temp file */
2501 : Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2502 0 : temporary_files_size -= VfdCache[file].fileSize - offset;
2503 0 : VfdCache[file].fileSize = offset;
2504 : }
2505 :
2506 1030 : return returnCode;
2507 : }
2508 :
2509 : /*
2510 : * Return the pathname associated with an open file.
2511 : *
2512 : * The returned string points to an internal buffer, which is valid until
2513 : * the file is closed.
2514 : */
2515 : char *
2516 44 : FilePathName(File file)
2517 : {
2518 : Assert(FileIsValid(file));
2519 :
2520 44 : return VfdCache[file].fileName;
2521 : }
2522 :
2523 : /*
2524 : * Return the raw file descriptor of an opened file.
2525 : *
2526 : * The returned file descriptor will be valid until the file is closed, but
2527 : * there are a lot of things that can make that happen. So the caller should
2528 : * be careful not to do much of anything else before it finishes using the
2529 : * returned file descriptor.
2530 : */
2531 : int
2532 957874 : FileGetRawDesc(File file)
2533 : {
2534 : int returnCode;
2535 :
2536 957874 : returnCode = FileAccess(file);
2537 957874 : if (returnCode < 0)
2538 0 : return returnCode;
2539 :
2540 : Assert(FileIsValid(file));
2541 957874 : return VfdCache[file].fd;
2542 : }
2543 :
2544 : /*
2545 : * FileGetRawFlags - returns the file flags on open(2)
2546 : */
2547 : int
2548 0 : FileGetRawFlags(File file)
2549 : {
2550 : Assert(FileIsValid(file));
2551 0 : return VfdCache[file].fileFlags;
2552 : }
2553 :
2554 : /*
2555 : * FileGetRawMode - returns the mode bitmask passed to open(2)
2556 : */
2557 : mode_t
2558 0 : FileGetRawMode(File file)
2559 : {
2560 : Assert(FileIsValid(file));
2561 0 : return VfdCache[file].fileMode;
2562 : }
2563 :
2564 : /*
2565 : * Make room for another allocatedDescs[] array entry if needed and possible.
2566 : * Returns true if an array element is available.
2567 : */
2568 : static bool
2569 15634534 : reserveAllocatedDesc(void)
2570 : {
2571 : AllocateDesc *newDescs;
2572 : int newMax;
2573 :
2574 : /* Quick out if array already has a free slot. */
2575 15634534 : if (numAllocatedDescs < maxAllocatedDescs)
2576 15632484 : return true;
2577 :
2578 : /*
2579 : * If the array hasn't yet been created in the current process, initialize
2580 : * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2581 : * we will ever need, anyway. We don't want to look at max_safe_fds
2582 : * immediately because set_max_safe_fds() may not have run yet.
2583 : */
2584 2050 : if (allocatedDescs == NULL)
2585 : {
2586 2050 : newMax = FD_MINFREE / 3;
2587 2050 : newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2588 : /* Out of memory already? Treat as fatal error. */
2589 2050 : if (newDescs == NULL)
2590 0 : ereport(ERROR,
2591 : (errcode(ERRCODE_OUT_OF_MEMORY),
2592 : errmsg("out of memory")));
2593 2050 : allocatedDescs = newDescs;
2594 2050 : maxAllocatedDescs = newMax;
2595 2050 : return true;
2596 : }
2597 :
2598 : /*
2599 : * Consider enlarging the array beyond the initial allocation used above.
2600 : * By the time this happens, max_safe_fds should be known accurately.
2601 : *
2602 : * We mustn't let allocated descriptors hog all the available FDs, and in
2603 : * practice we'd better leave a reasonable number of FDs for VFD use. So
2604 : * set the maximum to max_safe_fds / 3. (This should certainly be at
2605 : * least as large as the initial size, FD_MINFREE / 3, so we aren't
2606 : * tightening the restriction here.) Recall that "external" FDs are
2607 : * allowed to consume another third of max_safe_fds.
2608 : */
2609 0 : newMax = max_safe_fds / 3;
2610 0 : if (newMax > maxAllocatedDescs)
2611 : {
2612 0 : newDescs = (AllocateDesc *) realloc(allocatedDescs,
2613 : newMax * sizeof(AllocateDesc));
2614 : /* Treat out-of-memory as a non-fatal error. */
2615 0 : if (newDescs == NULL)
2616 0 : return false;
2617 0 : allocatedDescs = newDescs;
2618 0 : maxAllocatedDescs = newMax;
2619 0 : return true;
2620 : }
2621 :
2622 : /* Can't enlarge allocatedDescs[] any more. */
2623 0 : return false;
2624 : }
2625 :
2626 : /*
2627 : * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2628 : * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2629 : * necessary to open the file. When done, call FreeFile rather than fclose.
2630 : *
2631 : * Note that files that will be open for any significant length of time
2632 : * should NOT be handled this way, since they cannot share kernel file
2633 : * descriptors with other files; there is grave risk of running out of FDs
2634 : * if anyone locks down too many FDs. Most callers of this routine are
2635 : * simply reading a config file that they will read and close immediately.
2636 : *
2637 : * fd.c will automatically close all files opened with AllocateFile at
2638 : * transaction commit or abort; this prevents FD leakage if a routine
2639 : * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2640 : *
2641 : * Ideally this should be the *only* direct call of fopen() in the backend.
2642 : */
2643 : FILE *
2644 162472 : AllocateFile(const char *name, const char *mode)
2645 : {
2646 : FILE *file;
2647 :
2648 : DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2649 : numAllocatedDescs, name));
2650 :
2651 : /* Can we allocate another non-virtual FD? */
2652 162472 : if (!reserveAllocatedDesc())
2653 0 : ereport(ERROR,
2654 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2655 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2656 : maxAllocatedDescs, name)));
2657 :
2658 : /* Close excess kernel FDs. */
2659 162472 : ReleaseLruFiles();
2660 :
2661 162472 : TryAgain:
2662 162472 : if ((file = fopen(name, mode)) != NULL)
2663 : {
2664 150298 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2665 :
2666 150298 : desc->kind = AllocateDescFile;
2667 150298 : desc->desc.file = file;
2668 150298 : desc->create_subid = GetCurrentSubTransactionId();
2669 150298 : numAllocatedDescs++;
2670 150298 : return desc->desc.file;
2671 : }
2672 :
2673 12174 : if (errno == EMFILE || errno == ENFILE)
2674 : {
2675 0 : int save_errno = errno;
2676 :
2677 0 : ereport(LOG,
2678 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2679 : errmsg("out of file descriptors: %m; release and retry")));
2680 0 : errno = 0;
2681 0 : if (ReleaseLruFile())
2682 0 : goto TryAgain;
2683 0 : errno = save_errno;
2684 : }
2685 :
2686 12174 : return NULL;
2687 : }
2688 :
2689 : /*
2690 : * Open a file with OpenTransientFilePerm() and pass default file mode for
2691 : * the fileMode parameter.
2692 : */
2693 : int
2694 15388918 : OpenTransientFile(const char *fileName, int fileFlags)
2695 : {
2696 15388918 : return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2697 : }
2698 :
2699 : /*
2700 : * Like AllocateFile, but returns an unbuffered fd like open(2)
2701 : */
2702 : int
2703 15388930 : OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2704 : {
2705 : int fd;
2706 :
2707 : DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2708 : numAllocatedDescs, fileName));
2709 :
2710 : /* Can we allocate another non-virtual FD? */
2711 15388930 : if (!reserveAllocatedDesc())
2712 0 : ereport(ERROR,
2713 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2714 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2715 : maxAllocatedDescs, fileName)));
2716 :
2717 : /* Close excess kernel FDs. */
2718 15388930 : ReleaseLruFiles();
2719 :
2720 15388930 : fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2721 :
2722 15388930 : if (fd >= 0)
2723 : {
2724 15381014 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2725 :
2726 15381014 : desc->kind = AllocateDescRawFD;
2727 15381014 : desc->desc.fd = fd;
2728 15381014 : desc->create_subid = GetCurrentSubTransactionId();
2729 15381014 : numAllocatedDescs++;
2730 :
2731 15381014 : return fd;
2732 : }
2733 :
2734 7916 : return -1; /* failure */
2735 : }
2736 :
2737 : /*
2738 : * Routines that want to initiate a pipe stream should use OpenPipeStream
2739 : * rather than plain popen(). This lets fd.c deal with freeing FDs if
2740 : * necessary. When done, call ClosePipeStream rather than pclose.
2741 : *
2742 : * This function also ensures that the popen'd program is run with default
2743 : * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2744 : * uses. This ensures desirable response to, eg, closing a read pipe early.
2745 : */
2746 : FILE *
2747 106 : OpenPipeStream(const char *command, const char *mode)
2748 : {
2749 : FILE *file;
2750 : int save_errno;
2751 :
2752 : DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2753 : numAllocatedDescs, command));
2754 :
2755 : /* Can we allocate another non-virtual FD? */
2756 106 : if (!reserveAllocatedDesc())
2757 0 : ereport(ERROR,
2758 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2759 : errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2760 : maxAllocatedDescs, command)));
2761 :
2762 : /* Close excess kernel FDs. */
2763 106 : ReleaseLruFiles();
2764 :
2765 106 : TryAgain:
2766 106 : fflush(NULL);
2767 106 : pqsignal(SIGPIPE, SIG_DFL);
2768 106 : errno = 0;
2769 106 : file = popen(command, mode);
2770 106 : save_errno = errno;
2771 106 : pqsignal(SIGPIPE, SIG_IGN);
2772 106 : errno = save_errno;
2773 106 : if (file != NULL)
2774 : {
2775 106 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2776 :
2777 106 : desc->kind = AllocateDescPipe;
2778 106 : desc->desc.file = file;
2779 106 : desc->create_subid = GetCurrentSubTransactionId();
2780 106 : numAllocatedDescs++;
2781 106 : return desc->desc.file;
2782 : }
2783 :
2784 0 : if (errno == EMFILE || errno == ENFILE)
2785 : {
2786 0 : ereport(LOG,
2787 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2788 : errmsg("out of file descriptors: %m; release and retry")));
2789 0 : if (ReleaseLruFile())
2790 0 : goto TryAgain;
2791 0 : errno = save_errno;
2792 : }
2793 :
2794 0 : return NULL;
2795 : }
2796 :
2797 : /*
2798 : * Free an AllocateDesc of any type.
2799 : *
2800 : * The argument *must* point into the allocatedDescs[] array.
2801 : */
2802 : static int
2803 15612766 : FreeDesc(AllocateDesc *desc)
2804 : {
2805 : int result;
2806 :
2807 : /* Close the underlying object */
2808 15612766 : switch (desc->kind)
2809 : {
2810 150298 : case AllocateDescFile:
2811 150298 : result = fclose(desc->desc.file);
2812 150298 : break;
2813 106 : case AllocateDescPipe:
2814 106 : result = pclose(desc->desc.file);
2815 106 : break;
2816 81348 : case AllocateDescDir:
2817 81348 : result = closedir(desc->desc.dir);
2818 81348 : break;
2819 15381014 : case AllocateDescRawFD:
2820 15381014 : pgaio_closing_fd(desc->desc.fd);
2821 15381014 : result = close(desc->desc.fd);
2822 15381014 : break;
2823 0 : default:
2824 0 : elog(ERROR, "AllocateDesc kind not recognized");
2825 : result = 0; /* keep compiler quiet */
2826 : break;
2827 : }
2828 :
2829 : /* Compact storage in the allocatedDescs array */
2830 15612766 : numAllocatedDescs--;
2831 15612766 : *desc = allocatedDescs[numAllocatedDescs];
2832 :
2833 15612766 : return result;
2834 : }
2835 :
2836 : /*
2837 : * Close a file returned by AllocateFile.
2838 : *
2839 : * Note we do not check fclose's return value --- it is up to the caller
2840 : * to handle close errors.
2841 : */
2842 : int
2843 150266 : FreeFile(FILE *file)
2844 : {
2845 : int i;
2846 :
2847 : DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2848 :
2849 : /* Remove file from list of allocated files, if it's present */
2850 150268 : for (i = numAllocatedDescs; --i >= 0;)
2851 : {
2852 150268 : AllocateDesc *desc = &allocatedDescs[i];
2853 :
2854 150268 : if (desc->kind == AllocateDescFile && desc->desc.file == file)
2855 150266 : return FreeDesc(desc);
2856 : }
2857 :
2858 : /* Only get here if someone passes us a file not in allocatedDescs */
2859 0 : elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2860 :
2861 0 : return fclose(file);
2862 : }
2863 :
2864 : /*
2865 : * Close a file returned by OpenTransientFile.
2866 : *
2867 : * Note we do not check close's return value --- it is up to the caller
2868 : * to handle close errors.
2869 : */
2870 : int
2871 15381012 : CloseTransientFile(int fd)
2872 : {
2873 : int i;
2874 :
2875 : DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2876 :
2877 : /* Remove fd from list of allocated files, if it's present */
2878 15381030 : for (i = numAllocatedDescs; --i >= 0;)
2879 : {
2880 15381030 : AllocateDesc *desc = &allocatedDescs[i];
2881 :
2882 15381030 : if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2883 15381012 : return FreeDesc(desc);
2884 : }
2885 :
2886 : /* Only get here if someone passes us a file not in allocatedDescs */
2887 0 : elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2888 :
2889 0 : pgaio_closing_fd(fd);
2890 :
2891 0 : return close(fd);
2892 : }
2893 :
2894 : /*
2895 : * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2896 : * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2897 : * necessary to open the directory, and with closing it after an elog.
2898 : * When done, call FreeDir rather than closedir.
2899 : *
2900 : * Returns NULL, with errno set, on failure. Note that failure detection
2901 : * is commonly left to the following call of ReadDir or ReadDirExtended;
2902 : * see the comments for ReadDir.
2903 : *
2904 : * Ideally this should be the *only* direct call of opendir() in the backend.
2905 : */
2906 : DIR *
2907 83026 : AllocateDir(const char *dirname)
2908 : {
2909 : DIR *dir;
2910 :
2911 : DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2912 : numAllocatedDescs, dirname));
2913 :
2914 : /* Can we allocate another non-virtual FD? */
2915 83026 : if (!reserveAllocatedDesc())
2916 0 : ereport(ERROR,
2917 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2918 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2919 : maxAllocatedDescs, dirname)));
2920 :
2921 : /* Close excess kernel FDs. */
2922 83026 : ReleaseLruFiles();
2923 :
2924 83026 : TryAgain:
2925 83026 : if ((dir = opendir(dirname)) != NULL)
2926 : {
2927 81348 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2928 :
2929 81348 : desc->kind = AllocateDescDir;
2930 81348 : desc->desc.dir = dir;
2931 81348 : desc->create_subid = GetCurrentSubTransactionId();
2932 81348 : numAllocatedDescs++;
2933 81348 : return desc->desc.dir;
2934 : }
2935 :
2936 1678 : if (errno == EMFILE || errno == ENFILE)
2937 : {
2938 0 : int save_errno = errno;
2939 :
2940 0 : ereport(LOG,
2941 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2942 : errmsg("out of file descriptors: %m; release and retry")));
2943 0 : errno = 0;
2944 0 : if (ReleaseLruFile())
2945 0 : goto TryAgain;
2946 0 : errno = save_errno;
2947 : }
2948 :
2949 1678 : return NULL;
2950 : }
2951 :
2952 : /*
2953 : * Read a directory opened with AllocateDir, ereport'ing any error.
2954 : *
2955 : * This is easier to use than raw readdir() since it takes care of some
2956 : * otherwise rather tedious and error-prone manipulation of errno. Also,
2957 : * if you are happy with a generic error message for AllocateDir failure,
2958 : * you can just do
2959 : *
2960 : * dir = AllocateDir(path);
2961 : * while ((dirent = ReadDir(dir, path)) != NULL)
2962 : * process dirent;
2963 : * FreeDir(dir);
2964 : *
2965 : * since a NULL dir parameter is taken as indicating AllocateDir failed.
2966 : * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2967 : * use this shortcut.)
2968 : *
2969 : * The pathname passed to AllocateDir must be passed to this routine too,
2970 : * but it is only used for error reporting.
2971 : */
2972 : struct dirent *
2973 3355362 : ReadDir(DIR *dir, const char *dirname)
2974 : {
2975 3355362 : return ReadDirExtended(dir, dirname, ERROR);
2976 : }
2977 :
2978 : /*
2979 : * Alternate version of ReadDir that allows caller to specify the elevel
2980 : * for any error report (whether it's reporting an initial failure of
2981 : * AllocateDir or a subsequent directory read failure).
2982 : *
2983 : * If elevel < ERROR, returns NULL after any error. With the normal coding
2984 : * pattern, this will result in falling out of the loop immediately as
2985 : * though the directory contained no (more) entries.
2986 : */
2987 : struct dirent *
2988 6435138 : ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2989 : {
2990 : struct dirent *dent;
2991 :
2992 : /* Give a generic message for AllocateDir failure, if caller didn't */
2993 6435138 : if (dir == NULL)
2994 : {
2995 6 : ereport(elevel,
2996 : (errcode_for_file_access(),
2997 : errmsg("could not open directory \"%s\": %m",
2998 : dirname)));
2999 0 : return NULL;
3000 : }
3001 :
3002 6435132 : errno = 0;
3003 6435132 : if ((dent = readdir(dir)) != NULL)
3004 6375766 : return dent;
3005 :
3006 59366 : if (errno)
3007 0 : ereport(elevel,
3008 : (errcode_for_file_access(),
3009 : errmsg("could not read directory \"%s\": %m",
3010 : dirname)));
3011 59366 : return NULL;
3012 : }
3013 :
3014 : /*
3015 : * Close a directory opened with AllocateDir.
3016 : *
3017 : * Returns closedir's return value (with errno set if it's not 0).
3018 : * Note we do not check the return value --- it is up to the caller
3019 : * to handle close errors if wanted.
3020 : *
3021 : * Does nothing if dir == NULL; we assume that directory open failure was
3022 : * already reported if desired.
3023 : */
3024 : int
3025 81104 : FreeDir(DIR *dir)
3026 : {
3027 : int i;
3028 :
3029 : /* Nothing to do if AllocateDir failed */
3030 81104 : if (dir == NULL)
3031 0 : return 0;
3032 :
3033 : DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
3034 :
3035 : /* Remove dir from list of allocated dirs, if it's present */
3036 81104 : for (i = numAllocatedDescs; --i >= 0;)
3037 : {
3038 81104 : AllocateDesc *desc = &allocatedDescs[i];
3039 :
3040 81104 : if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3041 81104 : return FreeDesc(desc);
3042 : }
3043 :
3044 : /* Only get here if someone passes us a dir not in allocatedDescs */
3045 0 : elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3046 :
3047 0 : return closedir(dir);
3048 : }
3049 :
3050 :
3051 : /*
3052 : * Close a pipe stream returned by OpenPipeStream.
3053 : */
3054 : int
3055 106 : ClosePipeStream(FILE *file)
3056 : {
3057 : int i;
3058 :
3059 : DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3060 :
3061 : /* Remove file from list of allocated files, if it's present */
3062 106 : for (i = numAllocatedDescs; --i >= 0;)
3063 : {
3064 106 : AllocateDesc *desc = &allocatedDescs[i];
3065 :
3066 106 : if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3067 106 : return FreeDesc(desc);
3068 : }
3069 :
3070 : /* Only get here if someone passes us a file not in allocatedDescs */
3071 0 : elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3072 :
3073 0 : return pclose(file);
3074 : }
3075 :
3076 : /*
3077 : * closeAllVfds
3078 : *
3079 : * Force all VFDs into the physically-closed state, so that the fewest
3080 : * possible number of kernel file descriptors are in use. There is no
3081 : * change in the logical state of the VFDs.
3082 : */
3083 : void
3084 60 : closeAllVfds(void)
3085 : {
3086 : Index i;
3087 :
3088 60 : if (SizeVfdCache > 0)
3089 : {
3090 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3091 1920 : for (i = 1; i < SizeVfdCache; i++)
3092 : {
3093 1860 : if (!FileIsNotOpen(i))
3094 282 : LruDelete(i);
3095 : }
3096 : }
3097 60 : }
3098 :
3099 :
3100 : /*
3101 : * SetTempTablespaces
3102 : *
3103 : * Define a list (actually an array) of OIDs of tablespaces to use for
3104 : * temporary files. This list will be used until end of transaction,
3105 : * unless this function is called again before then. It is caller's
3106 : * responsibility that the passed-in array has adequate lifespan (typically
3107 : * it'd be allocated in TopTransactionContext).
3108 : *
3109 : * Some entries of the array may be InvalidOid, indicating that the current
3110 : * database's default tablespace should be used.
3111 : */
3112 : void
3113 5906 : SetTempTablespaces(Oid *tableSpaces, int numSpaces)
3114 : {
3115 : Assert(numSpaces >= 0);
3116 5906 : tempTableSpaces = tableSpaces;
3117 5906 : numTempTableSpaces = numSpaces;
3118 :
3119 : /*
3120 : * Select a random starting point in the list. This is to minimize
3121 : * conflicts between backends that are most likely sharing the same list
3122 : * of temp tablespaces. Note that if we create multiple temp files in the
3123 : * same transaction, we'll advance circularly through the list --- this
3124 : * ensures that large temporary sort files are nicely spread across all
3125 : * available tablespaces.
3126 : */
3127 5906 : if (numSpaces > 1)
3128 0 : nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state,
3129 0 : 0, numSpaces - 1);
3130 : else
3131 5906 : nextTempTableSpace = 0;
3132 5906 : }
3133 :
3134 : /*
3135 : * TempTablespacesAreSet
3136 : *
3137 : * Returns true if SetTempTablespaces has been called in current transaction.
3138 : * (This is just so that tablespaces.c doesn't need its own per-transaction
3139 : * state.)
3140 : */
3141 : bool
3142 7546 : TempTablespacesAreSet(void)
3143 : {
3144 7546 : return (numTempTableSpaces >= 0);
3145 : }
3146 :
3147 : /*
3148 : * GetTempTablespaces
3149 : *
3150 : * Populate an array with the OIDs of the tablespaces that should be used for
3151 : * temporary files. (Some entries may be InvalidOid, indicating that the
3152 : * current database's default tablespace should be used.) At most numSpaces
3153 : * entries will be filled.
3154 : * Returns the number of OIDs that were copied into the output array.
3155 : */
3156 : int
3157 370 : GetTempTablespaces(Oid *tableSpaces, int numSpaces)
3158 : {
3159 : int i;
3160 :
3161 : Assert(TempTablespacesAreSet());
3162 370 : for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3163 0 : tableSpaces[i] = tempTableSpaces[i];
3164 :
3165 370 : return i;
3166 : }
3167 :
3168 : /*
3169 : * GetNextTempTableSpace
3170 : *
3171 : * Select the next temp tablespace to use. A result of InvalidOid means
3172 : * to use the current database's default tablespace.
3173 : */
3174 : Oid
3175 4144 : GetNextTempTableSpace(void)
3176 : {
3177 4144 : if (numTempTableSpaces > 0)
3178 : {
3179 : /* Advance nextTempTableSpace counter with wraparound */
3180 2 : if (++nextTempTableSpace >= numTempTableSpaces)
3181 2 : nextTempTableSpace = 0;
3182 2 : return tempTableSpaces[nextTempTableSpace];
3183 : }
3184 4142 : return InvalidOid;
3185 : }
3186 :
3187 :
3188 : /*
3189 : * AtEOSubXact_Files
3190 : *
3191 : * Take care of subtransaction commit/abort. At abort, we close temp files
3192 : * that the subtransaction may have opened. At commit, we reassign the
3193 : * files that were opened to the parent subtransaction.
3194 : */
3195 : void
3196 19942 : AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
3197 : SubTransactionId parentSubid)
3198 : {
3199 : Index i;
3200 :
3201 19942 : for (i = 0; i < numAllocatedDescs; i++)
3202 : {
3203 0 : if (allocatedDescs[i].create_subid == mySubid)
3204 : {
3205 0 : if (isCommit)
3206 0 : allocatedDescs[i].create_subid = parentSubid;
3207 : else
3208 : {
3209 : /* have to recheck the item after FreeDesc (ugly) */
3210 0 : FreeDesc(&allocatedDescs[i--]);
3211 : }
3212 : }
3213 : }
3214 19942 : }
3215 :
3216 : /*
3217 : * AtEOXact_Files
3218 : *
3219 : * This routine is called during transaction commit or abort. All still-open
3220 : * per-transaction temporary file VFDs are closed, which also causes the
3221 : * underlying files to be deleted (although they should've been closed already
3222 : * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3223 : * closed. We also forget any transaction-local temp tablespace list.
3224 : *
3225 : * The isCommit flag is used only to decide whether to emit warnings about
3226 : * unclosed files.
3227 : */
3228 : void
3229 819206 : AtEOXact_Files(bool isCommit)
3230 : {
3231 819206 : CleanupTempFiles(isCommit, false);
3232 819206 : tempTableSpaces = NULL;
3233 819206 : numTempTableSpaces = -1;
3234 819206 : }
3235 :
3236 : /*
3237 : * BeforeShmemExit_Files
3238 : *
3239 : * before_shmem_exit hook to clean up temp files during backend shutdown.
3240 : * Here, we want to clean up *all* temp files including interXact ones.
3241 : */
3242 : static void
3243 42280 : BeforeShmemExit_Files(int code, Datum arg)
3244 : {
3245 42280 : CleanupTempFiles(false, true);
3246 :
3247 : /* prevent further temp files from being created */
3248 : #ifdef USE_ASSERT_CHECKING
3249 : temporary_files_allowed = false;
3250 : #endif
3251 42280 : }
3252 :
3253 : /*
3254 : * Close temporary files and delete their underlying files.
3255 : *
3256 : * isCommit: if true, this is normal transaction commit, and we don't
3257 : * expect any remaining files; warn if there are some.
3258 : *
3259 : * isProcExit: if true, this is being called as the backend process is
3260 : * exiting. If that's the case, we should remove all temporary files; if
3261 : * that's not the case, we are being called for transaction commit/abort
3262 : * and should only remove transaction-local temp files. In either case,
3263 : * also clean up "allocated" stdio files, dirs and fds.
3264 : */
3265 : static void
3266 861486 : CleanupTempFiles(bool isCommit, bool isProcExit)
3267 : {
3268 : Index i;
3269 :
3270 : /*
3271 : * Careful here: at proc_exit we need extra cleanup, not just
3272 : * xact_temporary files.
3273 : */
3274 861486 : if (isProcExit || have_xact_temporary_files)
3275 : {
3276 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3277 2410532 : for (i = 1; i < SizeVfdCache; i++)
3278 : {
3279 2366786 : unsigned short fdstate = VfdCache[i].fdstate;
3280 :
3281 2366786 : if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3282 8 : VfdCache[i].fileName != NULL)
3283 : {
3284 : /*
3285 : * If we're in the process of exiting a backend process, close
3286 : * all temporary files. Otherwise, only close temporary files
3287 : * local to the current transaction. They should be closed by
3288 : * the ResourceOwner mechanism already, so this is just a
3289 : * debugging cross-check.
3290 : */
3291 8 : if (isProcExit)
3292 8 : FileClose(i);
3293 0 : else if (fdstate & FD_CLOSE_AT_EOXACT)
3294 : {
3295 0 : elog(WARNING,
3296 : "temporary file %s not closed at end-of-transaction",
3297 : VfdCache[i].fileName);
3298 0 : FileClose(i);
3299 : }
3300 : }
3301 : }
3302 :
3303 43746 : have_xact_temporary_files = false;
3304 : }
3305 :
3306 : /* Complain if any allocated files remain open at commit. */
3307 861486 : if (isCommit && numAllocatedDescs > 0)
3308 0 : elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3309 : numAllocatedDescs);
3310 :
3311 : /* Clean up "allocated" stdio files, dirs and fds. */
3312 861764 : while (numAllocatedDescs > 0)
3313 278 : FreeDesc(&allocatedDescs[0]);
3314 861486 : }
3315 :
3316 :
3317 : /*
3318 : * Remove temporary and temporary relation files left over from a prior
3319 : * postmaster session
3320 : *
3321 : * This should be called during postmaster startup. It will forcibly
3322 : * remove any leftover files created by OpenTemporaryFile and any leftover
3323 : * temporary relation files created by mdcreate.
3324 : *
3325 : * During post-backend-crash restart cycle, this routine is called when
3326 : * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3327 : * queries are using temp files could result in useless storage usage that can
3328 : * only be reclaimed by a service restart. The argument against enabling it is
3329 : * that someone might want to examine the temporary files for debugging
3330 : * purposes. This does however mean that OpenTemporaryFile had better allow for
3331 : * collision with an existing temp file name.
3332 : *
3333 : * NOTE: this function and its subroutines generally report syscall failures
3334 : * with ereport(LOG) and keep going. Removing temp files is not so critical
3335 : * that we should fail to start the database when we can't do it.
3336 : */
3337 : void
3338 1652 : RemovePgTempFiles(void)
3339 : {
3340 : char temp_path[MAXPGPATH + sizeof(PG_TBLSPC_DIR) + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3341 : DIR *spc_dir;
3342 : struct dirent *spc_de;
3343 :
3344 : /*
3345 : * First process temp files in pg_default ($PGDATA/base)
3346 : */
3347 1652 : snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3348 1652 : RemovePgTempFilesInDir(temp_path, true, false);
3349 1652 : RemovePgTempRelationFiles("base");
3350 :
3351 : /*
3352 : * Cycle through temp directories for all non-default tablespaces.
3353 : */
3354 1652 : spc_dir = AllocateDir(PG_TBLSPC_DIR);
3355 :
3356 5080 : while ((spc_de = ReadDirExtended(spc_dir, PG_TBLSPC_DIR, LOG)) != NULL)
3357 : {
3358 3428 : if (strcmp(spc_de->d_name, ".") == 0 ||
3359 1776 : strcmp(spc_de->d_name, "..") == 0)
3360 3304 : continue;
3361 :
3362 124 : snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3363 124 : PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY,
3364 : PG_TEMP_FILES_DIR);
3365 124 : RemovePgTempFilesInDir(temp_path, true, false);
3366 :
3367 124 : snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3368 124 : PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
3369 124 : RemovePgTempRelationFiles(temp_path);
3370 : }
3371 :
3372 1652 : FreeDir(spc_dir);
3373 :
3374 : /*
3375 : * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3376 : * DataDir as well. However, that is *not* cleaned here because doing so
3377 : * would create a race condition. It's done separately, earlier in
3378 : * postmaster startup.
3379 : */
3380 1652 : }
3381 :
3382 : /*
3383 : * Process one pgsql_tmp directory for RemovePgTempFiles.
3384 : *
3385 : * If missing_ok is true, it's all right for the named directory to not exist.
3386 : * Any other problem results in a LOG message. (missing_ok should be true at
3387 : * the top level, since pgsql_tmp directories are not created until needed.)
3388 : *
3389 : * At the top level, this should be called with unlink_all = false, so that
3390 : * only files matching the temporary name prefix will be unlinked. When
3391 : * recursing it will be called with unlink_all = true to unlink everything
3392 : * under a top-level temporary directory.
3393 : *
3394 : * (These two flags could be replaced by one, but it seems clearer to keep
3395 : * them separate.)
3396 : */
3397 : void
3398 1778 : RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3399 : {
3400 : DIR *temp_dir;
3401 : struct dirent *temp_de;
3402 : char rm_path[MAXPGPATH * 2];
3403 :
3404 1778 : temp_dir = AllocateDir(tmpdirname);
3405 :
3406 1778 : if (temp_dir == NULL && errno == ENOENT && missing_ok)
3407 1646 : return;
3408 :
3409 402 : while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3410 : {
3411 270 : if (strcmp(temp_de->d_name, ".") == 0 ||
3412 138 : strcmp(temp_de->d_name, "..") == 0)
3413 264 : continue;
3414 :
3415 6 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
3416 6 : tmpdirname, temp_de->d_name);
3417 :
3418 6 : if (unlink_all ||
3419 6 : strncmp(temp_de->d_name,
3420 : PG_TEMP_FILE_PREFIX,
3421 : strlen(PG_TEMP_FILE_PREFIX)) == 0)
3422 6 : {
3423 6 : PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
3424 :
3425 6 : if (type == PGFILETYPE_ERROR)
3426 0 : continue;
3427 6 : else if (type == PGFILETYPE_DIR)
3428 : {
3429 : /* recursively remove contents, then directory itself */
3430 2 : RemovePgTempFilesInDir(rm_path, false, true);
3431 :
3432 2 : if (rmdir(rm_path) < 0)
3433 0 : ereport(LOG,
3434 : (errcode_for_file_access(),
3435 : errmsg("could not remove directory \"%s\": %m",
3436 : rm_path)));
3437 : }
3438 : else
3439 : {
3440 4 : if (unlink(rm_path) < 0)
3441 0 : ereport(LOG,
3442 : (errcode_for_file_access(),
3443 : errmsg("could not remove file \"%s\": %m",
3444 : rm_path)));
3445 : }
3446 : }
3447 : else
3448 0 : ereport(LOG,
3449 : (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3450 : rm_path)));
3451 : }
3452 :
3453 132 : FreeDir(temp_dir);
3454 : }
3455 :
3456 : /* Process one tablespace directory, look for per-DB subdirectories */
3457 : static void
3458 1776 : RemovePgTempRelationFiles(const char *tsdirname)
3459 : {
3460 : DIR *ts_dir;
3461 : struct dirent *de;
3462 : char dbspace_path[MAXPGPATH * 2];
3463 :
3464 1776 : ts_dir = AllocateDir(tsdirname);
3465 :
3466 11088 : while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3467 : {
3468 : /*
3469 : * We're only interested in the per-database directories, which have
3470 : * numeric names. Note that this code will also (properly) ignore "."
3471 : * and "..".
3472 : */
3473 9312 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3474 3682 : continue;
3475 :
3476 5630 : snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3477 5630 : tsdirname, de->d_name);
3478 5630 : RemovePgTempRelationFilesInDbspace(dbspace_path);
3479 : }
3480 :
3481 1776 : FreeDir(ts_dir);
3482 1776 : }
3483 :
3484 : /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3485 : static void
3486 5630 : RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3487 : {
3488 : DIR *dbspace_dir;
3489 : struct dirent *de;
3490 : char rm_path[MAXPGPATH * 2];
3491 :
3492 5630 : dbspace_dir = AllocateDir(dbspacedirname);
3493 :
3494 1712326 : while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3495 : {
3496 1706696 : if (!looks_like_temp_rel_name(de->d_name))
3497 1706688 : continue;
3498 :
3499 8 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
3500 8 : dbspacedirname, de->d_name);
3501 :
3502 8 : if (unlink(rm_path) < 0)
3503 0 : ereport(LOG,
3504 : (errcode_for_file_access(),
3505 : errmsg("could not remove file \"%s\": %m",
3506 : rm_path)));
3507 : }
3508 :
3509 5630 : FreeDir(dbspace_dir);
3510 5630 : }
3511 :
3512 : /* t<digits>_<digits>, or t<digits>_<digits>_<forkname> */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cursor = name;
	const char *run_start;

	/* Leading "t" is mandatory. */
	if (*cursor != 't')
		return false;

	/* First number: at least one digit, terminated by an underscore. */
	run_start = ++cursor;
	while (isdigit((unsigned char) *cursor))
		cursor++;
	if (cursor == run_start || *cursor != '_')
		return false;

	/* Second number: at least one digit. */
	run_start = ++cursor;
	while (isdigit((unsigned char) *cursor))
		cursor++;
	if (cursor == run_start)
		return false;

	/* Optional "_forkname" suffix. */
	if (*cursor == '_')
	{
		int			forklen = forkname_chars(cursor + 1, NULL);

		if (forklen <= 0)
			return false;
		cursor += forklen + 1;
	}

	/* Optional ".segment" suffix: a dot followed by at least one digit. */
	if (*cursor == '.')
	{
		run_start = ++cursor;
		while (isdigit((unsigned char) *cursor))
			cursor++;
		if (cursor == run_start)
			return false;
	}

	/* Anything left over means the name does not match. */
	return (*cursor == '\0');
}
3560 :
#ifdef HAVE_SYNCFS
static void
do_syncfs(const char *path)
{
	int			fd;

	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
							 path);

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd >= 0)
	{
		/* Failures are logged only; the descriptor is closed either way. */
		if (syncfs(fd) < 0)
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not synchronize file system for file \"%s\": %m", path)));
		CloseTransientFile(fd);
		return;
	}

	/* Could not even open the path: log it and give up on this one. */
	ereport(LOG,
			(errcode_for_file_access(),
			 errmsg("could not open file \"%s\": %m", path)));
}
#endif
3585 :
3586 : /*
3587 : * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3588 : * all potential filesystems, depending on the recovery_init_sync_method setting.
3589 : *
3590 : * We fsync regular files and directories wherever they are, but we
3591 : * follow symlinks only for pg_wal and immediately under pg_tblspc.
3592 : * Other symlinks are presumed to point at files we're not responsible
3593 : * for fsyncing, and might not have privileges to write at all.
3594 : *
3595 : * Errors are logged but not considered fatal; that's because this is used
3596 : * only during database startup, to deal with the possibility that there are
3597 : * issued-but-unsynced writes pending against the data directory. We want to
3598 : * ensure that such writes reach disk before anything that's done in the new
3599 : * run. However, aborting on error would result in failure to start for
3600 : * harmless cases such as read-only files in the data directory, and that's
3601 : * not good either.
3602 : *
3603 : * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3604 : * rewriting all changes again during recovery.
3605 : *
3606 : * Note we assume we're chdir'd into PGDATA to begin with.
3607 : */
3608 : void
3609 342 : SyncDataDirectory(void)
3610 : {
3611 : bool xlog_is_symlink;
3612 :
3613 : /* We can skip this whole thing if fsync is disabled. */
3614 342 : if (!enableFsync)
3615 342 : return;
3616 :
3617 : /*
3618 : * If pg_wal is a symlink, we'll need to recurse into it separately,
3619 : * because the first walkdir below will ignore it.
3620 : */
3621 0 : xlog_is_symlink = false;
3622 :
3623 : {
3624 : struct stat st;
3625 :
3626 0 : if (lstat("pg_wal", &st) < 0)
3627 0 : ereport(LOG,
3628 : (errcode_for_file_access(),
3629 : errmsg("could not stat file \"%s\": %m",
3630 : "pg_wal")));
3631 0 : else if (S_ISLNK(st.st_mode))
3632 0 : xlog_is_symlink = true;
3633 : }
3634 :
3635 : #ifdef HAVE_SYNCFS
3636 0 : if (recovery_init_sync_method == DATA_DIR_SYNC_METHOD_SYNCFS)
3637 : {
3638 : DIR *dir;
3639 : struct dirent *de;
3640 :
3641 : /*
3642 : * On Linux, we don't have to open every single file one by one. We
3643 : * can use syncfs() to sync whole filesystems. We only expect
3644 : * filesystem boundaries to exist where we tolerate symlinks, namely
3645 : * pg_wal and the tablespaces, so we call syncfs() for each of those
3646 : * directories.
3647 : */
3648 :
3649 : /* Prepare to report progress syncing the data directory via syncfs. */
3650 0 : begin_startup_progress_phase();
3651 :
3652 : /* Sync the top level pgdata directory. */
3653 0 : do_syncfs(".");
3654 : /* If any tablespaces are configured, sync each of those. */
3655 0 : dir = AllocateDir(PG_TBLSPC_DIR);
3656 0 : while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3657 : {
3658 : char path[MAXPGPATH];
3659 :
3660 0 : if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3661 0 : continue;
3662 :
3663 0 : snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3664 0 : do_syncfs(path);
3665 : }
3666 0 : FreeDir(dir);
3667 : /* If pg_wal is a symlink, process that too. */
3668 0 : if (xlog_is_symlink)
3669 0 : do_syncfs("pg_wal");
3670 0 : return;
3671 : }
3672 : #endif /* !HAVE_SYNCFS */
3673 :
3674 : #ifdef PG_FLUSH_DATA_WORKS
3675 : /* Prepare to report progress of the pre-fsync phase. */
3676 0 : begin_startup_progress_phase();
3677 :
3678 : /*
3679 : * If possible, hint to the kernel that we're soon going to fsync the data
3680 : * directory and its contents. Errors in this step are even less
3681 : * interesting than normal, so log them only at DEBUG1.
3682 : */
3683 0 : walkdir(".", pre_sync_fname, false, DEBUG1);
3684 0 : if (xlog_is_symlink)
3685 0 : walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3686 0 : walkdir(PG_TBLSPC_DIR, pre_sync_fname, true, DEBUG1);
3687 : #endif
3688 :
3689 : /* Prepare to report progress syncing the data directory via fsync. */
3690 0 : begin_startup_progress_phase();
3691 :
3692 : /*
3693 : * Now we do the fsync()s in the same order.
3694 : *
3695 : * The main call ignores symlinks, so in addition to specially processing
3696 : * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3697 : * process_symlinks = true. Note that if there are any plain directories
3698 : * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3699 : * so we don't worry about optimizing it.
3700 : */
3701 0 : walkdir(".", datadir_fsync_fname, false, LOG);
3702 0 : if (xlog_is_symlink)
3703 0 : walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3704 0 : walkdir(PG_TBLSPC_DIR, datadir_fsync_fname, true, LOG);
3705 : }
3706 :
3707 : /*
3708 : * walkdir: recursively walk a directory, applying the action to each
3709 : * regular file and directory (including the named directory itself).
3710 : *
3711 : * If process_symlinks is true, the action and recursion are also applied
3712 : * to regular files and directories that are pointed to by symlinks in the
3713 : * given directory; otherwise symlinks are ignored. Symlinks are always
3714 : * ignored in subdirectories, ie we intentionally don't pass down the
3715 : * process_symlinks flag to recursive calls.
3716 : *
3717 : * Errors are reported at level elevel, which might be ERROR or less.
3718 : *
3719 : * See also walkdir in file_utils.c, which is a frontend version of this
3720 : * logic.
3721 : */
3722 : static void
3723 338 : walkdir(const char *path,
3724 : void (*action) (const char *fname, bool isdir, int elevel),
3725 : bool process_symlinks,
3726 : int elevel)
3727 : {
3728 : DIR *dir;
3729 : struct dirent *de;
3730 :
3731 338 : dir = AllocateDir(path);
3732 :
3733 2676 : while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3734 : {
3735 : char subpath[MAXPGPATH * 2];
3736 :
3737 2338 : CHECK_FOR_INTERRUPTS();
3738 :
3739 2338 : if (strcmp(de->d_name, ".") == 0 ||
3740 2000 : strcmp(de->d_name, "..") == 0)
3741 676 : continue;
3742 :
3743 1662 : snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3744 :
3745 1662 : switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3746 : {
3747 1662 : case PGFILETYPE_REG:
3748 1662 : (*action) (subpath, false, elevel);
3749 1662 : break;
3750 0 : case PGFILETYPE_DIR:
3751 0 : walkdir(subpath, action, false, elevel);
3752 0 : break;
3753 0 : default:
3754 :
3755 : /*
3756 : * Errors are already reported directly by get_dirent_type(),
3757 : * and any remaining symlinks and unknown file types are
3758 : * ignored.
3759 : */
3760 0 : break;
3761 : }
3762 : }
3763 :
3764 338 : FreeDir(dir); /* we ignore any error here */
3765 :
3766 : /*
3767 : * It's important to fsync the destination directory itself as individual
3768 : * file fsyncs don't guarantee that the directory entry for the file is
3769 : * synced. However, skip this if AllocateDir failed; the action function
3770 : * might not be robust against that.
3771 : */
3772 338 : if (dir)
3773 338 : (*action) (path, true, elevel);
3774 338 : }
3775 :
3776 :
3777 : /*
3778 : * Hint to the OS that it should get ready to fsync() this file.
3779 : *
3780 : * Ignores errors trying to open unreadable files, and logs other errors at a
3781 : * caller-specified level.
3782 : */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Flushing a directory would likely just fail, so don't try. */
	if (isdir)
		return;

	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are skipped silently; other failures are reported. */
		if (errno != EACCES)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * This is only a hint, so it does not matter that pg_flush_data()
	 * ignores errors.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3822 :
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/*
	 * Errors about unreadable files should be ignored silently, so ask
	 * fsync_fname_ext() for that by passing ignore_perm = true.  Its result
	 * is deliberately not checked.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3835 :
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (!isdir)
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
		return;
	}

	/* A directory that is already gone (ENOENT) is not reported. */
	if (rmdir(fname) != 0 && errno != ENOENT)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove directory \"%s\": %m", fname)));
}
3852 :
3853 : /*
3854 : * fsync_fname_ext -- Try to fsync a file or directory
3855 : *
3856 : * If ignore_perm is true, ignore errors upon trying to open unreadable
3857 : * files. Logs other errors at a caller-specified level.
3858 : *
3859 : * Returns 0 if the operation succeeded, -1 otherwise.
3860 : */
3861 : int
3862 68674 : fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3863 : {
3864 : int fd;
3865 : int flags;
3866 : int returncode;
3867 :
3868 : /*
3869 : * Some OSs require directories to be opened read-only whereas other
3870 : * systems don't allow us to fsync files opened read-only; so we need both
3871 : * cases here. Using O_RDWR will cause us to fail to fsync files that are
3872 : * not writable by our userid, but we assume that's OK.
3873 : */
3874 68674 : flags = PG_BINARY;
3875 68674 : if (!isdir)
3876 25278 : flags |= O_RDWR;
3877 : else
3878 43396 : flags |= O_RDONLY;
3879 :
3880 68674 : fd = OpenTransientFile(fname, flags);
3881 :
3882 : /*
3883 : * Some OSs don't allow us to open directories at all (Windows returns
3884 : * EACCES), just ignore the error in that case. If desired also silently
3885 : * ignoring errors about unreadable files. Log others.
3886 : */
3887 68674 : if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3888 0 : return 0;
3889 68674 : else if (fd < 0 && ignore_perm && errno == EACCES)
3890 0 : return 0;
3891 68674 : else if (fd < 0)
3892 : {
3893 0 : ereport(elevel,
3894 : (errcode_for_file_access(),
3895 : errmsg("could not open file \"%s\": %m", fname)));
3896 0 : return -1;
3897 : }
3898 :
3899 68674 : returncode = pg_fsync(fd);
3900 :
3901 : /*
3902 : * Some OSes don't allow us to fsync directories at all, so we can ignore
3903 : * those errors. Anything else needs to be logged.
3904 : */
3905 68674 : if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3906 : {
3907 : int save_errno;
3908 :
3909 : /* close file upon error, might not be in transaction context */
3910 0 : save_errno = errno;
3911 0 : (void) CloseTransientFile(fd);
3912 0 : errno = save_errno;
3913 :
3914 0 : ereport(elevel,
3915 : (errcode_for_file_access(),
3916 : errmsg("could not fsync file \"%s\": %m", fname)));
3917 0 : return -1;
3918 : }
3919 :
3920 68674 : if (CloseTransientFile(fd) != 0)
3921 : {
3922 0 : ereport(elevel,
3923 : (errcode_for_file_access(),
3924 : errmsg("could not close file \"%s\": %m", fname)));
3925 0 : return -1;
3926 : }
3927 :
3928 68674 : return 0;
3929 : }
3930 :
3931 : /*
3932 : * fsync_parent_path -- fsync the parent path of a file or directory
3933 : *
3934 : * This is aimed at making file operations persistent on disk in case of
3935 : * an OS crash or power failure.
3936 : */
3937 : static int
3938 12060 : fsync_parent_path(const char *fname, int elevel)
3939 : {
3940 : char parentpath[MAXPGPATH];
3941 :
3942 12060 : strlcpy(parentpath, fname, MAXPGPATH);
3943 12060 : get_parent_directory(parentpath);
3944 :
3945 : /*
3946 : * get_parent_directory() returns an empty string if the input argument is
3947 : * just a file name (see comments in path.c), so handle that as being the
3948 : * current directory.
3949 : */
3950 12060 : if (strlen(parentpath) == 0)
3951 378 : strlcpy(parentpath, ".", MAXPGPATH);
3952 :
3953 12060 : if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3954 0 : return -1;
3955 :
3956 12060 : return 0;
3957 : }
3958 :
3959 : /*
3960 : * Create a PostgreSQL data sub-directory
3961 : *
3962 : * The data directory itself, and most of its sub-directories, are created at
3963 : * initdb time, but we do have some occasions when we create directories in
3964 : * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3965 : * make sure that those directories are created consistently. Today, that means
3966 : * making sure that the created directory has the correct permissions, which is
3967 : * what pg_dir_create_mode tracks for us.
3968 : *
3969 : * Note that we also set the umask() based on what we understand the correct
3970 : * permissions to be (see file_perm.c).
3971 : *
3972 : * For permissions other than the default, mkdir() can be used directly, but
3973 : * be sure to consider carefully such cases -- a sub-directory with incorrect
3974 : * permissions in a PostgreSQL data directory could cause backups and other
3975 : * processes to fail.
3976 : */
3977 : int
3978 2792 : MakePGDirectory(const char *directoryName)
3979 : {
3980 2792 : return mkdir(directoryName, pg_dir_create_mode);
3981 : }
3982 :
3983 : /*
3984 : * Return the passed-in error level, or PANIC if data_sync_retry is off.
3985 : *
3986 : * Failure to fsync any data file is cause for immediate panic, unless
3987 : * data_sync_retry is enabled. Data may have been written to the operating
3988 : * system and removed from our buffer pool already, and if we are running on
3989 : * an operating system that forgets dirty data on write-back failure, there
3990 : * may be only one copy of the data remaining: in the WAL. A later attempt to
3991 : * fsync again might falsely report success. Therefore we must not allow any
3992 : * further checkpoints to be attempted. data_sync_retry can in theory be
3993 : * enabled on systems known not to drop dirty buffered data on write-back
3994 : * failure (with the likely outcome that checkpoints will continue to fail
3995 : * until the underlying problem is fixed).
3996 : *
3997 : * Any code that reports a failure from fsync() or related functions should
3998 : * filter the error level with this function.
3999 : */
4000 : int
4001 36294 : data_sync_elevel(int elevel)
4002 : {
4003 36294 : return data_sync_retry ? elevel : PANIC;
4004 : }
4005 :
4006 : bool
4007 2102 : check_debug_io_direct(char **newval, void **extra, GucSource source)
4008 : {
4009 2102 : bool result = true;
4010 : int flags;
4011 :
4012 : #if PG_O_DIRECT == 0
4013 : if (strcmp(*newval, "") != 0)
4014 : {
4015 : GUC_check_errdetail("\"%s\" is not supported on this platform.",
4016 : "debug_io_direct");
4017 : result = false;
4018 : }
4019 : flags = 0;
4020 : #else
4021 : List *elemlist;
4022 : ListCell *l;
4023 : char *rawstring;
4024 :
4025 : /* Need a modifiable copy of string */
4026 2102 : rawstring = pstrdup(*newval);
4027 :
4028 2102 : if (!SplitGUCList(rawstring, ',', &elemlist))
4029 : {
4030 0 : GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
4031 : "debug_io_direct");
4032 0 : pfree(rawstring);
4033 0 : list_free(elemlist);
4034 0 : return false;
4035 : }
4036 :
4037 2102 : flags = 0;
4038 2114 : foreach(l, elemlist)
4039 : {
4040 12 : char *item = (char *) lfirst(l);
4041 :
4042 12 : if (pg_strcasecmp(item, "data") == 0)
4043 4 : flags |= IO_DIRECT_DATA;
4044 8 : else if (pg_strcasecmp(item, "wal") == 0)
4045 4 : flags |= IO_DIRECT_WAL;
4046 4 : else if (pg_strcasecmp(item, "wal_init") == 0)
4047 4 : flags |= IO_DIRECT_WAL_INIT;
4048 : else
4049 : {
4050 0 : GUC_check_errdetail("Invalid option \"%s\".", item);
4051 0 : result = false;
4052 0 : break;
4053 : }
4054 : }
4055 :
4056 : /*
4057 : * It's possible to configure block sizes smaller than our assumed I/O
4058 : * alignment size, which could result in invalid I/O requests.
4059 : */
4060 : #if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4061 : if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4062 : {
4063 : GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4064 : "debug_io_direct", "XLOG_BLCKSZ");
4065 : result = false;
4066 : }
4067 : #endif
4068 : #if BLCKSZ < PG_IO_ALIGN_SIZE
4069 : if (result && (flags & IO_DIRECT_DATA))
4070 : {
4071 : GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
4072 : "debug_io_direct", "BLCKSZ");
4073 : result = false;
4074 : }
4075 : #endif
4076 :
4077 2102 : pfree(rawstring);
4078 2102 : list_free(elemlist);
4079 : #endif
4080 :
4081 2102 : if (!result)
4082 0 : return result;
4083 :
4084 : /* Save the flags in *extra, for use by assign_debug_io_direct */
4085 2102 : *extra = guc_malloc(LOG, sizeof(int));
4086 2102 : if (!*extra)
4087 0 : return false;
4088 2102 : *((int *) *extra) = flags;
4089 :
4090 2102 : return result;
4091 : }
4092 :
4093 : void
4094 2102 : assign_debug_io_direct(const char *newval, void *extra)
4095 : {
4096 2102 : int *flags = (int *) extra;
4097 :
4098 2102 : io_direct_flags = *flags;
4099 2102 : }
4100 :
4101 : /* ResourceOwner callbacks */
4102 :
4103 : static void
4104 10 : ResOwnerReleaseFile(Datum res)
4105 : {
4106 10 : File file = (File) DatumGetInt32(res);
4107 : Vfd *vfdP;
4108 :
4109 : Assert(FileIsValid(file));
4110 :
4111 10 : vfdP = &VfdCache[file];
4112 10 : vfdP->resowner = NULL;
4113 :
4114 10 : FileClose(file);
4115 10 : }
4116 :
4117 : static char *
4118 0 : ResOwnerPrintFile(Datum res)
4119 : {
4120 0 : return psprintf("File %d", DatumGetInt32(res));
4121 : }
|