Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * fd.c
4 : * Virtual file descriptor code.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/storage/file/fd.c
11 : *
12 : * NOTES:
13 : *
14 : * This code manages a cache of 'virtual' file descriptors (VFDs).
15 : * The server opens many file descriptors for a variety of reasons,
16 : * including base tables, scratch files (e.g., sort and hash spool
17 : * files), and random calls to C library routines like system(3); it
18 : * is quite easy to exceed system limits on the number of open files a
19 : * single process can have. (This is around 1024 on many modern
20 : * operating systems, but may be lower on others.)
21 : *
22 : * VFDs are managed as an LRU pool, with actual OS file descriptors
23 : * being opened and closed as needed. Obviously, if a file is
24 : * opened using these interfaces, all subsequent operations must also
25 : * be through these interfaces (the File type is not a real file
26 : * descriptor).
27 : *
28 : * For this scheme to work, most (if not all) routines throughout the
29 : * server should use these interfaces instead of calling the C library
30 : * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 : * may find ourselves short of real file descriptors anyway.
32 : *
33 : * INTERFACE ROUTINES
34 : *
35 : * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 : * A File opened with OpenTemporaryFile is automatically deleted when the
37 : * File is closed, either explicitly or implicitly at end of transaction or
38 : * process exit. PathNameOpenFile is intended for files that are held open
39 : * for a long time, like relation files. It is the caller's responsibility
40 : * to close them, there is no automatic mechanism in fd.c for that.
41 : *
42 : * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 : * temporary files that have names so that they can be shared between
44 : * backends. Such files are automatically closed and count against the
45 : * temporary file limit of the backend that creates them, but unlike anonymous
46 : * files they are not automatically deleted. See sharedfileset.c for a shared
47 : * ownership mechanism that provides automatic cleanup for shared files when
48 : * the last of a group of backends detaches.
49 : *
50 : * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 : * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 : * They behave like the corresponding native functions, except that the handle
53 : * is registered with the current subtransaction, and will be automatically
54 : * closed at abort. These are intended mainly for short operations like
55 : * reading a configuration file; there is a limit on the number of files that
56 : * can be opened using these functions at any one time.
57 : *
58 : * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 : * release file descriptors in use by the virtual file descriptors if
60 : * necessary. There is no automatic cleanup of file descriptors returned by
61 : * BasicOpenFile, it is solely the caller's responsibility to close the file
62 : * descriptor by calling close(2).
63 : *
64 : * If a non-virtual file descriptor needs to be held open for any length of
65 : * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 : * (and eventually ReleaseExternalFD), so that we can take it into account
67 : * while deciding how many VFDs can be open. This applies to FDs obtained
68 : * with BasicOpenFile as well as those obtained without use of any fd.c API.
69 : *
70 : *-------------------------------------------------------------------------
71 : */
72 :
73 : #include "postgres.h"
74 :
75 : #include <dirent.h>
76 : #include <sys/file.h>
77 : #include <sys/param.h>
78 : #include <sys/resource.h> /* for getrlimit */
79 : #include <sys/stat.h>
80 : #include <sys/types.h>
81 : #ifndef WIN32
82 : #include <sys/mman.h>
83 : #endif
84 : #include <limits.h>
85 : #include <unistd.h>
86 : #include <fcntl.h>
87 :
88 : #include "access/xact.h"
89 : #include "access/xlog.h"
90 : #include "catalog/pg_tablespace.h"
91 : #include "common/file_perm.h"
92 : #include "common/file_utils.h"
93 : #include "common/pg_prng.h"
94 : #include "miscadmin.h"
95 : #include "pgstat.h"
96 : #include "postmaster/startup.h"
97 : #include "storage/fd.h"
98 : #include "storage/ipc.h"
99 : #include "utils/guc.h"
100 : #include "utils/guc_hooks.h"
101 : #include "utils/resowner.h"
102 : #include "utils/varlena.h"
103 :
104 : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
105 : #if defined(HAVE_SYNC_FILE_RANGE)
106 : #define PG_FLUSH_DATA_WORKS 1
107 : #elif !defined(WIN32) && defined(MS_ASYNC)
108 : #define PG_FLUSH_DATA_WORKS 1
109 : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
110 : #define PG_FLUSH_DATA_WORKS 1
111 : #endif
112 :
113 : /*
114 : * We must leave some file descriptors free for system(), the dynamic loader,
115 : * and other code that tries to open files without consulting fd.c. This
116 : * is the number left free. (While we try fairly hard to prevent EMFILE
117 : * errors, there's never any guarantee that we won't get ENFILE due to
118 : * other processes chewing up FDs. So it's a bad idea to try to open files
119 : * without consulting fd.c. Nonetheless we cannot control all code.)
120 : *
121 : * Because this is just a fixed setting, we are effectively assuming that
122 : * no such code will leave FDs open over the long term; otherwise the slop
123 : * is likely to be insufficient. Note in particular that we expect that
124 : * loading a shared library does not result in any permanent increase in
125 : * the number of open files. (This appears to be true on most if not
126 : * all platforms as of Feb 2004.)
127 : */
128 : #define NUM_RESERVED_FDS 10
129 :
130 : /*
131 : * If we have fewer than this many usable FDs after allowing for the reserved
132 : * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
133 : * much less than that. Note that this value ensures numExternalFDs can be
134 : * at least 16; as of this writing, the contrib/postgres_fdw regression tests
135 : * will not pass unless that can grow to at least 14.)
136 : */
137 : #define FD_MINFREE 48
138 :
139 : /*
140 : * A number of platforms allow individual processes to open many more files
141 : * than they can really support when *many* processes do the same thing.
142 : * This GUC parameter lets the DBA limit max_safe_fds to something less than
143 : * what the postmaster's initial probe suggests will work.
144 : */
145 : int max_files_per_process = 1000;
146 :
147 : /*
148 : * Maximum number of file descriptors to open for operations that fd.c knows
149 : * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
150 : * to a conservative value, and remains that way indefinitely in bootstrap or
151 : * standalone-backend cases. In normal postmaster operation, the postmaster
152 : * calls set_max_safe_fds() late in initialization to update the value, and
153 : * that value is then inherited by forked subprocesses.
154 : *
155 : * Note: the value of max_files_per_process is taken into account while
156 : * setting this variable, and so need not be tested separately.
157 : */
158 : int max_safe_fds = FD_MINFREE; /* default if not changed */
159 :
160 : /* Whether it is safe to continue running after fsync() fails. */
161 : bool data_sync_retry = false;
162 :
163 : /* How SyncDataDirectory() should do its job. */
164 : int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
165 :
166 : /* Which kinds of files should be opened with PG_O_DIRECT. */
167 : int io_direct_flags;
168 :
169 : /* Debugging.... */
170 :
171 : #ifdef FDDEBUG
172 : #define DO_DB(A) \
173 : do { \
174 : int _do_db_save_errno = errno; \
175 : A; \
176 : errno = _do_db_save_errno; \
177 : } while (0)
178 : #else
179 : #define DO_DB(A) \
180 : ((void) 0)
181 : #endif
182 :
183 : #define VFD_CLOSED (-1)
184 :
185 : #define FileIsValid(file) \
186 : ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
187 :
188 : #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
189 :
190 : /* these are the assigned bits in fdstate below: */
191 : #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
192 : #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
193 : #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
194 :
195 : typedef struct vfd
196 : {
197 : int fd; /* current FD, or VFD_CLOSED if none */
198 : unsigned short fdstate; /* bitflags for VFD's state */
199 : ResourceOwner resowner; /* owner, for automatic cleanup */
200 : File nextFree; /* link to next free VFD, if in freelist */
201 : File lruMoreRecently; /* doubly linked recency-of-use list */
202 : File lruLessRecently;
203 : off_t fileSize; /* current size of file (0 if not temporary) */
204 : char *fileName; /* name of file, or NULL for unused VFD */
205 : /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
206 : int fileFlags; /* open(2) flags for (re)opening the file */
207 : mode_t fileMode; /* mode to pass to open(2) */
208 : } Vfd;
209 :
210 : /*
211 : * Virtual File Descriptor array pointer and size. This grows as
212 : * needed. 'File' values are indexes into this array.
213 : * Note that VfdCache[0] is not a usable VFD, just a list header.
214 : */
215 : static Vfd *VfdCache;
216 : static Size SizeVfdCache = 0;
217 :
218 : /*
219 : * Number of file descriptors known to be in use by VFD entries.
220 : */
221 : static int nfile = 0;
222 :
223 : /*
224 : * Flag to tell whether it's worth scanning VfdCache looking for temp files
225 : * to close
226 : */
227 : static bool have_xact_temporary_files = false;
228 :
229 : /*
230 : * Tracks the total size of all temporary files. Note: when temp_file_limit
231 : * is being enforced, this cannot overflow since the limit cannot be more
232 : * than INT_MAX kilobytes. When not enforcing, it could theoretically
233 : * overflow, but we don't care.
234 : */
235 : static uint64 temporary_files_size = 0;
236 :
237 : /* Temporary file access initialized and not yet shut down? */
238 : #ifdef USE_ASSERT_CHECKING
239 : static bool temporary_files_allowed = false;
240 : #endif
241 :
242 : /*
243 : * List of OS handles opened with AllocateFile, AllocateDir and
244 : * OpenTransientFile.
245 : */
246 : typedef enum
247 : {
248 : AllocateDescFile,
249 : AllocateDescPipe,
250 : AllocateDescDir,
251 : AllocateDescRawFD,
252 : } AllocateDescKind;
253 :
254 : typedef struct
255 : {
256 : AllocateDescKind kind;
257 : SubTransactionId create_subid;
258 : union
259 : {
260 : FILE *file;
261 : DIR *dir;
262 : int fd;
263 : } desc;
264 : } AllocateDesc;
265 :
266 : static int numAllocatedDescs = 0;
267 : static int maxAllocatedDescs = 0;
268 : static AllocateDesc *allocatedDescs = NULL;
269 :
270 : /*
271 : * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
272 : */
273 : static int numExternalFDs = 0;
274 :
275 : /*
276 : * Number of temporary files opened during the current session;
277 : * this is used in generation of tempfile names.
278 : */
279 : static long tempFileCounter = 0;
280 :
281 : /*
282 : * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
283 : * indicating that the current database's default tablespace should be used.)
284 : * When numTempTableSpaces is -1, this has not been set in the current
285 : * transaction.
286 : */
287 : static Oid *tempTableSpaces = NULL;
288 : static int numTempTableSpaces = -1;
289 : static int nextTempTableSpace = 0;
290 :
291 :
292 : /*--------------------
293 : *
294 : * Private Routines
295 : *
296 : * Delete - delete a file from the Lru ring
297 : * LruDelete - remove a file from the Lru ring and close its FD
298 : * Insert - put a file at the front of the Lru ring
299 : * LruInsert - put a file at the front of the Lru ring and open it
300 : * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
301 : * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
302 : * AllocateVfd - grab a free (or new) file record (from VfdCache)
303 : * FreeVfd - free a file record
304 : *
305 : * The Least Recently Used ring is a doubly linked list that begins and
306 : * ends on element zero. Element zero is special -- it doesn't represent
307 : * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
308 : * anchor that shows us the beginning/end of the ring.
309 : * Only VFD elements that are currently really open (have an FD assigned) are
310 : * in the Lru ring. Elements that are "virtually" open can be recognized
311 : * by having a non-null fileName field.
312 : *
313 : * example:
314 : *
315 : * /--less----\ /---------\
316 : * v \ v \
317 : * #0 --more---> LeastRecentlyUsed --more-\ \
318 : * ^\ | |
319 : * \\less--> MostRecentlyUsedFile <---/ |
320 : * \more---/ \--less--/
321 : *
322 : *--------------------
323 : */
324 : static void Delete(File file);
325 : static void LruDelete(File file);
326 : static void Insert(File file);
327 : static int LruInsert(File file);
328 : static bool ReleaseLruFile(void);
329 : static void ReleaseLruFiles(void);
330 : static File AllocateVfd(void);
331 : static void FreeVfd(File file);
332 :
333 : static int FileAccess(File file);
334 : static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
335 : static bool reserveAllocatedDesc(void);
336 : static int FreeDesc(AllocateDesc *desc);
337 :
338 : static void BeforeShmemExit_Files(int code, Datum arg);
339 : static void CleanupTempFiles(bool isCommit, bool isProcExit);
340 : static void RemovePgTempRelationFiles(const char *tsdirname);
341 : static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
342 :
343 : static void walkdir(const char *path,
344 : void (*action) (const char *fname, bool isdir, int elevel),
345 : bool process_symlinks,
346 : int elevel);
347 : #ifdef PG_FLUSH_DATA_WORKS
348 : static void pre_sync_fname(const char *fname, bool isdir, int elevel);
349 : #endif
350 : static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
351 : static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
352 :
353 : static int fsync_parent_path(const char *fname, int elevel);
354 :
355 :
356 : /* ResourceOwner callbacks to hold virtual file descriptors */
357 : static void ResOwnerReleaseFile(Datum res);
358 : static char *ResOwnerPrintFile(Datum res);
359 :
360 : static const ResourceOwnerDesc file_resowner_desc =
361 : {
362 : .name = "File",
363 : .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
364 : .release_priority = RELEASE_PRIO_FILES,
365 : .ReleaseResource = ResOwnerReleaseFile,
366 : .DebugPrint = ResOwnerPrintFile
367 : };
368 :
369 : /* Convenience wrappers over ResourceOwnerRemember/Forget */
370 : static inline void
371 8616 : ResourceOwnerRememberFile(ResourceOwner owner, File file)
372 : {
373 8616 : ResourceOwnerRemember(owner, Int32GetDatum(file), &file_resowner_desc);
374 8616 : }
375 : static inline void
376 8606 : ResourceOwnerForgetFile(ResourceOwner owner, File file)
377 : {
378 8606 : ResourceOwnerForget(owner, Int32GetDatum(file), &file_resowner_desc);
379 8606 : }
380 :
/*
 * pg_fsync --- fsync a descriptor, with or without writethrough
 *
 * When the build has HAVE_FSYNC_WRITETHROUGH and wal_sync_method selects
 * WAL_SYNC_METHOD_FSYNC_WRITETHROUGH, this forwards to
 * pg_fsync_writethrough(); in every other case it forwards to
 * pg_fsync_no_writethrough().  The result of the chosen routine is returned
 * unchanged.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat statbuf;

	/*
	 * fsync() implementations differ in which access modes they require of
	 * the descriptor, and the requirement is not the same for directories
	 * as for plain files.  To keep callers portable even when the current
	 * platform is lenient, verify here that plain files were opened for
	 * writing (O_RDWR or O_WRONLY) and directories were not (O_RDONLY).
	 *
	 * A failing fstat() is ignored; the fsync() that follows will report
	 * whatever is wrong.  The check lives here, ahead of the dispatch, so
	 * that it still runs when fsync is disabled.
	 */
	if (fstat(fd, &statbuf) == 0)
	{
		int			open_flags = fcntl(fd, F_GETFL);

		/*
		 * O_RDONLY is historically 0, so it cannot be tested for directly;
		 * look at the write-capable bits instead.
		 */
		int			write_bits = open_flags & (O_RDWR | O_WRONLY);

		if (S_ISDIR(statbuf.st_mode))
			Assert(write_bits == 0);
		else
			Assert(write_bits != 0);
	}
	errno = 0;
#endif

	/* The wal_sync_method test is compiled only where it can matter. */
#if defined(HAVE_FSYNC_WRITETHROUGH)
	if (wal_sync_method == WAL_SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
#endif

	return pg_fsync_no_writethrough(fd);
}
433 :
434 :
435 : /*
436 : * pg_fsync_no_writethrough --- same as fsync except does nothing if
437 : * enableFsync is off
438 : */
439 : int
440 110988 : pg_fsync_no_writethrough(int fd)
441 : {
442 : int rc;
443 :
444 110988 : if (!enableFsync)
445 110988 : return 0;
446 :
447 0 : retry:
448 0 : rc = fsync(fd);
449 :
450 0 : if (rc == -1 && errno == EINTR)
451 0 : goto retry;
452 :
453 0 : return rc;
454 : }
455 :
456 : /*
457 : * pg_fsync_writethrough
458 : */
459 : int
460 0 : pg_fsync_writethrough(int fd)
461 : {
462 0 : if (enableFsync)
463 : {
464 : #if defined(F_FULLFSYNC)
465 : return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
466 : #else
467 0 : errno = ENOSYS;
468 0 : return -1;
469 : #endif
470 : }
471 : else
472 0 : return 0;
473 : }
474 :
475 : /*
476 : * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
477 : */
478 : int
479 0 : pg_fdatasync(int fd)
480 : {
481 : int rc;
482 :
483 0 : if (!enableFsync)
484 0 : return 0;
485 :
486 0 : retry:
487 0 : rc = fdatasync(fd);
488 :
489 0 : if (rc == -1 && errno == EINTR)
490 0 : goto retry;
491 :
492 0 : return rc;
493 : }
494 :
495 : /*
496 : * pg_file_exists -- check that a file exists.
497 : *
498 : * This requires an absolute path to the file. Returns true if the file is
499 : * not a directory, false otherwise.
500 : */
501 : bool
502 33160 : pg_file_exists(const char *name)
503 : {
504 : struct stat st;
505 :
506 : Assert(name != NULL);
507 :
508 33160 : if (stat(name, &st) == 0)
509 17310 : return !S_ISDIR(st.st_mode);
510 15850 : else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
511 0 : ereport(ERROR,
512 : (errcode_for_file_access(),
513 : errmsg("could not access file \"%s\": %m", name)));
514 :
515 15850 : return false;
516 : }
517 :
518 : /*
519 : * pg_flush_data --- advise OS that the described dirty data should be flushed
520 : *
521 : * offset of 0 with nbytes 0 means that the entire file should be flushed
522 : */
523 : void
524 58702 : pg_flush_data(int fd, off_t offset, off_t nbytes)
525 : {
526 : /*
527 : * Right now file flushing is primarily used to avoid making later
528 : * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
529 : * if fsyncs are disabled - that's a decision we might want to make
530 : * configurable at some point.
531 : */
532 58702 : if (!enableFsync)
533 58702 : return;
534 :
535 : /*
536 : * We compile all alternatives that are supported on the current platform,
537 : * to find portability problems more easily.
538 : */
539 : #if defined(HAVE_SYNC_FILE_RANGE)
540 : {
541 : int rc;
542 : static bool not_implemented_by_kernel = false;
543 :
544 0 : if (not_implemented_by_kernel)
545 0 : return;
546 :
547 0 : retry:
548 :
549 : /*
550 : * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
551 : * tells the OS that writeback for the specified blocks should be
552 : * started, but that we don't want to wait for completion. Note that
553 : * this call might block if too much dirty data exists in the range.
554 : * This is the preferable method on OSs supporting it, as it works
555 : * reliably when available (contrast to msync()) and doesn't flush out
556 : * clean data (like FADV_DONTNEED).
557 : */
558 0 : rc = sync_file_range(fd, offset, nbytes,
559 : SYNC_FILE_RANGE_WRITE);
560 0 : if (rc != 0)
561 : {
562 : int elevel;
563 :
564 0 : if (rc == EINTR)
565 0 : goto retry;
566 :
567 : /*
568 : * For systems that don't have an implementation of
569 : * sync_file_range() such as Windows WSL, generate only one
570 : * warning and then suppress all further attempts by this process.
571 : */
572 0 : if (errno == ENOSYS)
573 : {
574 0 : elevel = WARNING;
575 0 : not_implemented_by_kernel = true;
576 : }
577 : else
578 0 : elevel = data_sync_elevel(WARNING);
579 :
580 0 : ereport(elevel,
581 : (errcode_for_file_access(),
582 : errmsg("could not flush dirty data: %m")));
583 : }
584 :
585 0 : return;
586 : }
587 : #endif
588 : #if !defined(WIN32) && defined(MS_ASYNC)
589 : {
590 : void *p;
591 : static int pagesize = 0;
592 :
593 : /*
594 : * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
595 : * writeback. On linux it only does so if MS_SYNC is specified, but
596 : * then it does the writeback synchronously. Luckily all common linux
597 : * systems have sync_file_range(). This is preferable over
598 : * FADV_DONTNEED because it doesn't flush out clean data.
599 : *
600 : * We map the file (mmap()), tell the kernel to sync back the contents
601 : * (msync()), and then remove the mapping again (munmap()).
602 : */
603 :
604 : /* mmap() needs actual length if we want to map whole file */
605 : if (offset == 0 && nbytes == 0)
606 : {
607 : nbytes = lseek(fd, 0, SEEK_END);
608 : if (nbytes < 0)
609 : {
610 : ereport(WARNING,
611 : (errcode_for_file_access(),
612 : errmsg("could not determine dirty data size: %m")));
613 : return;
614 : }
615 : }
616 :
617 : /*
618 : * Some platforms reject partial-page mmap() attempts. To deal with
619 : * that, just truncate the request to a page boundary. If any extra
620 : * bytes don't get flushed, well, it's only a hint anyway.
621 : */
622 :
623 : /* fetch pagesize only once */
624 : if (pagesize == 0)
625 : pagesize = sysconf(_SC_PAGESIZE);
626 :
627 : /* align length to pagesize, dropping any fractional page */
628 : if (pagesize > 0)
629 : nbytes = (nbytes / pagesize) * pagesize;
630 :
631 : /* fractional-page request is a no-op */
632 : if (nbytes <= 0)
633 : return;
634 :
635 : /*
636 : * mmap could well fail, particularly on 32-bit platforms where there
637 : * may simply not be enough address space. If so, silently fall
638 : * through to the next implementation.
639 : */
640 : if (nbytes <= (off_t) SSIZE_MAX)
641 : p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
642 : else
643 : p = MAP_FAILED;
644 :
645 : if (p != MAP_FAILED)
646 : {
647 : int rc;
648 :
649 : rc = msync(p, (size_t) nbytes, MS_ASYNC);
650 : if (rc != 0)
651 : {
652 : ereport(data_sync_elevel(WARNING),
653 : (errcode_for_file_access(),
654 : errmsg("could not flush dirty data: %m")));
655 : /* NB: need to fall through to munmap()! */
656 : }
657 :
658 : rc = munmap(p, (size_t) nbytes);
659 : if (rc != 0)
660 : {
661 : /* FATAL error because mapping would remain */
662 : ereport(FATAL,
663 : (errcode_for_file_access(),
664 : errmsg("could not munmap() while flushing data: %m")));
665 : }
666 :
667 : return;
668 : }
669 : }
670 : #endif
671 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
672 : {
673 : int rc;
674 :
675 : /*
676 : * Signal the kernel that the passed in range should not be cached
677 : * anymore. This has the, desired, side effect of writing out dirty
678 : * data, and the, undesired, side effect of likely discarding useful
679 : * clean cached blocks. For the latter reason this is the least
680 : * preferable method.
681 : */
682 :
683 : rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
684 :
685 : if (rc != 0)
686 : {
687 : /* don't error out, this is just a performance optimization */
688 : ereport(WARNING,
689 : (errcode_for_file_access(),
690 : errmsg("could not flush dirty data: %m")));
691 : }
692 :
693 : return;
694 : }
695 : #endif
696 : }
697 :
/*
 * Truncate an open file to a given length.
 *
 * Calls ftruncate(), reissuing it for as long as it fails with EINTR, and
 * returns the final ftruncate() result (errno is left as ftruncate() set it).
 */
static int
pg_ftruncate(int fd, off_t length)
{
	int			result;

	do
	{
		result = ftruncate(fd, length);
	} while (result == -1 && errno == EINTR);

	return result;
}
714 :
/*
 * Truncate a file to a given length by name.
 *
 * On WIN32 the file is opened with OpenTransientFile(), truncated through
 * pg_ftruncate(), and closed again, with errno from the truncation preserved
 * across the close; a failed open yields -1.  Elsewhere truncate() is called
 * directly and reissued for as long as it fails with EINTR.
 *
 * Returns the result of the truncation call, or -1 if the WIN32 open failed.
 */
int
pg_truncate(const char *path, off_t length)
{
	int			result;
#ifdef WIN32
	int			fd = OpenTransientFile(path, O_RDWR | PG_BINARY);

	if (fd < 0)
		result = -1;
	else
	{
		int			truncate_errno;

		result = pg_ftruncate(fd, length);

		/* don't let the close clobber the errno the caller will look at */
		truncate_errno = errno;
		CloseTransientFile(fd);
		errno = truncate_errno;
	}
#else

	do
	{
		result = truncate(path, length);
	} while (result == -1 && errno == EINTR);
#endif

	return result;
}
747 :
748 : /*
749 : * fsync_fname -- fsync a file or directory, handling errors properly
750 : *
751 : * Try to fsync a file or directory. When doing the latter, ignore errors that
752 : * indicate the OS just doesn't allow/require fsyncing directories.
753 : */
754 : void
755 35108 : fsync_fname(const char *fname, bool isdir)
756 : {
757 35108 : fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
758 35108 : }
759 :
760 : /*
761 : * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
762 : *
763 : * This routine ensures that, after returning, the effect of renaming file
764 : * persists in case of a crash. A crash while this routine is running will
765 : * leave you with either the pre-existing or the moved file in place of the
766 : * new file; no mixed state or truncated files are possible.
767 : *
768 : * It does so by using fsync on the old filename and the possibly existing
769 : * target filename before the rename, and the target file and directory after.
770 : *
771 : * Note that rename() cannot be used across arbitrary directories, as they
772 : * might not be on the same filesystem. Therefore this routine does not
773 : * support renaming across directories.
774 : *
775 : * Log errors with the caller specified severity.
776 : *
777 : * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
778 : * valid upon return.
779 : */
780 : int
781 9780 : durable_rename(const char *oldfile, const char *newfile, int elevel)
782 : {
783 : int fd;
784 :
785 : /*
786 : * First fsync the old and target path (if it exists), to ensure that they
787 : * are properly persistent on disk. Syncing the target file is not
788 : * strictly necessary, but it makes it easier to reason about crashes;
789 : * because it's then guaranteed that either source or target file exists
790 : * after a crash.
791 : */
792 9780 : if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
793 0 : return -1;
794 :
795 9780 : fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
796 9780 : if (fd < 0)
797 : {
798 6894 : if (errno != ENOENT)
799 : {
800 0 : ereport(elevel,
801 : (errcode_for_file_access(),
802 : errmsg("could not open file \"%s\": %m", newfile)));
803 0 : return -1;
804 : }
805 : }
806 : else
807 : {
808 2886 : if (pg_fsync(fd) != 0)
809 : {
810 : int save_errno;
811 :
812 : /* close file upon error, might not be in transaction context */
813 0 : save_errno = errno;
814 0 : CloseTransientFile(fd);
815 0 : errno = save_errno;
816 :
817 0 : ereport(elevel,
818 : (errcode_for_file_access(),
819 : errmsg("could not fsync file \"%s\": %m", newfile)));
820 0 : return -1;
821 : }
822 :
823 2886 : if (CloseTransientFile(fd) != 0)
824 : {
825 0 : ereport(elevel,
826 : (errcode_for_file_access(),
827 : errmsg("could not close file \"%s\": %m", newfile)));
828 0 : return -1;
829 : }
830 : }
831 :
832 : /* Time to do the real deal... */
833 9780 : if (rename(oldfile, newfile) < 0)
834 : {
835 0 : ereport(elevel,
836 : (errcode_for_file_access(),
837 : errmsg("could not rename file \"%s\" to \"%s\": %m",
838 : oldfile, newfile)));
839 0 : return -1;
840 : }
841 :
842 : /*
843 : * To guarantee renaming the file is persistent, fsync the file with its
844 : * new name, and its containing directory.
845 : */
846 9780 : if (fsync_fname_ext(newfile, false, false, elevel) != 0)
847 0 : return -1;
848 :
849 9780 : if (fsync_parent_path(newfile, elevel) != 0)
850 0 : return -1;
851 :
852 9780 : return 0;
853 : }
854 :
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * After this returns successfully, the removal survives a crash.  A crash
 * while it is running leaves the system in no mixed state.
 *
 * The file is unlinked first, and its parent directory is fsync'ed
 * afterwards.
 *
 * Failures are logged at the caller-supplied elevel.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	int			unlink_rc = unlink(fname);

	if (unlink_rc < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * The removal only becomes persistent once the parent directory has been
	 * synced.
	 */
	return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}
891 :
892 : /*
893 : * InitFileAccess --- initialize this module during backend startup
894 : *
895 : * This is called during either normal or standalone backend start.
896 : * It is *not* called in the postmaster.
897 : *
898 : * Note that this does not initialize temporary file access, that is
899 : * separately initialized via InitTemporaryFileAccess().
900 : */
901 : void
902 37286 : InitFileAccess(void)
903 : {
904 : Assert(SizeVfdCache == 0); /* call me only once */
905 :
906 : /* initialize cache header entry */
907 37286 : VfdCache = (Vfd *) malloc(sizeof(Vfd));
908 37286 : if (VfdCache == NULL)
909 0 : ereport(FATAL,
910 : (errcode(ERRCODE_OUT_OF_MEMORY),
911 : errmsg("out of memory")));
912 :
913 298288 : MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
914 37286 : VfdCache->fd = VFD_CLOSED;
915 :
916 37286 : SizeVfdCache = 1;
917 37286 : }
918 :
919 : /*
920 : * InitTemporaryFileAccess --- initialize temporary file access during startup
921 : *
922 : * This is called during either normal or standalone backend start.
923 : * It is *not* called in the postmaster.
924 : *
925 : * This is separate from InitFileAccess() because temporary file cleanup can
926 : * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
927 : * our reporting has to happen before that. Low level file access should be
928 : * available for longer, hence the separate initialization / shutdown of
929 : * temporary file handling.
930 : */
931 : void
932 37286 : InitTemporaryFileAccess(void)
933 : {
934 : Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
935 : Assert(!temporary_files_allowed); /* call me only once */
936 :
937 : /*
938 : * Register before-shmem-exit hook to ensure temp files are dropped while
939 : * we can still report stats.
940 : */
941 37286 : before_shmem_exit(BeforeShmemExit_Files, 0);
942 :
943 : #ifdef USE_ASSERT_CHECKING
944 : temporary_files_allowed = true;
945 : #endif
946 37286 : }
947 :
948 : /*
949 : * count_usable_fds --- count how many FDs the system will let us open,
950 : * and estimate how many are already open.
951 : *
952 : * We stop counting if usable_fds reaches max_to_probe. Note: a small
953 : * value of max_to_probe might result in an underestimate of already_open;
954 : * we must fill in any "gaps" in the set of used FDs before the calculation
955 : * of already_open will give the right answer. In practice, max_to_probe
956 : * of a couple of dozen should be enough to ensure good results.
957 : *
958 : * We assume stderr (FD 2) is available for dup'ing. While the calling
959 : * script could theoretically close that, it would be a really bad idea,
960 : * since then one risks loss of error messages from, e.g., libc.
961 : */
962 : static void
963 1908 : count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
964 : {
965 : int *fd;
966 : int size;
967 1908 : int used = 0;
968 1908 : int highestfd = 0;
969 : int j;
970 :
971 : #ifdef HAVE_GETRLIMIT
972 : struct rlimit rlim;
973 : int getrlimit_status;
974 : #endif
975 :
976 1908 : size = 1024;
977 1908 : fd = (int *) palloc(size * sizeof(int));
978 :
979 : #ifdef HAVE_GETRLIMIT
980 1908 : getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
981 1908 : if (getrlimit_status != 0)
982 0 : ereport(WARNING, (errmsg("getrlimit failed: %m")));
983 : #endif /* HAVE_GETRLIMIT */
984 :
985 : /* dup until failure or probe limit reached */
986 : for (;;)
987 1906092 : {
988 : int thisfd;
989 :
990 : #ifdef HAVE_GETRLIMIT
991 :
992 : /*
993 : * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
994 : * some platforms
995 : */
996 1908000 : if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
997 0 : break;
998 : #endif
999 :
1000 1908000 : thisfd = dup(2);
1001 1908000 : if (thisfd < 0)
1002 : {
1003 : /* Expect EMFILE or ENFILE, else it's fishy */
1004 0 : if (errno != EMFILE && errno != ENFILE)
1005 0 : elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1006 0 : break;
1007 : }
1008 :
1009 1908000 : if (used >= size)
1010 : {
1011 0 : size *= 2;
1012 0 : fd = (int *) repalloc(fd, size * sizeof(int));
1013 : }
1014 1908000 : fd[used++] = thisfd;
1015 :
1016 1908000 : if (highestfd < thisfd)
1017 1908000 : highestfd = thisfd;
1018 :
1019 1908000 : if (used >= max_to_probe)
1020 1908 : break;
1021 : }
1022 :
1023 : /* release the files we opened */
1024 1909908 : for (j = 0; j < used; j++)
1025 1908000 : close(fd[j]);
1026 :
1027 1908 : pfree(fd);
1028 :
1029 : /*
1030 : * Return results. usable_fds is just the number of successful dups. We
1031 : * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1032 : * number) and so already_open is highestfd+1 - usable_fds.
1033 : */
1034 1908 : *usable_fds = used;
1035 1908 : *already_open = highestfd + 1 - used;
1036 1908 : }
1037 :
1038 : /*
1039 : * set_max_safe_fds
1040 : * Determine number of file descriptors that fd.c is allowed to use
1041 : */
1042 : void
1043 1908 : set_max_safe_fds(void)
1044 : {
1045 : int usable_fds;
1046 : int already_open;
1047 :
1048 : /*----------
1049 : * We want to set max_safe_fds to
1050 : * MIN(usable_fds, max_files_per_process - already_open)
1051 : * less the slop factor for files that are opened without consulting
1052 : * fd.c. This ensures that we won't exceed either max_files_per_process
1053 : * or the experimentally-determined EMFILE limit.
1054 : *----------
1055 : */
1056 1908 : count_usable_fds(max_files_per_process,
1057 : &usable_fds, &already_open);
1058 :
1059 1908 : max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
1060 :
1061 : /*
1062 : * Take off the FDs reserved for system() etc.
1063 : */
1064 1908 : max_safe_fds -= NUM_RESERVED_FDS;
1065 :
1066 : /*
1067 : * Make sure we still have enough to get by.
1068 : */
1069 1908 : if (max_safe_fds < FD_MINFREE)
1070 0 : ereport(FATAL,
1071 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1072 : errmsg("insufficient file descriptors available to start server process"),
1073 : errdetail("System allows %d, server needs at least %d.",
1074 : max_safe_fds + NUM_RESERVED_FDS,
1075 : FD_MINFREE + NUM_RESERVED_FDS)));
1076 :
1077 1908 : elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1078 : max_safe_fds, usable_fds, already_open);
1079 1908 : }
1080 :
1081 : /*
1082 : * Open a file with BasicOpenFilePerm() and pass default file mode for the
1083 : * fileMode parameter.
1084 : */
1085 : int
1086 86178 : BasicOpenFile(const char *fileName, int fileFlags)
1087 : {
1088 86178 : return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1089 : }
1090 :
1091 : /*
1092 : * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1093 : *
1094 : * This is exported for use by places that really want a plain kernel FD,
1095 : * but need to be proof against running out of FDs. Once an FD has been
1096 : * successfully returned, it is the caller's responsibility to ensure that
1097 : * it will not be leaked on ereport()! Most users should *not* call this
1098 : * routine directly, but instead use the VFD abstraction level, which
1099 : * provides protection against descriptor leaks as well as management of
1100 : * files that need to be open for more than a short period of time.
1101 : *
1102 : * Ideally this should be the *only* direct call of open() in the backend.
1103 : * In practice, the postmaster calls open() directly, and there are some
1104 : * direct open() calls done early in backend startup. Those are OK since
1105 : * this module wouldn't have any open files to close at that point anyway.
1106 : */
1107 : int
1108 17869400 : BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1109 : {
1110 : int fd;
1111 :
1112 17869400 : tryAgain:
1113 : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1114 :
1115 : /*
1116 : * The value we defined to stand in for O_DIRECT when simulating it with
1117 : * F_NOCACHE had better not collide with any of the standard flags.
1118 : */
1119 : StaticAssertStmt((PG_O_DIRECT &
1120 : (O_APPEND |
1121 : O_CLOEXEC |
1122 : O_CREAT |
1123 : O_DSYNC |
1124 : O_EXCL |
1125 : O_RDWR |
1126 : O_RDONLY |
1127 : O_SYNC |
1128 : O_TRUNC |
1129 : O_WRONLY)) == 0,
1130 : "PG_O_DIRECT value collides with standard flag");
1131 : fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1132 : #else
1133 17869400 : fd = open(fileName, fileFlags, fileMode);
1134 : #endif
1135 :
1136 17869400 : if (fd >= 0)
1137 : {
1138 : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1139 : if (fileFlags & PG_O_DIRECT)
1140 : {
1141 : if (fcntl(fd, F_NOCACHE, 1) < 0)
1142 : {
1143 : int save_errno = errno;
1144 :
1145 : close(fd);
1146 : errno = save_errno;
1147 : return -1;
1148 : }
1149 : }
1150 : #endif
1151 :
1152 17101198 : return fd; /* success! */
1153 : }
1154 :
1155 768202 : if (errno == EMFILE || errno == ENFILE)
1156 : {
1157 0 : int save_errno = errno;
1158 :
1159 0 : ereport(LOG,
1160 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1161 : errmsg("out of file descriptors: %m; release and retry")));
1162 0 : errno = 0;
1163 0 : if (ReleaseLruFile())
1164 0 : goto tryAgain;
1165 0 : errno = save_errno;
1166 : }
1167 :
1168 768202 : return -1; /* failure */
1169 : }
1170 :
1171 : /*
1172 : * AcquireExternalFD - attempt to reserve an external file descriptor
1173 : *
1174 : * This should be used by callers that need to hold a file descriptor open
1175 : * over more than a short interval, but cannot use any of the other facilities
1176 : * provided by this module.
1177 : *
1178 : * The difference between this and the underlying ReserveExternalFD function
1179 : * is that this will report failure (by setting errno and returning false)
1180 : * if "too many" external FDs are already reserved. This should be used in
1181 : * any code where the total number of FDs to be reserved is not predictable
1182 : * and small.
1183 : */
1184 : bool
1185 188892 : AcquireExternalFD(void)
1186 : {
1187 : /*
1188 : * We don't want more than max_safe_fds / 3 FDs to be consumed for
1189 : * "external" FDs.
1190 : */
1191 188892 : if (numExternalFDs < max_safe_fds / 3)
1192 : {
1193 188892 : ReserveExternalFD();
1194 188892 : return true;
1195 : }
1196 0 : errno = EMFILE;
1197 0 : return false;
1198 : }
1199 :
1200 : /*
1201 : * ReserveExternalFD - report external consumption of a file descriptor
1202 : *
1203 : * This should be used by callers that need to hold a file descriptor open
1204 : * over more than a short interval, but cannot use any of the other facilities
1205 : * provided by this module. This just tracks the use of the FD and closes
1206 : * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1207 : *
1208 : * Call this directly only in code where failure to reserve the FD would be
1209 : * fatal; for example, the WAL-writing code does so, since the alternative is
1210 : * session failure. Also, it's very unwise to do so in code that could
1211 : * consume more than one FD per process.
1212 : *
1213 : * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1214 : * available, it doesn't matter too much whether this is called before or
1215 : * after actually opening the FD; but doing so beforehand reduces the risk of
1216 : * an EMFILE failure if not everybody played nice. In any case, it's solely
1217 : * caller's responsibility to keep the external-FD count in sync with reality.
1218 : */
1219 : void
1220 324596 : ReserveExternalFD(void)
1221 : {
1222 : /*
1223 : * Release VFDs if needed to stay safe. Because we do this before
1224 : * incrementing numExternalFDs, the final state will be as desired, i.e.,
1225 : * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1226 : */
1227 324596 : ReleaseLruFiles();
1228 :
1229 324596 : numExternalFDs++;
1230 324596 : }
1231 :
1232 : /*
1233 : * ReleaseExternalFD - report release of an external file descriptor
1234 : *
1235 : * This is guaranteed not to change errno, so it can be used in failure paths.
1236 : */
1237 : void
1238 287572 : ReleaseExternalFD(void)
1239 : {
1240 : Assert(numExternalFDs > 0);
1241 287572 : numExternalFDs--;
1242 287572 : }
1243 :
1244 :
1245 : #if defined(FDDEBUG)
1246 :
1247 : static void
1248 : _dump_lru(void)
1249 : {
1250 : int mru = VfdCache[0].lruLessRecently;
1251 : Vfd *vfdP = &VfdCache[mru];
1252 : char buf[2048];
1253 :
1254 : snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1255 : while (mru != 0)
1256 : {
1257 : mru = vfdP->lruLessRecently;
1258 : vfdP = &VfdCache[mru];
1259 : snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1260 : }
1261 : snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1262 : elog(LOG, "%s", buf);
1263 : }
1264 : #endif /* FDDEBUG */
1265 :
1266 : static void
1267 2287122 : Delete(File file)
1268 : {
1269 : Vfd *vfdP;
1270 :
1271 : Assert(file != 0);
1272 :
1273 : DO_DB(elog(LOG, "Delete %d (%s)",
1274 : file, VfdCache[file].fileName));
1275 : DO_DB(_dump_lru());
1276 :
1277 2287122 : vfdP = &VfdCache[file];
1278 :
1279 2287122 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1280 2287122 : VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1281 :
1282 : DO_DB(_dump_lru());
1283 2287122 : }
1284 :
1285 : static void
1286 6634 : LruDelete(File file)
1287 : {
1288 : Vfd *vfdP;
1289 :
1290 : Assert(file != 0);
1291 :
1292 : DO_DB(elog(LOG, "LruDelete %d (%s)",
1293 : file, VfdCache[file].fileName));
1294 :
1295 6634 : vfdP = &VfdCache[file];
1296 :
1297 : /*
1298 : * Close the file. We aren't expecting this to fail; if it does, better
1299 : * to leak the FD than to mess up our internal state.
1300 : */
1301 6634 : if (close(vfdP->fd) != 0)
1302 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1303 : "could not close file \"%s\": %m", vfdP->fileName);
1304 6634 : vfdP->fd = VFD_CLOSED;
1305 6634 : --nfile;
1306 :
1307 : /* delete the vfd record from the LRU ring */
1308 6634 : Delete(file);
1309 6634 : }
1310 :
1311 : static void
1312 2962864 : Insert(File file)
1313 : {
1314 : Vfd *vfdP;
1315 :
1316 : Assert(file != 0);
1317 :
1318 : DO_DB(elog(LOG, "Insert %d (%s)",
1319 : file, VfdCache[file].fileName));
1320 : DO_DB(_dump_lru());
1321 :
1322 2962864 : vfdP = &VfdCache[file];
1323 :
1324 2962864 : vfdP->lruMoreRecently = 0;
1325 2962864 : vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1326 2962864 : VfdCache[0].lruLessRecently = file;
1327 2962864 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1328 :
1329 : DO_DB(_dump_lru());
1330 2962864 : }
1331 :
1332 : /* returns 0 on success, -1 on re-open failure (with errno set) */
1333 : static int
1334 96 : LruInsert(File file)
1335 : {
1336 : Vfd *vfdP;
1337 :
1338 : Assert(file != 0);
1339 :
1340 : DO_DB(elog(LOG, "LruInsert %d (%s)",
1341 : file, VfdCache[file].fileName));
1342 :
1343 96 : vfdP = &VfdCache[file];
1344 :
1345 96 : if (FileIsNotOpen(file))
1346 : {
1347 : /* Close excess kernel FDs. */
1348 96 : ReleaseLruFiles();
1349 :
1350 : /*
1351 : * The open could still fail for lack of file descriptors, eg due to
1352 : * overall system file table being full. So, be prepared to release
1353 : * another FD if necessary...
1354 : */
1355 96 : vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1356 : vfdP->fileMode);
1357 96 : if (vfdP->fd < 0)
1358 : {
1359 : DO_DB(elog(LOG, "re-open failed: %m"));
1360 0 : return -1;
1361 : }
1362 : else
1363 : {
1364 96 : ++nfile;
1365 : }
1366 : }
1367 :
1368 : /*
1369 : * put it at the head of the Lru ring
1370 : */
1371 :
1372 96 : Insert(file);
1373 :
1374 96 : return 0;
1375 : }
1376 :
1377 : /*
1378 : * Release one kernel FD by closing the least-recently-used VFD.
1379 : */
1380 : static bool
1381 6376 : ReleaseLruFile(void)
1382 : {
1383 : DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1384 :
1385 6376 : if (nfile > 0)
1386 : {
1387 : /*
1388 : * There are opened files and so there should be at least one used vfd
1389 : * in the ring.
1390 : */
1391 : Assert(VfdCache[0].lruMoreRecently != 0);
1392 6376 : LruDelete(VfdCache[0].lruMoreRecently);
1393 6376 : return true; /* freed a file */
1394 : }
1395 0 : return false; /* no files available to free */
1396 : }
1397 :
1398 : /*
1399 : * Release kernel FDs as needed to get under the max_safe_fds limit.
1400 : * After calling this, it's OK to try to open another file.
1401 : */
1402 : static void
1403 18338152 : ReleaseLruFiles(void)
1404 : {
1405 18344528 : while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
1406 : {
1407 6376 : if (!ReleaseLruFile())
1408 0 : break;
1409 : }
1410 18338152 : }
1411 :
1412 : static File
1413 2423742 : AllocateVfd(void)
1414 : {
1415 : Index i;
1416 : File file;
1417 :
1418 : DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1419 :
1420 : Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1421 :
1422 2423742 : if (VfdCache[0].nextFree == 0)
1423 : {
1424 : /*
1425 : * The free list is empty so it is time to increase the size of the
1426 : * array. We choose to double it each time this happens. However,
1427 : * there's not much point in starting *real* small.
1428 : */
1429 44500 : Size newCacheSize = SizeVfdCache * 2;
1430 : Vfd *newVfdCache;
1431 :
1432 44500 : if (newCacheSize < 32)
1433 33366 : newCacheSize = 32;
1434 :
1435 : /*
1436 : * Be careful not to clobber VfdCache ptr if realloc fails.
1437 : */
1438 44500 : newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1439 44500 : if (newVfdCache == NULL)
1440 0 : ereport(ERROR,
1441 : (errcode(ERRCODE_OUT_OF_MEMORY),
1442 : errmsg("out of memory")));
1443 44500 : VfdCache = newVfdCache;
1444 :
1445 : /*
1446 : * Initialize the new entries and link them into the free list.
1447 : */
1448 2100734 : for (i = SizeVfdCache; i < newCacheSize; i++)
1449 : {
1450 16449872 : MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1451 2056234 : VfdCache[i].nextFree = i + 1;
1452 2056234 : VfdCache[i].fd = VFD_CLOSED;
1453 : }
1454 44500 : VfdCache[newCacheSize - 1].nextFree = 0;
1455 44500 : VfdCache[0].nextFree = SizeVfdCache;
1456 :
1457 : /*
1458 : * Record the new size
1459 : */
1460 44500 : SizeVfdCache = newCacheSize;
1461 : }
1462 :
1463 2423742 : file = VfdCache[0].nextFree;
1464 :
1465 2423742 : VfdCache[0].nextFree = VfdCache[file].nextFree;
1466 :
1467 2423742 : return file;
1468 : }
1469 :
1470 : static void
1471 1743310 : FreeVfd(File file)
1472 : {
1473 1743310 : Vfd *vfdP = &VfdCache[file];
1474 :
1475 : DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1476 : file, vfdP->fileName ? vfdP->fileName : ""));
1477 :
1478 1743310 : if (vfdP->fileName != NULL)
1479 : {
1480 985728 : free(vfdP->fileName);
1481 985728 : vfdP->fileName = NULL;
1482 : }
1483 1743310 : vfdP->fdstate = 0x0;
1484 :
1485 1743310 : vfdP->nextFree = VfdCache[0].nextFree;
1486 1743310 : VfdCache[0].nextFree = file;
1487 1743310 : }
1488 :
1489 : /* returns 0 on success, -1 on re-open failure (with errno set) */
1490 : static int
1491 4934530 : FileAccess(File file)
1492 : {
1493 : int returnValue;
1494 :
1495 : DO_DB(elog(LOG, "FileAccess %d (%s)",
1496 : file, VfdCache[file].fileName));
1497 :
1498 : /*
1499 : * Is the file open? If not, open it and put it at the head of the LRU
1500 : * ring (possibly closing the least recently used file to get an FD).
1501 : */
1502 :
1503 4934530 : if (FileIsNotOpen(file))
1504 : {
1505 96 : returnValue = LruInsert(file);
1506 96 : if (returnValue != 0)
1507 0 : return returnValue;
1508 : }
1509 4934434 : else if (VfdCache[0].lruLessRecently != file)
1510 : {
1511 : /*
1512 : * We now know that the file is open and that it is not the last one
1513 : * accessed, so we need to move it to the head of the Lru ring.
1514 : */
1515 :
1516 1296608 : Delete(file);
1517 1296608 : Insert(file);
1518 : }
1519 :
1520 4934530 : return 0;
1521 : }
1522 :
1523 : /*
1524 : * Called whenever a temporary file is deleted to report its size.
1525 : */
1526 : static void
1527 5354 : ReportTemporaryFileUsage(const char *path, off_t size)
1528 : {
1529 5354 : pgstat_report_tempfile(size);
1530 :
1531 5354 : if (log_temp_files >= 0)
1532 : {
1533 1734 : if ((size / 1024) >= log_temp_files)
1534 228 : ereport(LOG,
1535 : (errmsg("temporary file: path \"%s\", size %lu",
1536 : path, (unsigned long) size)));
1537 : }
1538 5354 : }
1539 :
1540 : /*
1541 : * Called to register a temporary file for automatic close.
1542 : * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1543 : * before the file was opened.
1544 : */
1545 : static void
1546 8616 : RegisterTemporaryFile(File file)
1547 : {
1548 8616 : ResourceOwnerRememberFile(CurrentResourceOwner, file);
1549 8616 : VfdCache[file].resowner = CurrentResourceOwner;
1550 :
1551 : /* Backup mechanism for closing at end of xact. */
1552 8616 : VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1553 8616 : have_xact_temporary_files = true;
1554 8616 : }
1555 :
1556 : /*
1557 : * Called when we get a shared invalidation message on some relation.
1558 : */
1559 : #ifdef NOT_USED
1560 : void
1561 : FileInvalidate(File file)
1562 : {
1563 : Assert(FileIsValid(file));
1564 : if (!FileIsNotOpen(file))
1565 : LruDelete(file);
1566 : }
1567 : #endif
1568 :
1569 : /*
1570 : * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1571 : * fileMode parameter.
1572 : */
1573 : File
1574 2423742 : PathNameOpenFile(const char *fileName, int fileFlags)
1575 : {
1576 2423742 : return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1577 : }
1578 :
1579 : /*
1580 : * open a file in an arbitrary directory
1581 : *
1582 : * NB: if the passed pathname is relative (which it usually is),
1583 : * it will be interpreted relative to the process' working directory
1584 : * (which should always be $PGDATA when this code is running).
1585 : */
1586 : File
1587 2423742 : PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1588 : {
1589 : char *fnamecopy;
1590 : File file;
1591 : Vfd *vfdP;
1592 :
1593 : DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1594 : fileName, fileFlags, fileMode));
1595 :
1596 : /*
1597 : * We need a malloc'd copy of the file name; fail cleanly if no room.
1598 : */
1599 2423742 : fnamecopy = strdup(fileName);
1600 2423742 : if (fnamecopy == NULL)
1601 0 : ereport(ERROR,
1602 : (errcode(ERRCODE_OUT_OF_MEMORY),
1603 : errmsg("out of memory")));
1604 :
1605 2423742 : file = AllocateVfd();
1606 2423742 : vfdP = &VfdCache[file];
1607 :
1608 : /* Close excess kernel FDs. */
1609 2423742 : ReleaseLruFiles();
1610 :
1611 : /*
1612 : * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1613 : * client shouldn't be expected to know which kernel descriptors are
1614 : * currently open, so it wouldn't make sense for them to be inherited by
1615 : * executed subprograms.
1616 : */
1617 2423742 : fileFlags |= O_CLOEXEC;
1618 :
1619 2423742 : vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1620 :
1621 2423742 : if (vfdP->fd < 0)
1622 : {
1623 757582 : int save_errno = errno;
1624 :
1625 757582 : FreeVfd(file);
1626 757582 : free(fnamecopy);
1627 757582 : errno = save_errno;
1628 757582 : return -1;
1629 : }
1630 1666160 : ++nfile;
1631 : DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1632 : vfdP->fd));
1633 :
1634 1666160 : vfdP->fileName = fnamecopy;
1635 : /* Saved flags are adjusted to be OK for re-opening file */
1636 1666160 : vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1637 1666160 : vfdP->fileMode = fileMode;
1638 1666160 : vfdP->fileSize = 0;
1639 1666160 : vfdP->fdstate = 0x0;
1640 1666160 : vfdP->resowner = NULL;
1641 :
1642 1666160 : Insert(file);
1643 :
1644 1666160 : return file;
1645 : }
1646 :
1647 : /*
1648 : * Create directory 'directory'. If necessary, create 'basedir', which must
1649 : * be the directory above it. This is designed for creating the top-level
1650 : * temporary directory on demand before creating a directory underneath it.
1651 : * Do nothing if the directory already exists.
1652 : *
1653 : * Directories created within the top-level temporary directory should begin
1654 : * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1655 : * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1656 : * that do not need any particular prefix.
1657 : */
1658 : void
1659 350 : PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1660 : {
1661 350 : if (MakePGDirectory(directory) < 0)
1662 : {
1663 32 : if (errno == EEXIST)
1664 12 : return;
1665 :
1666 : /*
1667 : * Failed. Try to create basedir first in case it's missing. Tolerate
1668 : * EEXIST to close a race against another process following the same
1669 : * algorithm.
1670 : */
1671 20 : if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1672 0 : ereport(ERROR,
1673 : (errcode_for_file_access(),
1674 : errmsg("cannot create temporary directory \"%s\": %m",
1675 : basedir)));
1676 :
1677 : /* Try again. */
1678 20 : if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1679 0 : ereport(ERROR,
1680 : (errcode_for_file_access(),
1681 : errmsg("cannot create temporary subdirectory \"%s\": %m",
1682 : directory)));
1683 : }
1684 : }
1685 :
1686 : /*
1687 : * Delete a directory and everything in it, if it exists.
1688 : */
1689 : void
1690 418 : PathNameDeleteTemporaryDir(const char *dirname)
1691 : {
1692 : struct stat statbuf;
1693 :
1694 : /* Silently ignore missing directory. */
1695 418 : if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1696 80 : return;
1697 :
1698 : /*
1699 : * Currently, walkdir doesn't offer a way for our passed in function to
1700 : * maintain state. Perhaps it should, so that we could tell the caller
1701 : * whether this operation succeeded or failed. Since this operation is
1702 : * used in a cleanup path, we wouldn't actually behave differently: we'll
1703 : * just log failures.
1704 : */
1705 338 : walkdir(dirname, unlink_if_exists_fname, false, LOG);
1706 : }
1707 :
1708 : /*
1709 : * Open a temporary file that will disappear when we close it.
1710 : *
1711 : * This routine takes care of generating an appropriate tempfile name.
1712 : * There's no need to pass in fileFlags or fileMode either, since only
1713 : * one setting makes any sense for a temp file.
1714 : *
1715 : * Unless interXact is true, the file is remembered by CurrentResourceOwner
1716 : * to ensure it's closed and deleted when it's no longer needed, typically at
1717 : * the end-of-transaction. In most cases, you don't want temporary files to
1718 : * outlive the transaction that created them, so this should be false -- but
1719 : * if you need "somewhat" temporary storage, this might be useful. In either
1720 : * case, the file is removed when the File is explicitly closed.
1721 : */
1722 : File
1723 2988 : OpenTemporaryFile(bool interXact)
1724 : {
1725 2988 : File file = 0;
1726 :
1727 : Assert(temporary_files_allowed); /* check temp file access is up */
1728 :
1729 : /*
1730 : * Make sure the current resource owner has space for this File before we
1731 : * open it, if we'll be registering it below.
1732 : */
1733 2988 : if (!interXact)
1734 2988 : ResourceOwnerEnlarge(CurrentResourceOwner);
1735 :
1736 : /*
1737 : * If some temp tablespace(s) have been given to us, try to use the next
1738 : * one. If a given tablespace can't be found, we silently fall back to
1739 : * the database's default tablespace.
1740 : *
1741 : * BUT: if the temp file is slated to outlive the current transaction,
1742 : * force it into the database's default tablespace, so that it will not
1743 : * pose a threat to possible tablespace drop attempts.
1744 : */
1745 2988 : if (numTempTableSpaces > 0 && !interXact)
1746 : {
1747 2 : Oid tblspcOid = GetNextTempTableSpace();
1748 :
1749 2 : if (OidIsValid(tblspcOid))
1750 2 : file = OpenTemporaryFileInTablespace(tblspcOid, false);
1751 : }
1752 :
1753 : /*
1754 : * If not, or if tablespace is bad, create in database's default
1755 : * tablespace. MyDatabaseTableSpace should normally be set before we get
1756 : * here, but just in case it isn't, fall back to pg_default tablespace.
1757 : */
1758 2988 : if (file <= 0)
1759 2986 : file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1760 : MyDatabaseTableSpace :
1761 : DEFAULTTABLESPACE_OID,
1762 : true);
1763 :
1764 : /* Mark it for deletion at close and temporary file size limit */
1765 2988 : VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1766 :
1767 : /* Register it with the current resource owner */
1768 2988 : if (!interXact)
1769 2988 : RegisterTemporaryFile(file);
1770 :
1771 2988 : return file;
1772 : }
1773 :
1774 : /*
1775 : * Return the path of the temp directory in a given tablespace.
1776 : */
1777 : void
1778 16690 : TempTablespacePath(char *path, Oid tablespace)
1779 : {
1780 : /*
1781 : * Identify the tempfile directory for this tablespace.
1782 : *
1783 : * If someone tries to specify pg_global, use pg_default instead.
1784 : */
1785 16690 : if (tablespace == InvalidOid ||
1786 2 : tablespace == DEFAULTTABLESPACE_OID ||
1787 : tablespace == GLOBALTABLESPACE_OID)
1788 16688 : snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1789 : else
1790 : {
1791 : /* All other tablespaces are accessed via symlinks */
1792 2 : snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1793 : PG_TBLSPC_DIR, tablespace, TABLESPACE_VERSION_DIRECTORY,
1794 : PG_TEMP_FILES_DIR);
1795 : }
1796 16690 : }
1797 :
1798 : /*
1799 : * Open a temporary file in a specific tablespace.
1800 : * Subroutine for OpenTemporaryFile, which see for details.
1801 : */
1802 : static File
1803 2988 : OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1804 : {
1805 : char tempdirpath[MAXPGPATH];
1806 : char tempfilepath[MAXPGPATH];
1807 : File file;
1808 :
1809 2988 : TempTablespacePath(tempdirpath, tblspcOid);
1810 :
1811 : /*
1812 : * Generate a tempfile name that should be unique within the current
1813 : * database instance.
1814 : */
1815 2988 : snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1816 : tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1817 :
1818 : /*
1819 : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1820 : * temp file that can be reused.
1821 : */
1822 2988 : file = PathNameOpenFile(tempfilepath,
1823 : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1824 2988 : if (file <= 0)
1825 : {
1826 : /*
1827 : * We might need to create the tablespace's tempfile directory, if no
1828 : * one has yet done so.
1829 : *
1830 : * Don't check for an error from MakePGDirectory; it could fail if
1831 : * someone else just did the same thing. If it doesn't work then
1832 : * we'll bomb out on the second create attempt, instead.
1833 : */
1834 176 : (void) MakePGDirectory(tempdirpath);
1835 :
1836 176 : file = PathNameOpenFile(tempfilepath,
1837 : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1838 176 : if (file <= 0 && rejectError)
1839 0 : elog(ERROR, "could not create temporary file \"%s\": %m",
1840 : tempfilepath);
1841 : }
1842 :
1843 2988 : return file;
1844 : }
1845 :
1846 :
1847 : /*
1848 : * Create a new file. The directory containing it must already exist. Files
1849 : * created this way are subject to temp_file_limit and are automatically
1850 : * closed at end of transaction, but are not automatically deleted on close
1851 : * because they are intended to be shared between cooperating backends.
1852 : *
1853 : * If the file is inside the top-level temporary directory, its name should
1854 : * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1855 : * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1856 : * inside a directory created with PathNameCreateTemporaryDir(), in which case
1857 : * the prefix isn't needed.
1858 : */
1859 : File
1860 2716 : PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1861 : {
1862 : File file;
1863 :
1864 : Assert(temporary_files_allowed); /* check temp file access is up */
1865 :
1866 2716 : ResourceOwnerEnlarge(CurrentResourceOwner);
1867 :
1868 : /*
1869 : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1870 : * temp file that can be reused.
1871 : */
1872 2716 : file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1873 2716 : if (file <= 0)
1874 : {
1875 350 : if (error_on_failure)
1876 0 : ereport(ERROR,
1877 : (errcode_for_file_access(),
1878 : errmsg("could not create temporary file \"%s\": %m",
1879 : path)));
1880 : else
1881 350 : return file;
1882 : }
1883 :
1884 : /* Mark it for temp_file_limit accounting. */
1885 2366 : VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1886 :
1887 : /* Register it for automatic close. */
1888 2366 : RegisterTemporaryFile(file);
1889 :
1890 2366 : return file;
1891 : }
1892 :
1893 : /*
1894 : * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1895 : * another backend. Files opened this way don't count against the
1896 : * temp_file_limit of the caller, are automatically closed at the end of the
1897 : * transaction but are not deleted on close.
1898 : */
1899 : File
1900 7070 : PathNameOpenTemporaryFile(const char *path, int mode)
1901 : {
1902 : File file;
1903 :
1904 : Assert(temporary_files_allowed); /* check temp file access is up */
1905 :
1906 7070 : ResourceOwnerEnlarge(CurrentResourceOwner);
1907 :
1908 7070 : file = PathNameOpenFile(path, mode | PG_BINARY);
1909 :
1910 : /* If no such file, then we don't raise an error. */
1911 7070 : if (file <= 0 && errno != ENOENT)
1912 0 : ereport(ERROR,
1913 : (errcode_for_file_access(),
1914 : errmsg("could not open temporary file \"%s\": %m",
1915 : path)));
1916 :
1917 7070 : if (file > 0)
1918 : {
1919 : /* Register it for automatic close. */
1920 3262 : RegisterTemporaryFile(file);
1921 : }
1922 :
1923 7070 : return file;
1924 : }
1925 :
1926 : /*
1927 : * Delete a file by pathname. Return true if the file existed, false if
1928 : * didn't.
1929 : */
1930 : bool
1931 5436 : PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1932 : {
1933 : struct stat filestats;
1934 : int stat_errno;
1935 :
1936 : /* Get the final size for pgstat reporting. */
1937 5436 : if (stat(path, &filestats) != 0)
1938 3070 : stat_errno = errno;
1939 : else
1940 2366 : stat_errno = 0;
1941 :
1942 : /*
1943 : * Unlike FileClose's automatic file deletion code, we tolerate
1944 : * non-existence to support BufFileDeleteFileSet which doesn't know how
1945 : * many segments it has to delete until it runs out.
1946 : */
1947 5436 : if (stat_errno == ENOENT)
1948 3070 : return false;
1949 :
1950 2366 : if (unlink(path) < 0)
1951 : {
1952 0 : if (errno != ENOENT)
1953 0 : ereport(error_on_failure ? ERROR : LOG,
1954 : (errcode_for_file_access(),
1955 : errmsg("could not unlink temporary file \"%s\": %m",
1956 : path)));
1957 0 : return false;
1958 : }
1959 :
1960 2366 : if (stat_errno == 0)
1961 2366 : ReportTemporaryFileUsage(path, filestats.st_size);
1962 : else
1963 : {
1964 0 : errno = stat_errno;
1965 0 : ereport(LOG,
1966 : (errcode_for_file_access(),
1967 : errmsg("could not stat file \"%s\": %m", path)));
1968 : }
1969 :
1970 2366 : return true;
1971 : }
1972 :
1973 : /*
1974 : * close a file when done with it
1975 : */
1976 : void
1977 985728 : FileClose(File file)
1978 : {
1979 : Vfd *vfdP;
1980 :
1981 : Assert(FileIsValid(file));
1982 :
1983 : DO_DB(elog(LOG, "FileClose: %d (%s)",
1984 : file, VfdCache[file].fileName));
1985 :
1986 985728 : vfdP = &VfdCache[file];
1987 :
1988 985728 : if (!FileIsNotOpen(file))
1989 : {
1990 : /* close the file */
1991 983880 : if (close(vfdP->fd) != 0)
1992 : {
1993 : /*
1994 : * We may need to panic on failure to close non-temporary files;
1995 : * see LruDelete.
1996 : */
1997 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1998 : "could not close file \"%s\": %m", vfdP->fileName);
1999 : }
2000 :
2001 983880 : --nfile;
2002 983880 : vfdP->fd = VFD_CLOSED;
2003 :
2004 : /* remove the file from the lru ring */
2005 983880 : Delete(file);
2006 : }
2007 :
2008 985728 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2009 : {
2010 : /* Subtract its size from current usage (do first in case of error) */
2011 5354 : temporary_files_size -= vfdP->fileSize;
2012 5354 : vfdP->fileSize = 0;
2013 : }
2014 :
2015 : /*
2016 : * Delete the file if it was temporary, and make a log entry if wanted
2017 : */
2018 985728 : if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2019 : {
2020 : struct stat filestats;
2021 : int stat_errno;
2022 :
2023 : /*
2024 : * If we get an error, as could happen within the ereport/elog calls,
2025 : * we'll come right back here during transaction abort. Reset the
2026 : * flag to ensure that we can't get into an infinite loop. This code
2027 : * is arranged to ensure that the worst-case consequence is failing to
2028 : * emit log message(s), not failing to attempt the unlink.
2029 : */
2030 2988 : vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2031 :
2032 :
2033 : /* first try the stat() */
2034 2988 : if (stat(vfdP->fileName, &filestats))
2035 0 : stat_errno = errno;
2036 : else
2037 2988 : stat_errno = 0;
2038 :
2039 : /* in any case do the unlink */
2040 2988 : if (unlink(vfdP->fileName))
2041 0 : ereport(LOG,
2042 : (errcode_for_file_access(),
2043 : errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2044 :
2045 : /* and last report the stat results */
2046 2988 : if (stat_errno == 0)
2047 2988 : ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2048 : else
2049 : {
2050 0 : errno = stat_errno;
2051 0 : ereport(LOG,
2052 : (errcode_for_file_access(),
2053 : errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2054 : }
2055 : }
2056 :
2057 : /* Unregister it from the resource owner */
2058 985728 : if (vfdP->resowner)
2059 8606 : ResourceOwnerForgetFile(vfdP->resowner, file);
2060 :
2061 : /*
2062 : * Return the Vfd slot to the free list
2063 : */
2064 985728 : FreeVfd(file);
2065 985728 : }
2066 :
2067 : /*
2068 : * FilePrefetch - initiate asynchronous read of a given range of the file.
2069 : *
2070 : * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2071 : *
2072 : * posix_fadvise() is the simplest standardized interface that accomplishes
2073 : * this.
2074 : */
2075 : int
2076 173396 : FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
2077 : {
2078 : Assert(FileIsValid(file));
2079 :
2080 : DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2081 : file, VfdCache[file].fileName,
2082 : (int64) offset, (int64) amount));
2083 :
2084 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2085 : {
2086 : int returnCode;
2087 :
2088 173396 : returnCode = FileAccess(file);
2089 173396 : if (returnCode < 0)
2090 0 : return returnCode;
2091 :
2092 173396 : retry:
2093 173396 : pgstat_report_wait_start(wait_event_info);
2094 173396 : returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2095 : POSIX_FADV_WILLNEED);
2096 173396 : pgstat_report_wait_end();
2097 :
2098 173396 : if (returnCode == EINTR)
2099 0 : goto retry;
2100 :
2101 173396 : return returnCode;
2102 : }
2103 : #elif defined(__darwin__)
2104 : {
2105 : struct radvisory
2106 : {
2107 : off_t ra_offset; /* offset into the file */
2108 : int ra_count; /* size of the read */
2109 : } ra;
2110 : int returnCode;
2111 :
2112 : returnCode = FileAccess(file);
2113 : if (returnCode < 0)
2114 : return returnCode;
2115 :
2116 : ra.ra_offset = offset;
2117 : ra.ra_count = amount;
2118 : pgstat_report_wait_start(wait_event_info);
2119 : returnCode = fcntl(VfdCache[file].fd, F_RDADVISE, &ra);
2120 : pgstat_report_wait_end();
2121 : if (returnCode != -1)
2122 : return 0;
2123 : else
2124 : return errno;
2125 : }
2126 : #else
2127 : return 0;
2128 : #endif
2129 : }
2130 :
2131 : void
2132 0 : FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2133 : {
2134 : int returnCode;
2135 :
2136 : Assert(FileIsValid(file));
2137 :
2138 : DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2139 : file, VfdCache[file].fileName,
2140 : (int64) offset, (int64) nbytes));
2141 :
2142 0 : if (nbytes <= 0)
2143 0 : return;
2144 :
2145 0 : if (VfdCache[file].fileFlags & PG_O_DIRECT)
2146 0 : return;
2147 :
2148 0 : returnCode = FileAccess(file);
2149 0 : if (returnCode < 0)
2150 0 : return;
2151 :
2152 0 : pgstat_report_wait_start(wait_event_info);
2153 0 : pg_flush_data(VfdCache[file].fd, offset, nbytes);
2154 0 : pgstat_report_wait_end();
2155 : }
2156 :
2157 : ssize_t
2158 3063854 : FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2159 : uint32 wait_event_info)
2160 : {
2161 : ssize_t returnCode;
2162 : Vfd *vfdP;
2163 :
2164 : Assert(FileIsValid(file));
2165 :
2166 : DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2167 : file, VfdCache[file].fileName,
2168 : (int64) offset,
2169 : iovcnt));
2170 :
2171 3063854 : returnCode = FileAccess(file);
2172 3063854 : if (returnCode < 0)
2173 0 : return returnCode;
2174 :
2175 3063854 : vfdP = &VfdCache[file];
2176 :
2177 3063854 : retry:
2178 3063854 : pgstat_report_wait_start(wait_event_info);
2179 3063854 : returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2180 3063854 : pgstat_report_wait_end();
2181 :
2182 3063854 : if (returnCode < 0)
2183 : {
2184 : /*
2185 : * Windows may run out of kernel buffers and return "Insufficient
2186 : * system resources" error. Wait a bit and retry to solve it.
2187 : *
2188 : * It is rumored that EINTR is also possible on some Unix filesystems,
2189 : * in which case immediate retry is indicated.
2190 : */
2191 : #ifdef WIN32
2192 : DWORD error = GetLastError();
2193 :
2194 : switch (error)
2195 : {
2196 : case ERROR_NO_SYSTEM_RESOURCES:
2197 : pg_usleep(1000L);
2198 : errno = EINTR;
2199 : break;
2200 : default:
2201 : _dosmaperr(error);
2202 : break;
2203 : }
2204 : #endif
2205 : /* OK to retry if interrupted */
2206 0 : if (errno == EINTR)
2207 0 : goto retry;
2208 : }
2209 :
2210 3063854 : return returnCode;
2211 : }
2212 :
2213 : ssize_t
2214 1300922 : FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2215 : uint32 wait_event_info)
2216 : {
2217 : ssize_t returnCode;
2218 : Vfd *vfdP;
2219 :
2220 : Assert(FileIsValid(file));
2221 :
2222 : DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2223 : file, VfdCache[file].fileName,
2224 : (int64) offset,
2225 : iovcnt));
2226 :
2227 1300922 : returnCode = FileAccess(file);
2228 1300922 : if (returnCode < 0)
2229 0 : return returnCode;
2230 :
2231 1300922 : vfdP = &VfdCache[file];
2232 :
2233 : /*
2234 : * If enforcing temp_file_limit and it's a temp file, check to see if the
2235 : * write would overrun temp_file_limit, and throw error if so. Note: it's
2236 : * really a modularity violation to throw error here; we should set errno
2237 : * and return -1. However, there's no way to report a suitable error
2238 : * message if we do that. All current callers would just throw error
2239 : * immediately anyway, so this is safe at present.
2240 : */
2241 1300922 : if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2242 : {
2243 0 : off_t past_write = offset;
2244 :
2245 0 : for (int i = 0; i < iovcnt; ++i)
2246 0 : past_write += iov[i].iov_len;
2247 :
2248 0 : if (past_write > vfdP->fileSize)
2249 : {
2250 0 : uint64 newTotal = temporary_files_size;
2251 :
2252 0 : newTotal += past_write - vfdP->fileSize;
2253 0 : if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2254 0 : ereport(ERROR,
2255 : (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2256 : errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2257 : temp_file_limit)));
2258 : }
2259 : }
2260 :
2261 1300922 : retry:
2262 1300922 : pgstat_report_wait_start(wait_event_info);
2263 1300922 : returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2264 1300922 : pgstat_report_wait_end();
2265 :
2266 1300922 : if (returnCode >= 0)
2267 : {
2268 : /*
2269 : * Some callers expect short writes to set errno, and traditionally we
2270 : * have assumed that they imply disk space shortage. We don't want to
2271 : * waste CPU cycles adding up the total size here, so we'll just set
2272 : * it for all successful writes in case such a caller determines that
2273 : * the write was short and ereports "%m".
2274 : */
2275 1300922 : errno = ENOSPC;
2276 :
2277 : /*
2278 : * Maintain fileSize and temporary_files_size if it's a temp file.
2279 : */
2280 1300922 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2281 : {
2282 103310 : off_t past_write = offset + returnCode;
2283 :
2284 103310 : if (past_write > vfdP->fileSize)
2285 : {
2286 71156 : temporary_files_size += past_write - vfdP->fileSize;
2287 71156 : vfdP->fileSize = past_write;
2288 : }
2289 : }
2290 : }
2291 : else
2292 : {
2293 : /*
2294 : * See comments in FileReadV()
2295 : */
2296 : #ifdef WIN32
2297 : DWORD error = GetLastError();
2298 :
2299 : switch (error)
2300 : {
2301 : case ERROR_NO_SYSTEM_RESOURCES:
2302 : pg_usleep(1000L);
2303 : errno = EINTR;
2304 : break;
2305 : default:
2306 : _dosmaperr(error);
2307 : break;
2308 : }
2309 : #endif
2310 : /* OK to retry if interrupted */
2311 0 : if (errno == EINTR)
2312 0 : goto retry;
2313 : }
2314 :
2315 1300922 : return returnCode;
2316 : }
2317 :
2318 : int
2319 4050 : FileSync(File file, uint32 wait_event_info)
2320 : {
2321 : int returnCode;
2322 :
2323 : Assert(FileIsValid(file));
2324 :
2325 : DO_DB(elog(LOG, "FileSync: %d (%s)",
2326 : file, VfdCache[file].fileName));
2327 :
2328 4050 : returnCode = FileAccess(file);
2329 4050 : if (returnCode < 0)
2330 0 : return returnCode;
2331 :
2332 4050 : pgstat_report_wait_start(wait_event_info);
2333 4050 : returnCode = pg_fsync(VfdCache[file].fd);
2334 4050 : pgstat_report_wait_end();
2335 :
2336 4050 : return returnCode;
2337 : }
2338 :
2339 : /*
2340 : * Zero a region of the file.
2341 : *
2342 : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2343 : * appropriate error.
2344 : */
2345 : int
2346 390330 : FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
2347 : {
2348 : int returnCode;
2349 : ssize_t written;
2350 :
2351 : Assert(FileIsValid(file));
2352 :
2353 : DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2354 : file, VfdCache[file].fileName,
2355 : (int64) offset, (int64) amount));
2356 :
2357 390330 : returnCode = FileAccess(file);
2358 390330 : if (returnCode < 0)
2359 0 : return returnCode;
2360 :
2361 390330 : pgstat_report_wait_start(wait_event_info);
2362 390330 : written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2363 390330 : pgstat_report_wait_end();
2364 :
2365 390330 : if (written < 0)
2366 0 : return -1;
2367 390330 : else if (written != amount)
2368 : {
2369 : /* if errno is unset, assume problem is no disk space */
2370 0 : if (errno == 0)
2371 0 : errno = ENOSPC;
2372 0 : return -1;
2373 : }
2374 :
2375 390330 : return 0;
2376 : }
2377 :
2378 : /*
2379 : * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2380 : * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2381 : * use FileZero() instead.
2382 : *
2383 : * Note that at least glibc() implements posix_fallocate() in userspace if not
2384 : * implemented by the filesystem. That's not the case for all environments
2385 : * though.
2386 : *
2387 : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2388 : * appropriate error.
2389 : */
2390 : int
2391 994 : FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
2392 : {
2393 : #ifdef HAVE_POSIX_FALLOCATE
2394 : int returnCode;
2395 :
2396 : Assert(FileIsValid(file));
2397 :
2398 : DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2399 : file, VfdCache[file].fileName,
2400 : (int64) offset, (int64) amount));
2401 :
2402 994 : returnCode = FileAccess(file);
2403 994 : if (returnCode < 0)
2404 0 : return -1;
2405 :
2406 994 : retry:
2407 994 : pgstat_report_wait_start(wait_event_info);
2408 994 : returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2409 994 : pgstat_report_wait_end();
2410 :
2411 994 : if (returnCode == 0)
2412 994 : return 0;
2413 0 : else if (returnCode == EINTR)
2414 0 : goto retry;
2415 :
2416 : /* for compatibility with %m printing etc */
2417 0 : errno = returnCode;
2418 :
2419 : /*
2420 : * Return in cases of a "real" failure, if fallocate is not supported,
2421 : * fall through to the FileZero() backed implementation.
2422 : */
2423 0 : if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
2424 0 : return -1;
2425 : #endif
2426 :
2427 0 : return FileZero(file, offset, amount, wait_event_info);
2428 : }
2429 :
2430 : off_t
2431 3953130 : FileSize(File file)
2432 : {
2433 : Assert(FileIsValid(file));
2434 :
2435 : DO_DB(elog(LOG, "FileSize %d (%s)",
2436 : file, VfdCache[file].fileName));
2437 :
2438 3953130 : if (FileIsNotOpen(file))
2439 : {
2440 26 : if (FileAccess(file) < 0)
2441 0 : return (off_t) -1;
2442 : }
2443 :
2444 3953130 : return lseek(VfdCache[file].fd, 0, SEEK_END);
2445 : }
2446 :
2447 : int
2448 958 : FileTruncate(File file, off_t offset, uint32 wait_event_info)
2449 : {
2450 : int returnCode;
2451 :
2452 : Assert(FileIsValid(file));
2453 :
2454 : DO_DB(elog(LOG, "FileTruncate %d (%s)",
2455 : file, VfdCache[file].fileName));
2456 :
2457 958 : returnCode = FileAccess(file);
2458 958 : if (returnCode < 0)
2459 0 : return returnCode;
2460 :
2461 958 : pgstat_report_wait_start(wait_event_info);
2462 958 : returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2463 958 : pgstat_report_wait_end();
2464 :
2465 958 : if (returnCode == 0 && VfdCache[file].fileSize > offset)
2466 : {
2467 : /* adjust our state for truncation of a temp file */
2468 : Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2469 0 : temporary_files_size -= VfdCache[file].fileSize - offset;
2470 0 : VfdCache[file].fileSize = offset;
2471 : }
2472 :
2473 958 : return returnCode;
2474 : }
2475 :
2476 : /*
2477 : * Return the pathname associated with an open file.
2478 : *
2479 : * The returned string points to an internal buffer, which is valid until
2480 : * the file is closed.
2481 : */
2482 : char *
2483 42 : FilePathName(File file)
2484 : {
2485 : Assert(FileIsValid(file));
2486 :
2487 42 : return VfdCache[file].fileName;
2488 : }
2489 :
2490 : /*
2491 : * Return the raw file descriptor of an opened file.
2492 : *
2493 : * The returned file descriptor will be valid until the file is closed, but
2494 : * there are a lot of things that can make that happen. So the caller should
2495 : * be careful not to do much of anything else before it finishes using the
2496 : * returned file descriptor.
2497 : */
2498 : int
2499 0 : FileGetRawDesc(File file)
2500 : {
2501 : Assert(FileIsValid(file));
2502 0 : return VfdCache[file].fd;
2503 : }
2504 :
2505 : /*
2506 : * FileGetRawFlags - returns the file flags on open(2)
2507 : */
2508 : int
2509 0 : FileGetRawFlags(File file)
2510 : {
2511 : Assert(FileIsValid(file));
2512 0 : return VfdCache[file].fileFlags;
2513 : }
2514 :
2515 : /*
2516 : * FileGetRawMode - returns the mode bitmask passed to open(2)
2517 : */
2518 : mode_t
2519 0 : FileGetRawMode(File file)
2520 : {
2521 : Assert(FileIsValid(file));
2522 0 : return VfdCache[file].fileMode;
2523 : }
2524 :
2525 : /*
2526 : * Make room for another allocatedDescs[] array entry if needed and possible.
2527 : * Returns true if an array element is available.
2528 : */
2529 : static bool
2530 15589718 : reserveAllocatedDesc(void)
2531 : {
2532 : AllocateDesc *newDescs;
2533 : int newMax;
2534 :
2535 : /* Quick out if array already has a free slot. */
2536 15589718 : if (numAllocatedDescs < maxAllocatedDescs)
2537 15587784 : return true;
2538 :
2539 : /*
2540 : * If the array hasn't yet been created in the current process, initialize
2541 : * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2542 : * we will ever need, anyway. We don't want to look at max_safe_fds
2543 : * immediately because set_max_safe_fds() may not have run yet.
2544 : */
2545 1934 : if (allocatedDescs == NULL)
2546 : {
2547 1934 : newMax = FD_MINFREE / 3;
2548 1934 : newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2549 : /* Out of memory already? Treat as fatal error. */
2550 1934 : if (newDescs == NULL)
2551 0 : ereport(ERROR,
2552 : (errcode(ERRCODE_OUT_OF_MEMORY),
2553 : errmsg("out of memory")));
2554 1934 : allocatedDescs = newDescs;
2555 1934 : maxAllocatedDescs = newMax;
2556 1934 : return true;
2557 : }
2558 :
2559 : /*
2560 : * Consider enlarging the array beyond the initial allocation used above.
2561 : * By the time this happens, max_safe_fds should be known accurately.
2562 : *
2563 : * We mustn't let allocated descriptors hog all the available FDs, and in
2564 : * practice we'd better leave a reasonable number of FDs for VFD use. So
2565 : * set the maximum to max_safe_fds / 3. (This should certainly be at
2566 : * least as large as the initial size, FD_MINFREE / 3, so we aren't
2567 : * tightening the restriction here.) Recall that "external" FDs are
2568 : * allowed to consume another third of max_safe_fds.
2569 : */
2570 0 : newMax = max_safe_fds / 3;
2571 0 : if (newMax > maxAllocatedDescs)
2572 : {
2573 0 : newDescs = (AllocateDesc *) realloc(allocatedDescs,
2574 : newMax * sizeof(AllocateDesc));
2575 : /* Treat out-of-memory as a non-fatal error. */
2576 0 : if (newDescs == NULL)
2577 0 : return false;
2578 0 : allocatedDescs = newDescs;
2579 0 : maxAllocatedDescs = newMax;
2580 0 : return true;
2581 : }
2582 :
2583 : /* Can't enlarge allocatedDescs[] any more. */
2584 0 : return false;
2585 : }
2586 :
2587 : /*
2588 : * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2589 : * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2590 : * necessary to open the file. When done, call FreeFile rather than fclose.
2591 : *
2592 : * Note that files that will be open for any significant length of time
2593 : * should NOT be handled this way, since they cannot share kernel file
2594 : * descriptors with other files; there is grave risk of running out of FDs
2595 : * if anyone locks down too many FDs. Most callers of this routine are
2596 : * simply reading a config file that they will read and close immediately.
2597 : *
2598 : * fd.c will automatically close all files opened with AllocateFile at
2599 : * transaction commit or abort; this prevents FD leakage if a routine
2600 : * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2601 : *
2602 : * Ideally this should be the *only* direct call of fopen() in the backend.
2603 : */
2604 : FILE *
2605 150714 : AllocateFile(const char *name, const char *mode)
2606 : {
2607 : FILE *file;
2608 :
2609 : DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2610 : numAllocatedDescs, name));
2611 :
2612 : /* Can we allocate another non-virtual FD? */
2613 150714 : if (!reserveAllocatedDesc())
2614 0 : ereport(ERROR,
2615 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2616 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2617 : maxAllocatedDescs, name)));
2618 :
2619 : /* Close excess kernel FDs. */
2620 150714 : ReleaseLruFiles();
2621 :
2622 150714 : TryAgain:
2623 150714 : if ((file = fopen(name, mode)) != NULL)
2624 : {
2625 139766 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2626 :
2627 139766 : desc->kind = AllocateDescFile;
2628 139766 : desc->desc.file = file;
2629 139766 : desc->create_subid = GetCurrentSubTransactionId();
2630 139766 : numAllocatedDescs++;
2631 139766 : return desc->desc.file;
2632 : }
2633 :
2634 10948 : if (errno == EMFILE || errno == ENFILE)
2635 : {
2636 0 : int save_errno = errno;
2637 :
2638 0 : ereport(LOG,
2639 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2640 : errmsg("out of file descriptors: %m; release and retry")));
2641 0 : errno = 0;
2642 0 : if (ReleaseLruFile())
2643 0 : goto TryAgain;
2644 0 : errno = save_errno;
2645 : }
2646 :
2647 10948 : return NULL;
2648 : }
2649 :
2650 : /*
2651 : * Open a file with OpenTransientFilePerm() and pass default file mode for
2652 : * the fileMode parameter.
2653 : */
2654 : int
2655 15359170 : OpenTransientFile(const char *fileName, int fileFlags)
2656 : {
2657 15359170 : return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2658 : }
2659 :
2660 : /*
2661 : * Like AllocateFile, but returns an unbuffered fd like open(2)
2662 : */
2663 : int
2664 15359182 : OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2665 : {
2666 : int fd;
2667 :
2668 : DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2669 : numAllocatedDescs, fileName));
2670 :
2671 : /* Can we allocate another non-virtual FD? */
2672 15359182 : if (!reserveAllocatedDesc())
2673 0 : ereport(ERROR,
2674 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2675 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2676 : maxAllocatedDescs, fileName)));
2677 :
2678 : /* Close excess kernel FDs. */
2679 15359182 : ReleaseLruFiles();
2680 :
2681 15359182 : fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2682 :
2683 15359182 : if (fd >= 0)
2684 : {
2685 15351472 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2686 :
2687 15351472 : desc->kind = AllocateDescRawFD;
2688 15351472 : desc->desc.fd = fd;
2689 15351472 : desc->create_subid = GetCurrentSubTransactionId();
2690 15351472 : numAllocatedDescs++;
2691 :
2692 15351472 : return fd;
2693 : }
2694 :
2695 7710 : return -1; /* failure */
2696 : }
2697 :
2698 : /*
2699 : * Routines that want to initiate a pipe stream should use OpenPipeStream
2700 : * rather than plain popen(). This lets fd.c deal with freeing FDs if
2701 : * necessary. When done, call ClosePipeStream rather than pclose.
2702 : *
2703 : * This function also ensures that the popen'd program is run with default
2704 : * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2705 : * uses. This ensures desirable response to, eg, closing a read pipe early.
2706 : */
2707 : FILE *
2708 106 : OpenPipeStream(const char *command, const char *mode)
2709 : {
2710 : FILE *file;
2711 : int save_errno;
2712 :
2713 : DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2714 : numAllocatedDescs, command));
2715 :
2716 : /* Can we allocate another non-virtual FD? */
2717 106 : if (!reserveAllocatedDesc())
2718 0 : ereport(ERROR,
2719 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2720 : errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2721 : maxAllocatedDescs, command)));
2722 :
2723 : /* Close excess kernel FDs. */
2724 106 : ReleaseLruFiles();
2725 :
2726 106 : TryAgain:
2727 106 : fflush(NULL);
2728 106 : pqsignal(SIGPIPE, SIG_DFL);
2729 106 : errno = 0;
2730 106 : file = popen(command, mode);
2731 106 : save_errno = errno;
2732 106 : pqsignal(SIGPIPE, SIG_IGN);
2733 106 : errno = save_errno;
2734 106 : if (file != NULL)
2735 : {
2736 106 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2737 :
2738 106 : desc->kind = AllocateDescPipe;
2739 106 : desc->desc.file = file;
2740 106 : desc->create_subid = GetCurrentSubTransactionId();
2741 106 : numAllocatedDescs++;
2742 106 : return desc->desc.file;
2743 : }
2744 :
2745 0 : if (errno == EMFILE || errno == ENFILE)
2746 : {
2747 0 : ereport(LOG,
2748 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2749 : errmsg("out of file descriptors: %m; release and retry")));
2750 0 : if (ReleaseLruFile())
2751 0 : goto TryAgain;
2752 0 : errno = save_errno;
2753 : }
2754 :
2755 0 : return NULL;
2756 : }
2757 :
2758 : /*
2759 : * Free an AllocateDesc of any type.
2760 : *
2761 : * The argument *must* point into the allocatedDescs[] array.
2762 : */
2763 : static int
2764 15569494 : FreeDesc(AllocateDesc *desc)
2765 : {
2766 : int result;
2767 :
2768 : /* Close the underlying object */
2769 15569494 : switch (desc->kind)
2770 : {
2771 139766 : case AllocateDescFile:
2772 139766 : result = fclose(desc->desc.file);
2773 139766 : break;
2774 106 : case AllocateDescPipe:
2775 106 : result = pclose(desc->desc.file);
2776 106 : break;
2777 78150 : case AllocateDescDir:
2778 78150 : result = closedir(desc->desc.dir);
2779 78150 : break;
2780 15351472 : case AllocateDescRawFD:
2781 15351472 : result = close(desc->desc.fd);
2782 15351472 : break;
2783 0 : default:
2784 0 : elog(ERROR, "AllocateDesc kind not recognized");
2785 : result = 0; /* keep compiler quiet */
2786 : break;
2787 : }
2788 :
2789 : /* Compact storage in the allocatedDescs array */
2790 15569494 : numAllocatedDescs--;
2791 15569494 : *desc = allocatedDescs[numAllocatedDescs];
2792 :
2793 15569494 : return result;
2794 : }
2795 :
2796 : /*
2797 : * Close a file returned by AllocateFile.
2798 : *
2799 : * Note we do not check fclose's return value --- it is up to the caller
2800 : * to handle close errors.
2801 : */
2802 : int
2803 139734 : FreeFile(FILE *file)
2804 : {
2805 : int i;
2806 :
2807 : DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2808 :
2809 : /* Remove file from list of allocated files, if it's present */
2810 139736 : for (i = numAllocatedDescs; --i >= 0;)
2811 : {
2812 139736 : AllocateDesc *desc = &allocatedDescs[i];
2813 :
2814 139736 : if (desc->kind == AllocateDescFile && desc->desc.file == file)
2815 139734 : return FreeDesc(desc);
2816 : }
2817 :
2818 : /* Only get here if someone passes us a file not in allocatedDescs */
2819 0 : elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2820 :
2821 0 : return fclose(file);
2822 : }
2823 :
2824 : /*
2825 : * Close a file returned by OpenTransientFile.
2826 : *
2827 : * Note we do not check close's return value --- it is up to the caller
2828 : * to handle close errors.
2829 : */
2830 : int
2831 15351470 : CloseTransientFile(int fd)
2832 : {
2833 : int i;
2834 :
2835 : DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2836 :
2837 : /* Remove fd from list of allocated files, if it's present */
2838 15351490 : for (i = numAllocatedDescs; --i >= 0;)
2839 : {
2840 15351490 : AllocateDesc *desc = &allocatedDescs[i];
2841 :
2842 15351490 : if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2843 15351470 : return FreeDesc(desc);
2844 : }
2845 :
2846 : /* Only get here if someone passes us a file not in allocatedDescs */
2847 0 : elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2848 :
2849 0 : return close(fd);
2850 : }
2851 :
2852 : /*
2853 : * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2854 : * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2855 : * necessary to open the directory, and with closing it after an elog.
2856 : * When done, call FreeDir rather than closedir.
2857 : *
2858 : * Returns NULL, with errno set, on failure. Note that failure detection
2859 : * is commonly left to the following call of ReadDir or ReadDirExtended;
2860 : * see the comments for ReadDir.
2861 : *
2862 : * Ideally this should be the *only* direct call of opendir() in the backend.
2863 : */
2864 : DIR *
2865 79716 : AllocateDir(const char *dirname)
2866 : {
2867 : DIR *dir;
2868 :
2869 : DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2870 : numAllocatedDescs, dirname));
2871 :
2872 : /* Can we allocate another non-virtual FD? */
2873 79716 : if (!reserveAllocatedDesc())
2874 0 : ereport(ERROR,
2875 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2876 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2877 : maxAllocatedDescs, dirname)));
2878 :
2879 : /* Close excess kernel FDs. */
2880 79716 : ReleaseLruFiles();
2881 :
2882 79716 : TryAgain:
2883 79716 : if ((dir = opendir(dirname)) != NULL)
2884 : {
2885 78150 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2886 :
2887 78150 : desc->kind = AllocateDescDir;
2888 78150 : desc->desc.dir = dir;
2889 78150 : desc->create_subid = GetCurrentSubTransactionId();
2890 78150 : numAllocatedDescs++;
2891 78150 : return desc->desc.dir;
2892 : }
2893 :
2894 1566 : if (errno == EMFILE || errno == ENFILE)
2895 : {
2896 0 : int save_errno = errno;
2897 :
2898 0 : ereport(LOG,
2899 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2900 : errmsg("out of file descriptors: %m; release and retry")));
2901 0 : errno = 0;
2902 0 : if (ReleaseLruFile())
2903 0 : goto TryAgain;
2904 0 : errno = save_errno;
2905 : }
2906 :
2907 1566 : return NULL;
2908 : }
2909 :
2910 : /*
2911 : * Read a directory opened with AllocateDir, ereport'ing any error.
2912 : *
2913 : * This is easier to use than raw readdir() since it takes care of some
2914 : * otherwise rather tedious and error-prone manipulation of errno. Also,
2915 : * if you are happy with a generic error message for AllocateDir failure,
2916 : * you can just do
2917 : *
2918 : * dir = AllocateDir(path);
2919 : * while ((dirent = ReadDir(dir, path)) != NULL)
2920 : * process dirent;
2921 : * FreeDir(dir);
2922 : *
2923 : * since a NULL dir parameter is taken as indicating AllocateDir failed.
2924 : * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2925 : * use this shortcut.)
2926 : *
2927 : * The pathname passed to AllocateDir must be passed to this routine too,
2928 : * but it is only used for error reporting.
2929 : */
2930 : struct dirent *
2931 3211636 : ReadDir(DIR *dir, const char *dirname)
2932 : {
2933 3211636 : return ReadDirExtended(dir, dirname, ERROR);
2934 : }
2935 :
2936 : /*
2937 : * Alternate version of ReadDir that allows caller to specify the elevel
2938 : * for any error report (whether it's reporting an initial failure of
2939 : * AllocateDir or a subsequent directory read failure).
2940 : *
2941 : * If elevel < ERROR, returns NULL after any error. With the normal coding
2942 : * pattern, this will result in falling out of the loop immediately as
2943 : * though the directory contained no (more) entries.
2944 : */
struct dirent *
ReadDirExtended(DIR *dir, const char *dirname, int elevel)
{
    struct dirent *entry;

    /*
     * A NULL dir is taken to mean that AllocateDir failed.  Report it here
     * with a generic message; %m picks up whatever errno currently holds,
     * which is why callers must leave errno alone after AllocateDir.
     */
    if (dir == NULL)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not open directory \"%s\": %m",
                        dirname)));
        return NULL;
    }

    /*
     * readdir() returns NULL both at end of directory and on failure, so
     * clear errno first and use it afterwards to tell the two apart.
     */
    errno = 0;
    entry = readdir(dir);

    if (entry == NULL && errno != 0)
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not read directory \"%s\": %m",
                        dirname)));

    /* Either a valid entry, or NULL for end-of-directory / reported error */
    return entry;
}
2971 :
2972 : /*
2973 : * Close a directory opened with AllocateDir.
2974 : *
2975 : * Returns closedir's return value (with errno set if it's not 0).
2976 : * Note we do not check the return value --- it is up to the caller
2977 : * to handle close errors if wanted.
2978 : *
2979 : * Does nothing if dir == NULL; we assume that directory open failure was
2980 : * already reported if desired.
2981 : */
2982 : int
2983 77910 : FreeDir(DIR *dir)
2984 : {
2985 : int i;
2986 :
2987 : /* Nothing to do if AllocateDir failed */
2988 77910 : if (dir == NULL)
2989 0 : return 0;
2990 :
2991 : DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2992 :
2993 : /* Remove dir from list of allocated dirs, if it's present */
2994 77910 : for (i = numAllocatedDescs; --i >= 0;)
2995 : {
2996 77910 : AllocateDesc *desc = &allocatedDescs[i];
2997 :
2998 77910 : if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2999 77910 : return FreeDesc(desc);
3000 : }
3001 :
3002 : /* Only get here if someone passes us a dir not in allocatedDescs */
3003 0 : elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3004 :
3005 0 : return closedir(dir);
3006 : }
3007 :
3008 :
3009 : /*
3010 : * Close a pipe stream returned by OpenPipeStream.
3011 : */
3012 : int
3013 106 : ClosePipeStream(FILE *file)
3014 : {
3015 : int i;
3016 :
3017 : DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3018 :
3019 : /* Remove file from list of allocated files, if it's present */
3020 106 : for (i = numAllocatedDescs; --i >= 0;)
3021 : {
3022 106 : AllocateDesc *desc = &allocatedDescs[i];
3023 :
3024 106 : if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3025 106 : return FreeDesc(desc);
3026 : }
3027 :
3028 : /* Only get here if someone passes us a file not in allocatedDescs */
3029 0 : elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3030 :
3031 0 : return pclose(file);
3032 : }
3033 :
3034 : /*
3035 : * closeAllVfds
3036 : *
3037 : * Force all VFDs into the physically-closed state, so that the fewest
3038 : * possible number of kernel file descriptors are in use. There is no
3039 : * change in the logical state of the VFDs.
3040 : */
3041 : void
3042 58 : closeAllVfds(void)
3043 : {
3044 : Index i;
3045 :
3046 58 : if (SizeVfdCache > 0)
3047 : {
3048 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3049 1856 : for (i = 1; i < SizeVfdCache; i++)
3050 : {
3051 1798 : if (!FileIsNotOpen(i))
3052 258 : LruDelete(i);
3053 : }
3054 : }
3055 58 : }
3056 :
3057 :
3058 : /*
3059 : * SetTempTablespaces
3060 : *
3061 : * Define a list (actually an array) of OIDs of tablespaces to use for
3062 : * temporary files. This list will be used until end of transaction,
3063 : * unless this function is called again before then. It is caller's
3064 : * responsibility that the passed-in array has adequate lifespan (typically
3065 : * it'd be allocated in TopTransactionContext).
3066 : *
3067 : * Some entries of the array may be InvalidOid, indicating that the current
3068 : * database's default tablespace should be used.
3069 : */
3070 : void
3071 5750 : SetTempTablespaces(Oid *tableSpaces, int numSpaces)
3072 : {
3073 : Assert(numSpaces >= 0);
3074 5750 : tempTableSpaces = tableSpaces;
3075 5750 : numTempTableSpaces = numSpaces;
3076 :
3077 : /*
3078 : * Select a random starting point in the list. This is to minimize
3079 : * conflicts between backends that are most likely sharing the same list
3080 : * of temp tablespaces. Note that if we create multiple temp files in the
3081 : * same transaction, we'll advance circularly through the list --- this
3082 : * ensures that large temporary sort files are nicely spread across all
3083 : * available tablespaces.
3084 : */
3085 5750 : if (numSpaces > 1)
3086 0 : nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state,
3087 0 : 0, numSpaces - 1);
3088 : else
3089 5750 : nextTempTableSpace = 0;
3090 5750 : }
3091 :
3092 : /*
3093 : * TempTablespacesAreSet
3094 : *
3095 : * Returns true if SetTempTablespaces has been called in current transaction.
3096 : * (This is just so that tablespaces.c doesn't need its own per-transaction
3097 : * state.)
3098 : */
3099 : bool
3100 8450 : TempTablespacesAreSet(void)
3101 : {
3102 8450 : return (numTempTableSpaces >= 0);
3103 : }
3104 :
3105 : /*
3106 : * GetTempTablespaces
3107 : *
3108 : * Populate an array with the OIDs of the tablespaces that should be used for
3109 : * temporary files. (Some entries may be InvalidOid, indicating that the
3110 : * current database's default tablespace should be used.) At most numSpaces
3111 : * entries will be filled.
3112 : * Returns the number of OIDs that were copied into the output array.
3113 : */
3114 : int
3115 370 : GetTempTablespaces(Oid *tableSpaces, int numSpaces)
3116 : {
3117 : int i;
3118 :
3119 : Assert(TempTablespacesAreSet());
3120 370 : for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3121 0 : tableSpaces[i] = tempTableSpaces[i];
3122 :
3123 370 : return i;
3124 : }
3125 :
3126 : /*
3127 : * GetNextTempTableSpace
3128 : *
3129 : * Select the next temp tablespace to use. A result of InvalidOid means
3130 : * to use the current database's default tablespace.
3131 : */
3132 : Oid
3133 4110 : GetNextTempTableSpace(void)
3134 : {
3135 4110 : if (numTempTableSpaces > 0)
3136 : {
3137 : /* Advance nextTempTableSpace counter with wraparound */
3138 2 : if (++nextTempTableSpace >= numTempTableSpaces)
3139 2 : nextTempTableSpace = 0;
3140 2 : return tempTableSpaces[nextTempTableSpace];
3141 : }
3142 4108 : return InvalidOid;
3143 : }
3144 :
3145 :
3146 : /*
3147 : * AtEOSubXact_Files
3148 : *
3149 : * Take care of subtransaction commit/abort. At abort, we close temp files
3150 : * that the subtransaction may have opened. At commit, we reassign the
3151 : * files that were opened to the parent subtransaction.
3152 : */
3153 : void
3154 20004 : AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
3155 : SubTransactionId parentSubid)
3156 : {
3157 : Index i;
3158 :
3159 20004 : for (i = 0; i < numAllocatedDescs; i++)
3160 : {
3161 0 : if (allocatedDescs[i].create_subid == mySubid)
3162 : {
3163 0 : if (isCommit)
3164 0 : allocatedDescs[i].create_subid = parentSubid;
3165 : else
3166 : {
3167 : /* have to recheck the item after FreeDesc (ugly) */
3168 0 : FreeDesc(&allocatedDescs[i--]);
3169 : }
3170 : }
3171 : }
3172 20004 : }
3173 :
3174 : /*
3175 : * AtEOXact_Files
3176 : *
3177 : * This routine is called during transaction commit or abort. All still-open
3178 : * per-transaction temporary file VFDs are closed, which also causes the
3179 : * underlying files to be deleted (although they should've been closed already
3180 : * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3181 : * closed. We also forget any transaction-local temp tablespace list.
3182 : *
3183 : * The isCommit flag is used only to decide whether to emit warnings about
3184 : * unclosed files.
3185 : */
3186 : void
3187 791110 : AtEOXact_Files(bool isCommit)
3188 : {
3189 791110 : CleanupTempFiles(isCommit, false);
3190 791110 : tempTableSpaces = NULL;
3191 791110 : numTempTableSpaces = -1;
3192 791110 : }
3193 :
3194 : /*
3195 : * BeforeShmemExit_Files
3196 : *
3197 : * before_shmem_exit hook to clean up temp files during backend shutdown.
3198 : * Here, we want to clean up *all* temp files including interXact ones.
3199 : */
3200 : static void
3201 37286 : BeforeShmemExit_Files(int code, Datum arg)
3202 : {
3203 37286 : CleanupTempFiles(false, true);
3204 :
3205 : /* prevent further temp files from being created */
3206 : #ifdef USE_ASSERT_CHECKING
3207 : temporary_files_allowed = false;
3208 : #endif
3209 37286 : }
3210 :
3211 : /*
3212 : * Close temporary files and delete their underlying files.
3213 : *
3214 : * isCommit: if true, this is normal transaction commit, and we don't
3215 : * expect any remaining files; warn if there are some.
3216 : *
3217 : * isProcExit: if true, this is being called as the backend process is
3218 : * exiting. If that's the case, we should remove all temporary files; if
3219 : * that's not the case, we are being called for transaction commit/abort
3220 : * and should only remove transaction-local temp files. In either case,
3221 : * also clean up "allocated" stdio files, dirs and fds.
3222 : */
3223 : static void
3224 828396 : CleanupTempFiles(bool isCommit, bool isProcExit)
3225 : {
3226 : Index i;
3227 :
3228 : /*
3229 : * Careful here: at proc_exit we need extra cleanup, not just
3230 : * xact_temporary files.
3231 : */
3232 828396 : if (isProcExit || have_xact_temporary_files)
3233 : {
3234 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3235 2163856 : for (i = 1; i < SizeVfdCache; i++)
3236 : {
3237 2125118 : unsigned short fdstate = VfdCache[i].fdstate;
3238 :
3239 2125118 : if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3240 10 : VfdCache[i].fileName != NULL)
3241 : {
3242 : /*
3243 : * If we're in the process of exiting a backend process, close
3244 : * all temporary files. Otherwise, only close temporary files
3245 : * local to the current transaction. They should be closed by
3246 : * the ResourceOwner mechanism already, so this is just a
3247 : * debugging cross-check.
3248 : */
3249 10 : if (isProcExit)
3250 10 : FileClose(i);
3251 0 : else if (fdstate & FD_CLOSE_AT_EOXACT)
3252 : {
3253 0 : elog(WARNING,
3254 : "temporary file %s not closed at end-of-transaction",
3255 : VfdCache[i].fileName);
3256 0 : FileClose(i);
3257 : }
3258 : }
3259 : }
3260 :
3261 38738 : have_xact_temporary_files = false;
3262 : }
3263 :
3264 : /* Complain if any allocated files remain open at commit. */
3265 828396 : if (isCommit && numAllocatedDescs > 0)
3266 0 : elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3267 : numAllocatedDescs);
3268 :
3269 : /* Clean up "allocated" stdio files, dirs and fds. */
3270 828670 : while (numAllocatedDescs > 0)
3271 274 : FreeDesc(&allocatedDescs[0]);
3272 828396 : }
3273 :
3274 :
3275 : /*
3276 : * Remove temporary and temporary relation files left over from a prior
3277 : * postmaster session
3278 : *
3279 : * This should be called during postmaster startup. It will forcibly
3280 : * remove any leftover files created by OpenTemporaryFile and any leftover
3281 : * temporary relation files created by mdcreate.
3282 : *
3283 : * During post-backend-crash restart cycle, this routine is called when
3284 : * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3285 : * queries are using temp files could result in useless storage usage that can
3286 : * only be reclaimed by a service restart. The argument against enabling it is
3287 : * that someone might want to examine the temporary files for debugging
3288 : * purposes. This does however mean that OpenTemporaryFile had better allow for
3289 : * collision with an existing temp file name.
3290 : *
3291 : * NOTE: this function and its subroutines generally report syscall failures
3292 : * with ereport(LOG) and keep going. Removing temp files is not so critical
3293 : * that we should fail to start the database when we can't do it.
3294 : */
3295 : void
3296 1538 : RemovePgTempFiles(void)
3297 : {
3298 : char temp_path[MAXPGPATH + sizeof(PG_TBLSPC_DIR) + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3299 : DIR *spc_dir;
3300 : struct dirent *spc_de;
3301 :
3302 : /*
3303 : * First process temp files in pg_default ($PGDATA/base)
3304 : */
3305 1538 : snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3306 1538 : RemovePgTempFilesInDir(temp_path, true, false);
3307 1538 : RemovePgTempRelationFiles("base");
3308 :
3309 : /*
3310 : * Cycle through temp directories for all non-default tablespaces.
3311 : */
3312 1538 : spc_dir = AllocateDir(PG_TBLSPC_DIR);
3313 :
3314 4734 : while ((spc_de = ReadDirExtended(spc_dir, PG_TBLSPC_DIR, LOG)) != NULL)
3315 : {
3316 3196 : if (strcmp(spc_de->d_name, ".") == 0 ||
3317 1658 : strcmp(spc_de->d_name, "..") == 0)
3318 3076 : continue;
3319 :
3320 120 : snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3321 120 : PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY,
3322 : PG_TEMP_FILES_DIR);
3323 120 : RemovePgTempFilesInDir(temp_path, true, false);
3324 :
3325 120 : snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3326 120 : PG_TBLSPC_DIR, spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
3327 120 : RemovePgTempRelationFiles(temp_path);
3328 : }
3329 :
3330 1538 : FreeDir(spc_dir);
3331 :
3332 : /*
3333 : * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3334 : * DataDir as well. However, that is *not* cleaned here because doing so
3335 : * would create a race condition. It's done separately, earlier in
3336 : * postmaster startup.
3337 : */
3338 1538 : }
3339 :
3340 : /*
3341 : * Process one pgsql_tmp directory for RemovePgTempFiles.
3342 : *
3343 : * If missing_ok is true, it's all right for the named directory to not exist.
3344 : * Any other problem results in a LOG message. (missing_ok should be true at
3345 : * the top level, since pgsql_tmp directories are not created until needed.)
3346 : *
3347 : * At the top level, this should be called with unlink_all = false, so that
3348 : * only files matching the temporary name prefix will be unlinked. When
3349 : * recursing it will be called with unlink_all = true to unlink everything
3350 : * under a top-level temporary directory.
3351 : *
3352 : * (These two flags could be replaced by one, but it seems clearer to keep
3353 : * them separate.)
3354 : */
3355 : void
3356 1660 : RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3357 : {
3358 : DIR *temp_dir;
3359 : struct dirent *temp_de;
3360 : char rm_path[MAXPGPATH * 2];
3361 :
3362 1660 : temp_dir = AllocateDir(tmpdirname);
3363 :
3364 1660 : if (temp_dir == NULL && errno == ENOENT && missing_ok)
3365 1534 : return;
3366 :
3367 384 : while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3368 : {
3369 258 : if (strcmp(temp_de->d_name, ".") == 0 ||
3370 132 : strcmp(temp_de->d_name, "..") == 0)
3371 252 : continue;
3372 :
3373 6 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
3374 6 : tmpdirname, temp_de->d_name);
3375 :
3376 6 : if (unlink_all ||
3377 6 : strncmp(temp_de->d_name,
3378 : PG_TEMP_FILE_PREFIX,
3379 : strlen(PG_TEMP_FILE_PREFIX)) == 0)
3380 6 : {
3381 6 : PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
3382 :
3383 6 : if (type == PGFILETYPE_ERROR)
3384 0 : continue;
3385 6 : else if (type == PGFILETYPE_DIR)
3386 : {
3387 : /* recursively remove contents, then directory itself */
3388 2 : RemovePgTempFilesInDir(rm_path, false, true);
3389 :
3390 2 : if (rmdir(rm_path) < 0)
3391 0 : ereport(LOG,
3392 : (errcode_for_file_access(),
3393 : errmsg("could not remove directory \"%s\": %m",
3394 : rm_path)));
3395 : }
3396 : else
3397 : {
3398 4 : if (unlink(rm_path) < 0)
3399 0 : ereport(LOG,
3400 : (errcode_for_file_access(),
3401 : errmsg("could not remove file \"%s\": %m",
3402 : rm_path)));
3403 : }
3404 : }
3405 : else
3406 0 : ereport(LOG,
3407 : (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3408 : rm_path)));
3409 : }
3410 :
3411 126 : FreeDir(temp_dir);
3412 : }
3413 :
3414 : /* Process one tablespace directory, look for per-DB subdirectories */
3415 : static void
3416 1658 : RemovePgTempRelationFiles(const char *tsdirname)
3417 : {
3418 : DIR *ts_dir;
3419 : struct dirent *de;
3420 : char dbspace_path[MAXPGPATH * 2];
3421 :
3422 1658 : ts_dir = AllocateDir(tsdirname);
3423 :
3424 10314 : while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3425 : {
3426 : /*
3427 : * We're only interested in the per-database directories, which have
3428 : * numeric names. Note that this code will also (properly) ignore "."
3429 : * and "..".
3430 : */
3431 8656 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3432 3440 : continue;
3433 :
3434 5216 : snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3435 5216 : tsdirname, de->d_name);
3436 5216 : RemovePgTempRelationFilesInDbspace(dbspace_path);
3437 : }
3438 :
3439 1658 : FreeDir(ts_dir);
3440 1658 : }
3441 :
3442 : /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3443 : static void
3444 5216 : RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3445 : {
3446 : DIR *dbspace_dir;
3447 : struct dirent *de;
3448 : char rm_path[MAXPGPATH * 2];
3449 :
3450 5216 : dbspace_dir = AllocateDir(dbspacedirname);
3451 :
3452 1582738 : while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3453 : {
3454 1577522 : if (!looks_like_temp_rel_name(de->d_name))
3455 1577514 : continue;
3456 :
3457 8 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
3458 8 : dbspacedirname, de->d_name);
3459 :
3460 8 : if (unlink(rm_path) < 0)
3461 0 : ereport(LOG,
3462 : (errcode_for_file_access(),
3463 : errmsg("could not remove file \"%s\": %m",
3464 : rm_path)));
3465 : }
3466 :
3467 5216 : FreeDir(dbspace_dir);
3468 5216 : }
3469 :
/*
 * Does "name" have the form of a temporary relation file name?
 *
 * Accepted shape: t<digits>_<digits>, optionally followed by _<forkname>,
 * optionally followed by .<digits> (a segment number), and nothing more.
 * Both digit runs after "t" must be non-empty.
 */
bool
looks_like_temp_rel_name(const char *name)
{
    const char *p = name;
    const char *run_start;

    /* Leading "t" */
    if (*p != 't')
        return false;
    p++;

    /* First non-empty run of digits, terminated by an underscore */
    run_start = p;
    while (isdigit((unsigned char) *p))
        p++;
    if (p == run_start || *p != '_')
        return false;
    p++;

    /* Second non-empty run of digits */
    run_start = p;
    while (isdigit((unsigned char) *p))
        p++;
    if (p == run_start)
        return false;

    /* Optional "_forkname"; forkname_chars gives the fork name's length */
    if (*p == '_')
    {
        int         forklen = forkname_chars(p + 1, NULL);

        if (forklen <= 0)
            return false;
        p += forklen + 1;
    }

    /* Optional ".segment": a dot followed by at least one digit */
    if (*p == '.')
    {
        run_start = p + 1;
        p = run_start;
        while (isdigit((unsigned char) *p))
            p++;
        if (p == run_start)
            return false;
    }

    /* Anything left over disqualifies the name */
    return *p == '\0';
}
3518 :
#ifdef HAVE_SYNCFS
/*
 * Call syncfs() on the file system containing "path".
 *
 * Failure to open the path or to sync is reported at LOG level only.
 */
static void
do_syncfs(const char *path)
{
    int         fd;

    ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
                             path);

    /* syncfs() works on a descriptor, so open the path first */
    fd = OpenTransientFile(path, O_RDONLY);
    if (fd >= 0)
    {
        if (syncfs(fd) < 0)
            ereport(LOG,
                    (errcode_for_file_access(),
                     errmsg("could not synchronize file system for file \"%s\": %m", path)));

        /* the result of closing is not checked */
        CloseTransientFile(fd);
    }
    else
        ereport(LOG,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", path)));
}
#endif
3543 :
3544 : /*
3545 : * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3546 : * all potential filesystems, depending on recovery_init_sync_method setting.
3547 : *
3548 : * We fsync regular files and directories wherever they are, but we
3549 : * follow symlinks only for pg_wal and immediately under pg_tblspc.
3550 : * Other symlinks are presumed to point at files we're not responsible
3551 : * for fsyncing, and might not have privileges to write at all.
3552 : *
3553 : * Errors are logged but not considered fatal; that's because this is used
3554 : * only during database startup, to deal with the possibility that there are
3555 : * issued-but-unsynced writes pending against the data directory. We want to
3556 : * ensure that such writes reach disk before anything that's done in the new
3557 : * run. However, aborting on error would result in failure to start for
3558 : * harmless cases such as read-only files in the data directory, and that's
3559 : * not good either.
3560 : *
3561 : * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3562 : * rewriting all changes again during recovery.
3563 : *
3564 : * Note we assume we're chdir'd into PGDATA to begin with.
3565 : */
3566 : void
3567 340 : SyncDataDirectory(void)
3568 : {
3569 : bool xlog_is_symlink;
3570 :
3571 : /* We can skip this whole thing if fsync is disabled. */
3572 340 : if (!enableFsync)
3573 340 : return;
3574 :
3575 : /*
3576 : * If pg_wal is a symlink, we'll need to recurse into it separately,
3577 : * because the first walkdir below will ignore it.
3578 : */
3579 0 : xlog_is_symlink = false;
3580 :
3581 : {
3582 : struct stat st;
3583 :
3584 0 : if (lstat("pg_wal", &st) < 0)
3585 0 : ereport(LOG,
3586 : (errcode_for_file_access(),
3587 : errmsg("could not stat file \"%s\": %m",
3588 : "pg_wal")));
3589 0 : else if (S_ISLNK(st.st_mode))
3590 0 : xlog_is_symlink = true;
3591 : }
3592 :
3593 : #ifdef HAVE_SYNCFS
3594 0 : if (recovery_init_sync_method == DATA_DIR_SYNC_METHOD_SYNCFS)
3595 : {
3596 : DIR *dir;
3597 : struct dirent *de;
3598 :
3599 : /*
3600 : * On Linux, we don't have to open every single file one by one. We
3601 : * can use syncfs() to sync whole filesystems. We only expect
3602 : * filesystem boundaries to exist where we tolerate symlinks, namely
3603 : * pg_wal and the tablespaces, so we call syncfs() for each of those
3604 : * directories.
3605 : */
3606 :
3607 : /* Prepare to report progress syncing the data directory via syncfs. */
3608 0 : begin_startup_progress_phase();
3609 :
3610 : /* Sync the top level pgdata directory. */
3611 0 : do_syncfs(".");
3612 : /* If any tablespaces are configured, sync each of those. */
3613 0 : dir = AllocateDir(PG_TBLSPC_DIR);
3614 0 : while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3615 : {
3616 : char path[MAXPGPATH];
3617 :
3618 0 : if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3619 0 : continue;
3620 :
3621 0 : snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3622 0 : do_syncfs(path);
3623 : }
3624 0 : FreeDir(dir);
3625 : /* If pg_wal is a symlink, process that too. */
3626 0 : if (xlog_is_symlink)
3627 0 : do_syncfs("pg_wal");
3628 0 : return;
3629 : }
3630 : #endif /* !HAVE_SYNCFS */
3631 :
3632 : #ifdef PG_FLUSH_DATA_WORKS
3633 : /* Prepare to report progress of the pre-fsync phase. */
3634 0 : begin_startup_progress_phase();
3635 :
3636 : /*
3637 : * If possible, hint to the kernel that we're soon going to fsync the data
3638 : * directory and its contents. Errors in this step are even less
3639 : * interesting than normal, so log them only at DEBUG1.
3640 : */
3641 0 : walkdir(".", pre_sync_fname, false, DEBUG1);
3642 0 : if (xlog_is_symlink)
3643 0 : walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3644 0 : walkdir(PG_TBLSPC_DIR, pre_sync_fname, true, DEBUG1);
3645 : #endif
3646 :
3647 : /* Prepare to report progress syncing the data directory via fsync. */
3648 0 : begin_startup_progress_phase();
3649 :
3650 : /*
3651 : * Now we do the fsync()s in the same order.
3652 : *
3653 : * The main call ignores symlinks, so in addition to specially processing
3654 : * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3655 : * process_symlinks = true. Note that if there are any plain directories
3656 : * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3657 : * so we don't worry about optimizing it.
3658 : */
3659 0 : walkdir(".", datadir_fsync_fname, false, LOG);
3660 0 : if (xlog_is_symlink)
3661 0 : walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3662 0 : walkdir(PG_TBLSPC_DIR, datadir_fsync_fname, true, LOG);
3663 : }
3664 :
3665 : /*
3666 : * walkdir: recursively walk a directory, applying the action to each
3667 : * regular file and directory (including the named directory itself).
3668 : *
3669 : * If process_symlinks is true, the action and recursion are also applied
3670 : * to regular files and directories that are pointed to by symlinks in the
3671 : * given directory; otherwise symlinks are ignored. Symlinks are always
3672 : * ignored in subdirectories, ie we intentionally don't pass down the
3673 : * process_symlinks flag to recursive calls.
3674 : *
3675 : * Errors are reported at level elevel, which might be ERROR or less.
3676 : *
3677 : * See also walkdir in file_utils.c, which is a frontend version of this
3678 : * logic.
3679 : */
3680 : static void
3681 338 : walkdir(const char *path,
3682 : void (*action) (const char *fname, bool isdir, int elevel),
3683 : bool process_symlinks,
3684 : int elevel)
3685 : {
3686 : DIR *dir;
3687 : struct dirent *de;
3688 :
3689 338 : dir = AllocateDir(path);
3690 :
3691 3302 : while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3692 : {
3693 : char subpath[MAXPGPATH * 2];
3694 :
3695 2964 : CHECK_FOR_INTERRUPTS();
3696 :
3697 2964 : if (strcmp(de->d_name, ".") == 0 ||
3698 2626 : strcmp(de->d_name, "..") == 0)
3699 676 : continue;
3700 :
3701 2288 : snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3702 :
3703 2288 : switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3704 : {
3705 2288 : case PGFILETYPE_REG:
3706 2288 : (*action) (subpath, false, elevel);
3707 2288 : break;
3708 0 : case PGFILETYPE_DIR:
3709 0 : walkdir(subpath, action, false, elevel);
3710 0 : break;
3711 0 : default:
3712 :
3713 : /*
3714 : * Errors are already reported directly by get_dirent_type(),
3715 : * and any remaining symlinks and unknown file types are
3716 : * ignored.
3717 : */
3718 0 : break;
3719 : }
3720 : }
3721 :
3722 338 : FreeDir(dir); /* we ignore any error here */
3723 :
3724 : /*
3725 : * It's important to fsync the destination directory itself as individual
3726 : * file fsyncs don't guarantee that the directory entry for the file is
3727 : * synced. However, skip this if AllocateDir failed; the action function
3728 : * might not be robust against that.
3729 : */
3730 338 : if (dir)
3731 338 : (*action) (path, true, elevel);
3732 338 : }
3733 :
3734 :
3735 : /*
3736 : * Hint to the OS that it should get ready to fsync() this file.
3737 : *
3738 : * Ignores errors trying to open unreadable files, and logs other errors at a
3739 : * caller-specified level.
3740 : */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
    int         fd;

    /* Flushing a directory would most likely just fail, so skip those */
    if (isdir)
        return;

    ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
                             fname);

    fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

    if (fd < 0)
    {
        /* Unreadable files are skipped silently; other failures are reported */
        if (errno != EACCES)
            ereport(elevel,
                    (errcode_for_file_access(),
                     errmsg("could not open file \"%s\": %m", fname)));
        return;
    }

    /*
     * This is only a hint to the OS, so it is fine that pg_flush_data()
     * ignores errors.
     */
    pg_flush_data(fd, 0, 0);

    if (CloseTransientFile(fd) != 0)
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not close file \"%s\": %m", fname)));
}

#endif                          /* PG_FLUSH_DATA_WORKS */
3780 :
/*
 * Report startup progress for fname and then fsync it, reporting failures
 * at the given elevel.  The signature matches the action callback taken by
 * walkdir.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
    ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
                             fname);

    /*
     * Errors about unreadable files should be ignored silently; passing
     * ignore_perm = true asks fsync_fname_ext() to do that.  Its result is
     * not examined here.
     */
    fsync_fname_ext(fname, isdir, true, elevel);
}
3793 :
/*
 * Remove fname, which is a directory if isdir is true and a file otherwise.
 * A directory that no longer exists (ENOENT) is not treated as an error;
 * other rmdir failures are reported at the given elevel.  The signature
 * matches the action callback taken by walkdir.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
    if (!isdir)
    {
        /* Use PathNameDeleteTemporaryFile to report filesize */
        PathNameDeleteTemporaryFile(fname, false);
        return;
    }

    if (rmdir(fname) != 0 && errno != ENOENT)
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not remove directory \"%s\": %m", fname)));
}
3810 :
3811 : /*
3812 : * fsync_fname_ext -- Try to fsync a file or directory
3813 : *
3814 : * If ignore_perm is true, ignore errors upon trying to open unreadable
3815 : * files. Logs other errors at a caller-specified level.
3816 : *
3817 : * Returns 0 if the operation succeeded, -1 otherwise.
3818 : */
3819 : int
3820 66326 : fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3821 : {
3822 : int fd;
3823 : int flags;
3824 : int returncode;
3825 :
3826 : /*
3827 : * Some OSs require directories to be opened read-only whereas other
3828 : * systems don't allow us to fsync files opened read-only; so we need both
3829 : * cases here. Using O_RDWR will cause us to fail to fsync files that are
3830 : * not writable by our userid, but we assume that's OK.
3831 : */
3832 66326 : flags = PG_BINARY;
3833 66326 : if (!isdir)
3834 24426 : flags |= O_RDWR;
3835 : else
3836 41900 : flags |= O_RDONLY;
3837 :
3838 66326 : fd = OpenTransientFile(fname, flags);
3839 :
3840 : /*
3841 : * Some OSs don't allow us to open directories at all (Windows returns
3842 : * EACCES), just ignore the error in that case. If desired also silently
3843 : * ignoring errors about unreadable files. Log others.
3844 : */
3845 66326 : if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3846 0 : return 0;
3847 66326 : else if (fd < 0 && ignore_perm && errno == EACCES)
3848 0 : return 0;
3849 66326 : else if (fd < 0)
3850 : {
3851 0 : ereport(elevel,
3852 : (errcode_for_file_access(),
3853 : errmsg("could not open file \"%s\": %m", fname)));
3854 0 : return -1;
3855 : }
3856 :
3857 66326 : returncode = pg_fsync(fd);
3858 :
3859 : /*
3860 : * Some OSes don't allow us to fsync directories at all, so we can ignore
3861 : * those errors. Anything else needs to be logged.
3862 : */
3863 66326 : if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3864 : {
3865 : int save_errno;
3866 :
3867 : /* close file upon error, might not be in transaction context */
3868 0 : save_errno = errno;
3869 0 : (void) CloseTransientFile(fd);
3870 0 : errno = save_errno;
3871 :
3872 0 : ereport(elevel,
3873 : (errcode_for_file_access(),
3874 : errmsg("could not fsync file \"%s\": %m", fname)));
3875 0 : return -1;
3876 : }
3877 :
3878 66326 : if (CloseTransientFile(fd) != 0)
3879 : {
3880 0 : ereport(elevel,
3881 : (errcode_for_file_access(),
3882 : errmsg("could not close file \"%s\": %m", fname)));
3883 0 : return -1;
3884 : }
3885 :
3886 66326 : return 0;
3887 : }
3888 :
3889 : /*
3890 : * fsync_parent_path -- fsync the parent path of a file or directory
3891 : *
3892 : * This is aimed at making file operations persistent on disk in case of
3893 : * an OS crash or power failure.
3894 : */
3895 : static int
3896 11658 : fsync_parent_path(const char *fname, int elevel)
3897 : {
3898 : char parentpath[MAXPGPATH];
3899 :
3900 11658 : strlcpy(parentpath, fname, MAXPGPATH);
3901 11658 : get_parent_directory(parentpath);
3902 :
3903 : /*
3904 : * get_parent_directory() returns an empty string if the input argument is
3905 : * just a file name (see comments in path.c), so handle that as being the
3906 : * current directory.
3907 : */
3908 11658 : if (strlen(parentpath) == 0)
3909 370 : strlcpy(parentpath, ".", MAXPGPATH);
3910 :
3911 11658 : if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3912 0 : return -1;
3913 :
3914 11658 : return 0;
3915 : }
3916 :
3917 : /*
3918 : * Create a PostgreSQL data sub-directory
3919 : *
3920 : * The data directory itself, and most of its sub-directories, are created at
3921 : * initdb time, but we do have some occasions when we create directories in
3922 : * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3923 : * make sure that those directories are created consistently. Today, that means
3924 : * making sure that the created directory has the correct permissions, which is
3925 : * what pg_dir_create_mode tracks for us.
3926 : *
3927 : * Note that we also set the umask() based on what we understand the correct
3928 : * permissions to be (see file_perm.c).
3929 : *
3930 : * For permissions other than the default, mkdir() can be used directly, but
3931 : * be sure to consider carefully such cases -- a sub-directory with incorrect
3932 : * permissions in a PostgreSQL data directory could cause backups and other
3933 : * processes to fail.
3934 : */
3935 : int
3936 2702 : MakePGDirectory(const char *directoryName)
3937 : {
3938 2702 : return mkdir(directoryName, pg_dir_create_mode);
3939 : }
3940 :
3941 : /*
3942 : * Return the passed-in error level, or PANIC if data_sync_retry is off.
3943 : *
3944 : * Failure to fsync any data file is cause for immediate panic, unless
3945 : * data_sync_retry is enabled. Data may have been written to the operating
3946 : * system and removed from our buffer pool already, and if we are running on
3947 : * an operating system that forgets dirty data on write-back failure, there
3948 : * may be only one copy of the data remaining: in the WAL. A later attempt to
3949 : * fsync again might falsely report success. Therefore we must not allow any
3950 : * further checkpoints to be attempted. data_sync_retry can in theory be
3951 : * enabled on systems known not to drop dirty buffered data on write-back
3952 : * failure (with the likely outcome that checkpoints will continue to fail
3953 : * until the underlying problem is fixed).
3954 : *
3955 : * Any code that reports a failure from fsync() or related functions should
3956 : * filter the error level with this function.
3957 : */
3958 : int
3959 35108 : data_sync_elevel(int elevel)
3960 : {
3961 35108 : return data_sync_retry ? elevel : PANIC;
3962 : }
3963 :
3964 : bool
3965 1986 : check_debug_io_direct(char **newval, void **extra, GucSource source)
3966 : {
3967 1986 : bool result = true;
3968 : int flags;
3969 :
3970 : #if PG_O_DIRECT == 0
3971 : if (strcmp(*newval, "") != 0)
3972 : {
3973 : GUC_check_errdetail("\"%s\" is not supported on this platform.",
3974 : "debug_io_direct");
3975 : result = false;
3976 : }
3977 : flags = 0;
3978 : #else
3979 : List *elemlist;
3980 : ListCell *l;
3981 : char *rawstring;
3982 :
3983 : /* Need a modifiable copy of string */
3984 1986 : rawstring = pstrdup(*newval);
3985 :
3986 1986 : if (!SplitGUCList(rawstring, ',', &elemlist))
3987 : {
3988 0 : GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
3989 : "debug_io_direct");
3990 0 : pfree(rawstring);
3991 0 : list_free(elemlist);
3992 0 : return false;
3993 : }
3994 :
3995 1986 : flags = 0;
3996 1998 : foreach(l, elemlist)
3997 : {
3998 12 : char *item = (char *) lfirst(l);
3999 :
4000 12 : if (pg_strcasecmp(item, "data") == 0)
4001 4 : flags |= IO_DIRECT_DATA;
4002 8 : else if (pg_strcasecmp(item, "wal") == 0)
4003 4 : flags |= IO_DIRECT_WAL;
4004 4 : else if (pg_strcasecmp(item, "wal_init") == 0)
4005 4 : flags |= IO_DIRECT_WAL_INIT;
4006 : else
4007 : {
4008 0 : GUC_check_errdetail("Invalid option \"%s\".", item);
4009 0 : result = false;
4010 0 : break;
4011 : }
4012 : }
4013 :
4014 : /*
4015 : * It's possible to configure block sizes smaller than our assumed I/O
4016 : * alignment size, which could result in invalid I/O requests.
4017 : */
4018 : #if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4019 : if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4020 : {
4021 : GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4022 : "debug_io_direct", "XLOG_BLCKSZ");
4023 : result = false;
4024 : }
4025 : #endif
4026 : #if BLCKSZ < PG_IO_ALIGN_SIZE
4027 : if (result && (flags & IO_DIRECT_DATA))
4028 : {
4029 : GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4030 : "debug_io_direct", "BLCKSZ");
4031 : result = false;
4032 : }
4033 : #endif
4034 :
4035 1986 : pfree(rawstring);
4036 1986 : list_free(elemlist);
4037 : #endif
4038 :
4039 1986 : if (!result)
4040 0 : return result;
4041 :
4042 : /* Save the flags in *extra, for use by assign_debug_io_direct */
4043 1986 : *extra = guc_malloc(ERROR, sizeof(int));
4044 1986 : *((int *) *extra) = flags;
4045 :
4046 1986 : return result;
4047 : }
4048 :
4049 : void
4050 1986 : assign_debug_io_direct(const char *newval, void *extra)
4051 : {
4052 1986 : int *flags = (int *) extra;
4053 :
4054 1986 : io_direct_flags = *flags;
4055 1986 : }
4056 :
4057 : /* ResourceOwner callbacks */
4058 :
4059 : static void
4060 10 : ResOwnerReleaseFile(Datum res)
4061 : {
4062 10 : File file = (File) DatumGetInt32(res);
4063 : Vfd *vfdP;
4064 :
4065 : Assert(FileIsValid(file));
4066 :
4067 10 : vfdP = &VfdCache[file];
4068 10 : vfdP->resowner = NULL;
4069 :
4070 10 : FileClose(file);
4071 10 : }
4072 :
4073 : static char *
4074 0 : ResOwnerPrintFile(Datum res)
4075 : {
4076 0 : return psprintf("File %d", DatumGetInt32(res));
4077 : }
|