Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * fd.c
4 : * Virtual file descriptor code.
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/backend/storage/file/fd.c
11 : *
12 : * NOTES:
13 : *
14 : * This code manages a cache of 'virtual' file descriptors (VFDs).
15 : * The server opens many file descriptors for a variety of reasons,
16 : * including base tables, scratch files (e.g., sort and hash spool
17 : * files), and random calls to C library routines like system(3); it
18 : * is quite easy to exceed system limits on the number of open files a
19 : * single process can have. (This is around 1024 on many modern
20 : * operating systems, but may be lower on others.)
21 : *
22 : * VFDs are managed as an LRU pool, with actual OS file descriptors
23 : * being opened and closed as needed. Obviously, if a file is
24 : * opened using these interfaces, all subsequent operations must also
25 : * be through these interfaces (the File type is not a real file
26 : * descriptor).
27 : *
28 : * For this scheme to work, most (if not all) routines throughout the
29 : * server should use these interfaces instead of calling the C library
30 : * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 : * may find ourselves short of real file descriptors anyway.
32 : *
33 : * INTERFACE ROUTINES
34 : *
35 : * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 : * A File opened with OpenTemporaryFile is automatically deleted when the
37 : * File is closed, either explicitly or implicitly at end of transaction or
38 : * process exit. PathNameOpenFile is intended for files that are held open
39 : * for a long time, like relation files. It is the caller's responsibility
40 : * to close them, there is no automatic mechanism in fd.c for that.
41 : *
42 : * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 : * temporary files that have names so that they can be shared between
44 : * backends. Such files are automatically closed and count against the
45 : * temporary file limit of the backend that creates them, but unlike anonymous
46 : * files they are not automatically deleted. See sharedfileset.c for a shared
47 : * ownership mechanism that provides automatic cleanup for shared files when
48 : * the last of a group of backends detaches.
49 : *
50 : * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 : * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 : * They behave like the corresponding native functions, except that the handle
53 : * is registered with the current subtransaction, and will be automatically
54 : * closed at abort. These are intended mainly for short operations like
55 : * reading a configuration file; there is a limit on the number of files that
56 : * can be opened using these functions at any one time.
57 : *
58 : * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 : * release file descriptors in use by the virtual file descriptors if
60 : * necessary. There is no automatic cleanup of file descriptors returned by
61 : * BasicOpenFile, it is solely the caller's responsibility to close the file
62 : * descriptor by calling close(2).
63 : *
64 : * If a non-virtual file descriptor needs to be held open for any length of
65 : * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 : * (and eventually ReleaseExternalFD), so that we can take it into account
67 : * while deciding how many VFDs can be open. This applies to FDs obtained
68 : * with BasicOpenFile as well as those obtained without use of any fd.c API.
69 : *
70 : *-------------------------------------------------------------------------
71 : */
72 :
73 : #include "postgres.h"
74 :
75 : #include <dirent.h>
76 : #include <sys/file.h>
77 : #include <sys/param.h>
78 : #include <sys/resource.h> /* for getrlimit */
79 : #include <sys/stat.h>
80 : #include <sys/types.h>
81 : #ifndef WIN32
82 : #include <sys/mman.h>
83 : #endif
84 : #include <limits.h>
85 : #include <unistd.h>
86 : #include <fcntl.h>
87 :
88 : #include "access/xact.h"
89 : #include "access/xlog.h"
90 : #include "catalog/pg_tablespace.h"
91 : #include "common/file_perm.h"
92 : #include "common/file_utils.h"
93 : #include "common/pg_prng.h"
94 : #include "miscadmin.h"
95 : #include "pgstat.h"
96 : #include "portability/mem.h"
97 : #include "postmaster/startup.h"
98 : #include "storage/fd.h"
99 : #include "storage/ipc.h"
100 : #include "utils/guc.h"
101 : #include "utils/guc_hooks.h"
102 : #include "utils/resowner_private.h"
103 : #include "utils/varlena.h"
104 :
105 : /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
106 : #if defined(HAVE_SYNC_FILE_RANGE)
107 : #define PG_FLUSH_DATA_WORKS 1
108 : #elif !defined(WIN32) && defined(MS_ASYNC)
109 : #define PG_FLUSH_DATA_WORKS 1
110 : #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
111 : #define PG_FLUSH_DATA_WORKS 1
112 : #endif
113 :
114 : /*
115 : * We must leave some file descriptors free for system(), the dynamic loader,
116 : * and other code that tries to open files without consulting fd.c. This
117 : * is the number left free. (While we try fairly hard to prevent EMFILE
118 : * errors, there's never any guarantee that we won't get ENFILE due to
119 : * other processes chewing up FDs. So it's a bad idea to try to open files
120 : * without consulting fd.c. Nonetheless we cannot control all code.)
121 : *
122 : * Because this is just a fixed setting, we are effectively assuming that
123 : * no such code will leave FDs open over the long term; otherwise the slop
124 : * is likely to be insufficient. Note in particular that we expect that
125 : * loading a shared library does not result in any permanent increase in
126 : * the number of open files. (This appears to be true on most if not
127 : * all platforms as of Feb 2004.)
128 : */
129 : #define NUM_RESERVED_FDS 10
130 :
131 : /*
132 : * If we have fewer than this many usable FDs after allowing for the reserved
133 : * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
134 : * much less than that. Note that this value ensures numExternalFDs can be
135 : * at least 16; as of this writing, the contrib/postgres_fdw regression tests
136 : * will not pass unless that can grow to at least 14.)
137 : */
138 : #define FD_MINFREE 48
139 :
140 : /*
141 : * A number of platforms allow individual processes to open many more files
142 : * than they can really support when *many* processes do the same thing.
143 : * This GUC parameter lets the DBA limit max_safe_fds to something less than
144 : * what the postmaster's initial probe suggests will work.
145 : */
146 : int max_files_per_process = 1000;
147 :
148 : /*
149 : * Maximum number of file descriptors to open for operations that fd.c knows
150 : * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
151 : * to a conservative value, and remains that way indefinitely in bootstrap or
152 : * standalone-backend cases. In normal postmaster operation, the postmaster
153 : * calls set_max_safe_fds() late in initialization to update the value, and
154 : * that value is then inherited by forked subprocesses.
155 : *
156 : * Note: the value of max_files_per_process is taken into account while
157 : * setting this variable, and so need not be tested separately.
158 : */
159 : int max_safe_fds = FD_MINFREE; /* default if not changed */
160 :
161 : /* Whether it is safe to continue running after fsync() fails. */
162 : bool data_sync_retry = false;
163 :
164 : /* How SyncDataDirectory() should do its job. */
165 : int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
166 :
167 : /* Which kinds of files should be opened with PG_O_DIRECT. */
168 : int io_direct_flags;
169 :
170 : /* Debugging.... */
171 :
172 : #ifdef FDDEBUG
173 : #define DO_DB(A) \
174 : do { \
175 : int _do_db_save_errno = errno; \
176 : A; \
177 : errno = _do_db_save_errno; \
178 : } while (0)
179 : #else
180 : #define DO_DB(A) \
181 : ((void) 0)
182 : #endif
183 :
184 : #define VFD_CLOSED (-1)
185 :
186 : #define FileIsValid(file) \
187 : ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
188 :
189 : #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
190 :
191 : /* these are the assigned bits in fdstate below: */
192 : #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
193 : #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
194 : #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
195 :
196 : typedef struct vfd /* one VfdCache entry; 'File' values index that array */
197 : {
198 : int fd; /* current FD, or VFD_CLOSED if none */
199 : unsigned short fdstate; /* bitflags for VFD's state (FD_* bits above) */
200 : ResourceOwner resowner; /* owner, for automatic cleanup */
201 : File nextFree; /* link to next free VFD, if in freelist */
202 : File lruMoreRecently; /* doubly linked recency-of-use list */
203 : File lruLessRecently; /* other direction of the recency-of-use list */
204 : off_t fileSize; /* current size of file (0 if not temporary) */
205 : char *fileName; /* name of file, or NULL for unused VFD */
206 : /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
207 : int fileFlags; /* open(2) flags for (re)opening the file */
208 : mode_t fileMode; /* mode to pass to open(2) */
209 : } Vfd;
210 :
211 : /*
212 : * Virtual File Descriptor array pointer and size. This grows as
213 : * needed. 'File' values are indexes into this array.
214 : * Note that VfdCache[0] is not a usable VFD, just a list header.
215 : */
216 : static Vfd *VfdCache;
217 : static Size SizeVfdCache = 0;
218 :
219 : /*
220 : * Number of file descriptors known to be in use by VFD entries.
221 : */
222 : static int nfile = 0;
223 :
224 : /*
225 : * Flag to tell whether it's worth scanning VfdCache looking for temp files
226 : * to close
227 : */
228 : static bool have_xact_temporary_files = false;
229 :
230 : /*
231 : * Tracks the total size of all temporary files. Note: when temp_file_limit
232 : * is being enforced, this cannot overflow since the limit cannot be more
233 : * than INT_MAX kilobytes. When not enforcing, it could theoretically
234 : * overflow, but we don't care.
235 : */
236 : static uint64 temporary_files_size = 0;
237 :
238 : /* Temporary file access initialized and not yet shut down? */
239 : #ifdef USE_ASSERT_CHECKING
240 : static bool temporary_files_allowed = false;
241 : #endif
242 :
243 : /*
244 : * List of OS handles opened with AllocateFile, AllocateDir and
245 : * OpenTransientFile.
246 : */
247 : typedef enum /* what sort of OS handle an AllocateDesc holds */
248 : {
249 : AllocateDescFile,
250 : AllocateDescPipe,
251 : AllocateDescDir,
252 : AllocateDescRawFD
253 : } AllocateDescKind;
254 :
255 : typedef struct /* one entry of the allocatedDescs array */
256 : {
257 : AllocateDescKind kind; /* presumably selects the live member of desc -- confirm */
258 : SubTransactionId create_subid; /* NOTE(review): looks like the creating subxact -- confirm */
259 : union
260 : {
261 : FILE *file;
262 : DIR *dir;
263 : int fd;
264 : } desc;
265 : } AllocateDesc;
266 :
267 : static int numAllocatedDescs = 0;
268 : static int maxAllocatedDescs = 0;
269 : static AllocateDesc *allocatedDescs = NULL;
270 :
271 : /*
272 : * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
273 : */
274 : static int numExternalFDs = 0;
275 :
276 : /*
277 : * Number of temporary files opened during the current session;
278 : * this is used in generation of tempfile names.
279 : */
280 : static long tempFileCounter = 0;
281 :
282 : /*
283 : * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
284 : * indicating that the current database's default tablespace should be used.)
285 : * When numTempTableSpaces is -1, this has not been set in the current
286 : * transaction.
287 : */
288 : static Oid *tempTableSpaces = NULL;
289 : static int numTempTableSpaces = -1;
290 : static int nextTempTableSpace = 0;
291 :
292 :
293 : /*--------------------
294 : *
295 : * Private Routines
296 : *
297 : * Delete - delete a file from the Lru ring
298 : * LruDelete - remove a file from the Lru ring and close its FD
299 : * Insert - put a file at the front of the Lru ring
300 : * LruInsert - put a file at the front of the Lru ring and open it
301 : * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
302 : * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
303 : * AllocateVfd - grab a free (or new) file record (from VfdCache)
304 : * FreeVfd - free a file record
305 : *
306 : * The Least Recently Used ring is a doubly linked list that begins and
307 : * ends on element zero. Element zero is special -- it doesn't represent
308 : * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
309 : * anchor that shows us the beginning/end of the ring.
310 : * Only VFD elements that are currently really open (have an FD assigned) are
311 : * in the Lru ring. Elements that are "virtually" open can be recognized
312 : * by having a non-null fileName field.
313 : *
314 : * example:
315 : *
316 : * /--less----\ /---------\
317 : * v \ v \
318 : * #0 --more---> LeastRecentlyUsed --more-\ \
319 : * ^\ | |
320 : * \\less--> MostRecentlyUsedFile <---/ |
321 : * \more---/ \--less--/
322 : *
323 : *--------------------
324 : */
325 : static void Delete(File file);
326 : static void LruDelete(File file);
327 : static void Insert(File file);
328 : static int LruInsert(File file);
329 : static bool ReleaseLruFile(void);
330 : static void ReleaseLruFiles(void);
331 : static File AllocateVfd(void);
332 : static void FreeVfd(File file);
333 :
334 : static int FileAccess(File file);
335 : static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
336 : static bool reserveAllocatedDesc(void);
337 : static int FreeDesc(AllocateDesc *desc);
338 :
339 : static void BeforeShmemExit_Files(int code, Datum arg);
340 : static void CleanupTempFiles(bool isCommit, bool isProcExit);
341 : static void RemovePgTempRelationFiles(const char *tsdirname);
342 : static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
343 :
344 : static void walkdir(const char *path,
345 : void (*action) (const char *fname, bool isdir, int elevel),
346 : bool process_symlinks,
347 : int elevel);
348 : #ifdef PG_FLUSH_DATA_WORKS
349 : static void pre_sync_fname(const char *fname, bool isdir, int elevel);
350 : #endif
351 : static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
352 : static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
353 :
354 : static int fsync_parent_path(const char *fname, int elevel);
355 :
356 :
/*
 * pg_fsync --- flush a file descriptor, with or without writethrough
 *
 * When the platform provides fsync-writethrough and sync_method selects it,
 * the work is handed to pg_fsync_writethrough(); in every other case
 * pg_fsync_no_writethrough() is used.  The chosen routine's result is
 * returned unchanged.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
    struct stat statbuf;

    /*
     * fsync() implementations differ in which access modes they require of
     * the descriptor, and the requirement is not the same for regular files
     * and for directories.  To keep callers portable, assert that anything
     * reaching this point was opened in a mode every supported system
     * accepts: with write access (O_RDWR or O_WRONLY) for a file, and
     * without write access (O_RDONLY) for a directory.
     *
     * A failing fstat() is ignored; the fsync() below will deal with the
     * descriptor.  Checking here, rather than next to the real fsync(),
     * means the check also runs when fsync is disabled.
     */
    if (fstat(fd, &statbuf) == 0)
    {
        int         open_flags = fcntl(fd, F_GETFL);

        /*
         * O_RDONLY is historically 0, so for directories we can only verify
         * that none of the write flags is present.
         */
        if (S_ISDIR(statbuf.st_mode))
            Assert((open_flags & (O_RDWR | O_WRONLY)) == 0);
        else
            Assert((open_flags & (O_RDWR | O_WRONLY)) != 0);
    }
    errno = 0;
#endif

    /* the sync_method test is compiled only where it can matter */
#if defined(HAVE_FSYNC_WRITETHROUGH)
    if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
        return pg_fsync_writethrough(fd);
#endif

    return pg_fsync_no_writethrough(fd);
}
409 :
410 :
411 : /*
412 : * pg_fsync_no_writethrough --- same as fsync except does nothing if
413 : * enableFsync is off
414 : */
415 : int
416 99690 : pg_fsync_no_writethrough(int fd)
417 : {
418 : int rc;
419 :
420 99690 : if (!enableFsync)
421 99690 : return 0;
422 :
423 0 : retry:
424 0 : rc = fsync(fd);
425 :
426 0 : if (rc == -1 && errno == EINTR)
427 0 : goto retry;
428 :
429 0 : return rc;
430 : }
431 :
432 : /*
433 : * pg_fsync_writethrough
434 : */
435 : int
436 0 : pg_fsync_writethrough(int fd)
437 : {
438 0 : if (enableFsync)
439 : {
440 : #if defined(F_FULLFSYNC)
441 : return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
442 : #else
443 0 : errno = ENOSYS;
444 0 : return -1;
445 : #endif
446 : }
447 : else
448 0 : return 0;
449 : }
450 :
451 : /*
452 : * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
453 : */
454 : int
455 0 : pg_fdatasync(int fd)
456 : {
457 : int rc;
458 :
459 0 : if (!enableFsync)
460 0 : return 0;
461 :
462 0 : retry:
463 0 : rc = fdatasync(fd);
464 :
465 0 : if (rc == -1 && errno == EINTR)
466 0 : goto retry;
467 :
468 0 : return rc;
469 : }
470 :
471 : /*
472 : * pg_flush_data --- advise OS that the described dirty data should be flushed
473 : *
474 : * offset of 0 with nbytes 0 means that the entire file should be flushed
475 : */
476 : void
477 156460 : pg_flush_data(int fd, off_t offset, off_t nbytes)
478 : {
479 : /*
480 : * Right now file flushing is primarily used to avoid making later
481 : * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
482 : * if fsyncs are disabled - that's a decision we might want to make
483 : * configurable at some point.
484 : */
485 156460 : if (!enableFsync)
486 156460 : return;
487 :
488 : /*
489 : * We compile all alternatives that are supported on the current platform,
490 : * to find portability problems more easily.
491 : */
492 : #if defined(HAVE_SYNC_FILE_RANGE)
493 : {
494 : int rc;
495 : static bool not_implemented_by_kernel = false;
496 :
497 0 : if (not_implemented_by_kernel)
498 0 : return;
499 :
500 0 : retry:
501 :
502 : /*
503 : * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
504 : * tells the OS that writeback for the specified blocks should be
505 : * started, but that we don't want to wait for completion. Note that
506 : * this call might block if too much dirty data exists in the range.
507 : * This is the preferable method on OSs supporting it, as it works
508 : * reliably when available (contrast to msync()) and doesn't flush out
509 : * clean data (like FADV_DONTNEED).
510 : */
511 0 : rc = sync_file_range(fd, offset, nbytes,
512 : SYNC_FILE_RANGE_WRITE);
513 0 : if (rc != 0)
514 : {
515 : int elevel;
516 :
517 0 : if (rc == EINTR)
518 0 : goto retry;
519 :
520 : /*
521 : * For systems that don't have an implementation of
522 : * sync_file_range() such as Windows WSL, generate only one
523 : * warning and then suppress all further attempts by this process.
524 : */
525 0 : if (errno == ENOSYS)
526 : {
527 0 : elevel = WARNING;
528 0 : not_implemented_by_kernel = true;
529 : }
530 : else
531 0 : elevel = data_sync_elevel(WARNING);
532 :
533 0 : ereport(elevel,
534 : (errcode_for_file_access(),
535 : errmsg("could not flush dirty data: %m")));
536 : }
537 :
538 0 : return;
539 : }
540 : #endif
541 : #if !defined(WIN32) && defined(MS_ASYNC)
542 : {
543 : void *p;
544 : static int pagesize = 0;
545 :
546 : /*
547 : * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
548 : * writeback. On linux it only does so if MS_SYNC is specified, but
549 : * then it does the writeback synchronously. Luckily all common linux
550 : * systems have sync_file_range(). This is preferable over
551 : * FADV_DONTNEED because it doesn't flush out clean data.
552 : *
553 : * We map the file (mmap()), tell the kernel to sync back the contents
554 : * (msync()), and then remove the mapping again (munmap()).
555 : */
556 :
557 : /* mmap() needs actual length if we want to map whole file */
558 : if (offset == 0 && nbytes == 0)
559 : {
560 : nbytes = lseek(fd, 0, SEEK_END);
561 : if (nbytes < 0)
562 : {
563 : ereport(WARNING,
564 : (errcode_for_file_access(),
565 : errmsg("could not determine dirty data size: %m")));
566 : return;
567 : }
568 : }
569 :
570 : /*
571 : * Some platforms reject partial-page mmap() attempts. To deal with
572 : * that, just truncate the request to a page boundary. If any extra
573 : * bytes don't get flushed, well, it's only a hint anyway.
574 : */
575 :
576 : /* fetch pagesize only once */
577 : if (pagesize == 0)
578 : pagesize = sysconf(_SC_PAGESIZE);
579 :
580 : /* align length to pagesize, dropping any fractional page */
581 : if (pagesize > 0)
582 : nbytes = (nbytes / pagesize) * pagesize;
583 :
584 : /* fractional-page request is a no-op */
585 : if (nbytes <= 0)
586 : return;
587 :
588 : /*
589 : * mmap could well fail, particularly on 32-bit platforms where there
590 : * may simply not be enough address space. If so, silently fall
591 : * through to the next implementation.
592 : */
593 : if (nbytes <= (off_t) SSIZE_MAX)
594 : p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
595 : else
596 : p = MAP_FAILED;
597 :
598 : if (p != MAP_FAILED)
599 : {
600 : int rc;
601 :
602 : rc = msync(p, (size_t) nbytes, MS_ASYNC);
603 : if (rc != 0)
604 : {
605 : ereport(data_sync_elevel(WARNING),
606 : (errcode_for_file_access(),
607 : errmsg("could not flush dirty data: %m")));
608 : /* NB: need to fall through to munmap()! */
609 : }
610 :
611 : rc = munmap(p, (size_t) nbytes);
612 : if (rc != 0)
613 : {
614 : /* FATAL error because mapping would remain */
615 : ereport(FATAL,
616 : (errcode_for_file_access(),
617 : errmsg("could not munmap() while flushing data: %m")));
618 : }
619 :
620 : return;
621 : }
622 : }
623 : #endif
624 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
625 : {
626 : int rc;
627 :
628 : /*
629 : * Signal the kernel that the passed in range should not be cached
630 : * anymore. This has the, desired, side effect of writing out dirty
631 : * data, and the, undesired, side effect of likely discarding useful
632 : * clean cached blocks. For the latter reason this is the least
633 : * preferable method.
634 : */
635 :
636 : rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
637 :
638 : if (rc != 0)
639 : {
640 : /* don't error out, this is just a performance optimization */
641 : ereport(WARNING,
642 : (errcode_for_file_access(),
643 : errmsg("could not flush dirty data: %m")));
644 : }
645 :
646 : return;
647 : }
648 : #endif
649 : }
650 :
/*
 * pg_ftruncate --- set the length of an already-open file
 *
 * Calls ftruncate(fd, length), repeating the call for as long as it fails
 * with EINTR, and returns the result of the last call.
 */
static int
pg_ftruncate(int fd, off_t length)
{
    int         result;

    do
    {
        result = ftruncate(fd, length);
    } while (result == -1 && errno == EINTR);

    return result;
}
667 :
/*
 * pg_truncate --- set the length of a file identified by path
 *
 * On WIN32 the file is opened with OpenTransientFile(), truncated through
 * pg_ftruncate() and closed again; errno as left by the truncation is
 * restored after the close.  If the open fails, -1 is returned.
 *
 * Elsewhere truncate() is called directly, repeating the call for as long
 * as it fails with EINTR.
 *
 * Returns the result of the truncation call (or -1 for a failed open).
 */
int
pg_truncate(const char *path, off_t length)
{
    int         result;

#ifdef WIN32
    int         truncate_errno;
    int         fd;

    fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
    if (fd < 0)
        result = -1;
    else
    {
        result = pg_ftruncate(fd, length);

        /* closing must not disturb the errno reported to the caller */
        truncate_errno = errno;
        CloseTransientFile(fd);
        errno = truncate_errno;
    }
#else

    do
    {
        result = truncate(path, length);
    } while (result == -1 && errno == EINTR);
#endif

    return result;
}
700 :
701 : /*
702 : * fsync_fname -- fsync a file or directory, handling errors properly
703 : *
704 : * Try to fsync a file or directory. When doing the latter, ignore errors that
705 : * indicate the OS just doesn't allow/require fsyncing directories.
706 : */
707 : void
708 23692 : fsync_fname(const char *fname, bool isdir)
709 : {
710 23692 : fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
711 23692 : }
712 :
713 : /*
714 : * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
715 : *
716 : * This routine ensures that, after returning, the effect of renaming file
717 : * persists in case of a crash. A crash while this routine is running will
718 : * leave you with either the pre-existing or the moved file in place of the
719 : * new file; no mixed state or truncated files are possible.
720 : *
721 : * It does so by using fsync on the old filename and the possibly existing
722 : * target filename before the rename, and the target file and directory after.
723 : *
724 : * Note that rename() cannot be used across arbitrary directories, as they
725 : * might not be on the same filesystem. Therefore this routine does not
726 : * support renaming across directories.
727 : *
728 : * Log errors with the caller specified severity.
729 : *
730 : * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
731 : * valid upon return.
732 : */
733 : int
734 4494 : durable_rename(const char *oldfile, const char *newfile, int elevel)
735 : {
736 : int fd;
737 :
738 : /*
739 : * First fsync the old and target path (if it exists), to ensure that they
740 : * are properly persistent on disk. Syncing the target file is not
741 : * strictly necessary, but it makes it easier to reason about crashes;
742 : * because it's then guaranteed that either source or target file exists
743 : * after a crash.
744 : */
745 4494 : if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
746 0 : return -1;
747 :
748 4494 : fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
749 4494 : if (fd < 0)
750 : {
751 2816 : if (errno != ENOENT)
752 : {
753 0 : ereport(elevel,
754 : (errcode_for_file_access(),
755 : errmsg("could not open file \"%s\": %m", newfile)));
756 0 : return -1;
757 : }
758 : }
759 : else
760 : {
761 1678 : if (pg_fsync(fd) != 0)
762 : {
763 : int save_errno;
764 :
765 : /* close file upon error, might not be in transaction context */
766 0 : save_errno = errno;
767 0 : CloseTransientFile(fd);
768 0 : errno = save_errno;
769 :
770 0 : ereport(elevel,
771 : (errcode_for_file_access(),
772 : errmsg("could not fsync file \"%s\": %m", newfile)));
773 0 : return -1;
774 : }
775 :
776 1678 : if (CloseTransientFile(fd) != 0)
777 : {
778 0 : ereport(elevel,
779 : (errcode_for_file_access(),
780 : errmsg("could not close file \"%s\": %m", newfile)));
781 0 : return -1;
782 : }
783 : }
784 :
785 : /* Time to do the real deal... */
786 4494 : if (rename(oldfile, newfile) < 0)
787 : {
788 0 : ereport(elevel,
789 : (errcode_for_file_access(),
790 : errmsg("could not rename file \"%s\" to \"%s\": %m",
791 : oldfile, newfile)));
792 0 : return -1;
793 : }
794 :
795 : /*
796 : * To guarantee renaming the file is persistent, fsync the file with its
797 : * new name, and its containing directory.
798 : */
799 4494 : if (fsync_fname_ext(newfile, false, false, elevel) != 0)
800 0 : return -1;
801 :
802 4494 : if (fsync_parent_path(newfile, elevel) != 0)
803 0 : return -1;
804 :
805 4494 : return 0;
806 : }
807 :
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * After this routine returns successfully, the removal survives a crash.  A
 * crash while it is running will leave the system in no mixed state.
 *
 * This is achieved by fsyncing the file's parent directory once the file
 * itself has been unlinked.
 *
 * Errors are logged with the caller-specified severity (elevel).
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
    int         unlink_rc = unlink(fname);

    if (unlink_rc < 0)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not remove file \"%s\": %m",
                        fname)));
        return -1;
    }

    /* the removal is only persistent once the parent directory is synced */
    return (fsync_parent_path(fname, elevel) != 0) ? -1 : 0;
}
844 :
845 : /*
846 : * InitFileAccess --- initialize this module during backend startup
847 : *
848 : * This is called during either normal or standalone backend start.
849 : * It is *not* called in the postmaster.
850 : *
851 : * Note that this does not initialize temporary file access, that is
852 : * separately initialized via InitTemporaryFileAccess().
853 : */
854 : void
855 26920 : InitFileAccess(void)
856 : {
857 : Assert(SizeVfdCache == 0); /* call me only once */
858 :
859 : /* initialize cache header entry */
860 26920 : VfdCache = (Vfd *) malloc(sizeof(Vfd));
861 26920 : if (VfdCache == NULL)
862 0 : ereport(FATAL,
863 : (errcode(ERRCODE_OUT_OF_MEMORY),
864 : errmsg("out of memory")));
865 :
866 215360 : MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
867 26920 : VfdCache->fd = VFD_CLOSED;
868 :
869 26920 : SizeVfdCache = 1;
870 26920 : }
871 :
872 : /*
873 : * InitTemporaryFileAccess --- initialize temporary file access during startup
874 : *
875 : * This is called during either normal or standalone backend start.
876 : * It is *not* called in the postmaster.
877 : *
878 : * This is separate from InitFileAccess() because temporary file cleanup can
879 : * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
880 : * our reporting has to happen before that. Low level file access should be
881 : * available for longer, hence the separate initialization / shutdown of
882 : * temporary file handling.
883 : */
884 : void
885 26920 : InitTemporaryFileAccess(void)
886 : {
887 : Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
888 : Assert(!temporary_files_allowed); /* call me only once */
889 :
890 : /*
891 : * Register before-shmem-exit hook to ensure temp files are dropped while
892 : * we can still report stats.
893 : */
894 26920 : before_shmem_exit(BeforeShmemExit_Files, 0);
895 :
896 : #ifdef USE_ASSERT_CHECKING
897 : temporary_files_allowed = true;
898 : #endif
899 26920 : }
900 :
901 : /*
902 : * count_usable_fds --- count how many FDs the system will let us open,
903 : * and estimate how many are already open.
904 : *
905 : * We stop counting if usable_fds reaches max_to_probe. Note: a small
906 : * value of max_to_probe might result in an underestimate of already_open;
907 : * we must fill in any "gaps" in the set of used FDs before the calculation
908 : * of already_open will give the right answer. In practice, max_to_probe
909 : * of a couple of dozen should be enough to ensure good results.
910 : *
911 : * We assume stderr (FD 2) is available for dup'ing. While the calling
912 : * script could theoretically close that, it would be a really bad idea,
913 : * since then one risks loss of error messages from, e.g., libc.
914 : */
915 : static void
916 1230 : count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
917 : {
918 : int *fd;
919 : int size;
920 1230 : int used = 0;
921 1230 : int highestfd = 0;
922 : int j;
923 :
924 : #ifdef HAVE_GETRLIMIT
925 : struct rlimit rlim;
926 : int getrlimit_status;
927 : #endif
928 :
929 1230 : size = 1024;
930 1230 : fd = (int *) palloc(size * sizeof(int));
931 :
932 : #ifdef HAVE_GETRLIMIT
933 1230 : getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
934 1230 : if (getrlimit_status != 0)
935 0 : ereport(WARNING, (errmsg("getrlimit failed: %m")));
936 : #endif /* HAVE_GETRLIMIT */
937 :
938 : /* dup until failure or probe limit reached */
939 : for (;;)
940 1228770 : {
941 : int thisfd;
942 :
943 : #ifdef HAVE_GETRLIMIT
944 :
945 : /*
946 : * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
947 : * some platforms
948 : */
949 1230000 : if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
950 0 : break;
951 : #endif
952 :
953 1230000 : thisfd = dup(2);
954 1230000 : if (thisfd < 0)
955 : {
956 : /* Expect EMFILE or ENFILE, else it's fishy */
957 0 : if (errno != EMFILE && errno != ENFILE)
958 0 : elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
959 0 : break;
960 : }
961 :
962 1230000 : if (used >= size)
963 : {
964 0 : size *= 2;
965 0 : fd = (int *) repalloc(fd, size * sizeof(int));
966 : }
967 1230000 : fd[used++] = thisfd;
968 :
969 1230000 : if (highestfd < thisfd)
970 1230000 : highestfd = thisfd;
971 :
972 1230000 : if (used >= max_to_probe)
973 1230 : break;
974 : }
975 :
976 : /* release the files we opened */
977 1231230 : for (j = 0; j < used; j++)
978 1230000 : close(fd[j]);
979 :
980 1230 : pfree(fd);
981 :
982 : /*
983 : * Return results. usable_fds is just the number of successful dups. We
984 : * assume that the system limit is highestfd+1 (remember 0 is a legal FD
985 : * number) and so already_open is highestfd+1 - usable_fds.
986 : */
987 1230 : *usable_fds = used;
988 1230 : *already_open = highestfd + 1 - used;
989 1230 : }
990 :
991 : /*
992 : * set_max_safe_fds
993 : * Determine number of file descriptors that fd.c is allowed to use
994 : */
995 : void
996 1230 : set_max_safe_fds(void)
997 : {
998 : int usable_fds;
999 : int already_open;
1000 :
1001 : /*----------
1002 : * We want to set max_safe_fds to
1003 : * MIN(usable_fds, max_files_per_process - already_open)
1004 : * less the slop factor for files that are opened without consulting
1005 : * fd.c. This ensures that we won't exceed either max_files_per_process
1006 : * or the experimentally-determined EMFILE limit.
1007 : *----------
1008 : */
1009 1230 : count_usable_fds(max_files_per_process,
1010 : &usable_fds, &already_open);
1011 :
1012 1230 : max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
1013 :
1014 : /*
1015 : * Take off the FDs reserved for system() etc.
1016 : */
1017 1230 : max_safe_fds -= NUM_RESERVED_FDS;
1018 :
1019 : /*
1020 : * Make sure we still have enough to get by.
1021 : */
1022 1230 : if (max_safe_fds < FD_MINFREE)
1023 0 : ereport(FATAL,
1024 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1025 : errmsg("insufficient file descriptors available to start server process"),
1026 : errdetail("System allows %d, server needs at least %d.",
1027 : max_safe_fds + NUM_RESERVED_FDS,
1028 : FD_MINFREE + NUM_RESERVED_FDS)));
1029 :
1030 1230 : elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1031 : max_safe_fds, usable_fds, already_open);
1032 1230 : }
1033 :
1034 : /*
1035 : * Open a file with BasicOpenFilePerm() and pass default file mode for the
1036 : * fileMode parameter.
1037 : */
1038 : int
1039 62824 : BasicOpenFile(const char *fileName, int fileFlags)
1040 : {
1041 62824 : return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1042 : }
1043 :
1044 : /*
1045 : * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1046 : *
1047 : * This is exported for use by places that really want a plain kernel FD,
1048 : * but need to be proof against running out of FDs. Once an FD has been
1049 : * successfully returned, it is the caller's responsibility to ensure that
1050 : * it will not be leaked on ereport()! Most users should *not* call this
1051 : * routine directly, but instead use the VFD abstraction level, which
1052 : * provides protection against descriptor leaks as well as management of
1053 : * files that need to be open for more than a short period of time.
1054 : *
1055 : * Ideally this should be the *only* direct call of open() in the backend.
1056 : * In practice, the postmaster calls open() directly, and there are some
1057 : * direct open() calls done early in backend startup. Those are OK since
1058 : * this module wouldn't have any open files to close at that point anyway.
1059 : */
1060 : int
1061 2405062 : BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1062 : {
1063 : int fd;
1064 :
1065 2405062 : tryAgain:
1066 : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1067 :
1068 : /*
1069 : * The value we defined to stand in for O_DIRECT when simulating it with
1070 : * F_NOCACHE had better not collide with any of the standard flags.
1071 : */
1072 : StaticAssertStmt((PG_O_DIRECT &
1073 : (O_APPEND |
1074 : O_CLOEXEC |
1075 : O_CREAT |
1076 : O_DSYNC |
1077 : O_EXCL |
1078 : O_RDWR |
1079 : O_RDONLY |
1080 : O_SYNC |
1081 : O_TRUNC |
1082 : O_WRONLY)) == 0,
1083 : "PG_O_DIRECT value collides with standard flag");
1084 : fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1085 : #else
1086 2405062 : fd = open(fileName, fileFlags, fileMode);
1087 : #endif
1088 :
1089 2405062 : if (fd >= 0)
1090 : {
1091 : #ifdef PG_O_DIRECT_USE_F_NOCACHE
1092 : if (fileFlags & PG_O_DIRECT)
1093 : {
1094 : if (fcntl(fd, F_NOCACHE, 1) < 0)
1095 : {
1096 : int save_errno = errno;
1097 :
1098 : close(fd);
1099 : errno = save_errno;
1100 : return -1;
1101 : }
1102 : }
1103 : #endif
1104 :
1105 1806162 : return fd; /* success! */
1106 : }
1107 :
1108 598900 : if (errno == EMFILE || errno == ENFILE)
1109 : {
1110 0 : int save_errno = errno;
1111 :
1112 0 : ereport(LOG,
1113 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1114 : errmsg("out of file descriptors: %m; release and retry")));
1115 0 : errno = 0;
1116 0 : if (ReleaseLruFile())
1117 0 : goto tryAgain;
1118 0 : errno = save_errno;
1119 : }
1120 :
1121 598900 : return -1; /* failure */
1122 : }
1123 :
1124 : /*
1125 : * AcquireExternalFD - attempt to reserve an external file descriptor
1126 : *
1127 : * This should be used by callers that need to hold a file descriptor open
1128 : * over more than a short interval, but cannot use any of the other facilities
1129 : * provided by this module.
1130 : *
1131 : * The difference between this and the underlying ReserveExternalFD function
1132 : * is that this will report failure (by setting errno and returning false)
1133 : * if "too many" external FDs are already reserved. This should be used in
1134 : * any code where the total number of FDs to be reserved is not predictable
1135 : * and small.
1136 : */
1137 : bool
1138 203322 : AcquireExternalFD(void)
1139 : {
1140 : /*
1141 : * We don't want more than max_safe_fds / 3 FDs to be consumed for
1142 : * "external" FDs.
1143 : */
1144 203322 : if (numExternalFDs < max_safe_fds / 3)
1145 : {
1146 203322 : ReserveExternalFD();
1147 203322 : return true;
1148 : }
1149 0 : errno = EMFILE;
1150 0 : return false;
1151 : }
1152 :
1153 : /*
1154 : * ReserveExternalFD - report external consumption of a file descriptor
1155 : *
1156 : * This should be used by callers that need to hold a file descriptor open
1157 : * over more than a short interval, but cannot use any of the other facilities
1158 : * provided by this module. This just tracks the use of the FD and closes
1159 : * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1160 : *
1161 : * Call this directly only in code where failure to reserve the FD would be
1162 : * fatal; for example, the WAL-writing code does so, since the alternative is
1163 : * session failure. Also, it's very unwise to do so in code that could
1164 : * consume more than one FD per process.
1165 : *
1166 : * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1167 : * available, it doesn't matter too much whether this is called before or
1168 : * after actually opening the FD; but doing so beforehand reduces the risk of
1169 : * an EMFILE failure if not everybody played nice. In any case, it's solely
1170 : * caller's responsibility to keep the external-FD count in sync with reality.
1171 : */
1172 : void
1173 299856 : ReserveExternalFD(void)
1174 : {
1175 : /*
1176 : * Release VFDs if needed to stay safe. Because we do this before
1177 : * incrementing numExternalFDs, the final state will be as desired, i.e.,
1178 : * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1179 : */
1180 299856 : ReleaseLruFiles();
1181 :
1182 299856 : numExternalFDs++;
1183 299856 : }
1184 :
1185 : /*
1186 : * ReleaseExternalFD - report release of an external file descriptor
1187 : *
1188 : * This is guaranteed not to change errno, so it can be used in failure paths.
1189 : */
1190 : void
1191 270746 : ReleaseExternalFD(void)
1192 : {
1193 : Assert(numExternalFDs > 0);
1194 270746 : numExternalFDs--;
1195 270746 : }
1196 :
1197 :
#if defined(FDDEBUG)

/*
 * Debug aid: log the LRU ring as a single line, starting from
 * VfdCache[0].lruLessRecently and following lruLessRecently links until the
 * ring comes back to entry 0.
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	size_t		len;
	int			cur = VfdCache[0].lruLessRecently;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);
	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;
		len = strlen(buf);
		snprintf(buf + len, sizeof(buf) - len, "%d ", cur);
	}
	len = strlen(buf);
	snprintf(buf + len, sizeof(buf) - len, "LEAST");
	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1218 :
1219 : static void
1220 2093856 : Delete(File file)
1221 : {
1222 : Vfd *vfdP;
1223 :
1224 : Assert(file != 0);
1225 :
1226 : DO_DB(elog(LOG, "Delete %d (%s)",
1227 : file, VfdCache[file].fileName));
1228 : DO_DB(_dump_lru());
1229 :
1230 2093856 : vfdP = &VfdCache[file];
1231 :
1232 2093856 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1233 2093856 : VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1234 :
1235 : DO_DB(_dump_lru());
1236 2093856 : }
1237 :
1238 : static void
1239 62652 : LruDelete(File file)
1240 : {
1241 : Vfd *vfdP;
1242 :
1243 : Assert(file != 0);
1244 :
1245 : DO_DB(elog(LOG, "LruDelete %d (%s)",
1246 : file, VfdCache[file].fileName));
1247 :
1248 62652 : vfdP = &VfdCache[file];
1249 :
1250 : /*
1251 : * Close the file. We aren't expecting this to fail; if it does, better
1252 : * to leak the FD than to mess up our internal state.
1253 : */
1254 62652 : if (close(vfdP->fd) != 0)
1255 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1256 : "could not close file \"%s\": %m", vfdP->fileName);
1257 62652 : vfdP->fd = VFD_CLOSED;
1258 62652 : --nfile;
1259 :
1260 : /* delete the vfd record from the LRU ring */
1261 62652 : Delete(file);
1262 62652 : }
1263 :
1264 : static void
1265 2537410 : Insert(File file)
1266 : {
1267 : Vfd *vfdP;
1268 :
1269 : Assert(file != 0);
1270 :
1271 : DO_DB(elog(LOG, "Insert %d (%s)",
1272 : file, VfdCache[file].fileName));
1273 : DO_DB(_dump_lru());
1274 :
1275 2537410 : vfdP = &VfdCache[file];
1276 :
1277 2537410 : vfdP->lruMoreRecently = 0;
1278 2537410 : vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1279 2537410 : VfdCache[0].lruLessRecently = file;
1280 2537410 : VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1281 :
1282 : DO_DB(_dump_lru());
1283 2537410 : }
1284 :
1285 : /* returns 0 on success, -1 on re-open failure (with errno set) */
1286 : static int
1287 22748 : LruInsert(File file)
1288 : {
1289 : Vfd *vfdP;
1290 :
1291 : Assert(file != 0);
1292 :
1293 : DO_DB(elog(LOG, "LruInsert %d (%s)",
1294 : file, VfdCache[file].fileName));
1295 :
1296 22748 : vfdP = &VfdCache[file];
1297 :
1298 22748 : if (FileIsNotOpen(file))
1299 : {
1300 : /* Close excess kernel FDs. */
1301 22748 : ReleaseLruFiles();
1302 :
1303 : /*
1304 : * The open could still fail for lack of file descriptors, eg due to
1305 : * overall system file table being full. So, be prepared to release
1306 : * another FD if necessary...
1307 : */
1308 22748 : vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1309 : vfdP->fileMode);
1310 22748 : if (vfdP->fd < 0)
1311 : {
1312 : DO_DB(elog(LOG, "re-open failed: %m"));
1313 0 : return -1;
1314 : }
1315 : else
1316 : {
1317 22748 : ++nfile;
1318 : }
1319 : }
1320 :
1321 : /*
1322 : * put it at the head of the Lru ring
1323 : */
1324 :
1325 22748 : Insert(file);
1326 :
1327 22748 : return 0;
1328 : }
1329 :
1330 : /*
1331 : * Release one kernel FD by closing the least-recently-used VFD.
1332 : */
1333 : static bool
1334 62492 : ReleaseLruFile(void)
1335 : {
1336 : DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1337 :
1338 62492 : if (nfile > 0)
1339 : {
1340 : /*
1341 : * There are opened files and so there should be at least one used vfd
1342 : * in the ring.
1343 : */
1344 : Assert(VfdCache[0].lruMoreRecently != 0);
1345 62492 : LruDelete(VfdCache[0].lruMoreRecently);
1346 62492 : return true; /* freed a file */
1347 : }
1348 0 : return false; /* no files available to free */
1349 : }
1350 :
1351 : /*
1352 : * Release kernel FDs as needed to get under the max_safe_fds limit.
1353 : * After calling this, it's OK to try to open another file.
1354 : */
1355 : static void
1356 2808298 : ReleaseLruFiles(void)
1357 : {
1358 2870790 : while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
1359 : {
1360 62492 : if (!ReleaseLruFile())
1361 0 : break;
1362 : }
1363 2808298 : }
1364 :
1365 : static File
1366 1837616 : AllocateVfd(void)
1367 : {
1368 : Index i;
1369 : File file;
1370 :
1371 : DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1372 :
1373 : Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1374 :
1375 1837616 : if (VfdCache[0].nextFree == 0)
1376 : {
1377 : /*
1378 : * The free list is empty so it is time to increase the size of the
1379 : * array. We choose to double it each time this happens. However,
1380 : * there's not much point in starting *real* small.
1381 : */
1382 30804 : Size newCacheSize = SizeVfdCache * 2;
1383 : Vfd *newVfdCache;
1384 :
1385 30804 : if (newCacheSize < 32)
1386 23922 : newCacheSize = 32;
1387 :
1388 : /*
1389 : * Be careful not to clobber VfdCache ptr if realloc fails.
1390 : */
1391 30804 : newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1392 30804 : if (newVfdCache == NULL)
1393 0 : ereport(ERROR,
1394 : (errcode(ERRCODE_OUT_OF_MEMORY),
1395 : errmsg("out of memory")));
1396 30804 : VfdCache = newVfdCache;
1397 :
1398 : /*
1399 : * Initialize the new entries and link them into the free list.
1400 : */
1401 1260450 : for (i = SizeVfdCache; i < newCacheSize; i++)
1402 : {
1403 9837168 : MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1404 1229646 : VfdCache[i].nextFree = i + 1;
1405 1229646 : VfdCache[i].fd = VFD_CLOSED;
1406 : }
1407 30804 : VfdCache[newCacheSize - 1].nextFree = 0;
1408 30804 : VfdCache[0].nextFree = SizeVfdCache;
1409 :
1410 : /*
1411 : * Record the new size
1412 : */
1413 30804 : SizeVfdCache = newCacheSize;
1414 : }
1415 :
1416 1837616 : file = VfdCache[0].nextFree;
1417 :
1418 1837616 : VfdCache[0].nextFree = VfdCache[file].nextFree;
1419 :
1420 1837616 : return file;
1421 : }
1422 :
1423 : static void
1424 1377124 : FreeVfd(File file)
1425 : {
1426 1377124 : Vfd *vfdP = &VfdCache[file];
1427 :
1428 : DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1429 : file, vfdP->fileName ? vfdP->fileName : ""));
1430 :
1431 1377124 : if (vfdP->fileName != NULL)
1432 : {
1433 782374 : free(vfdP->fileName);
1434 782374 : vfdP->fileName = NULL;
1435 : }
1436 1377124 : vfdP->fdstate = 0x0;
1437 :
1438 1377124 : vfdP->nextFree = VfdCache[0].nextFree;
1439 1377124 : VfdCache[0].nextFree = file;
1440 1377124 : }
1441 :
1442 : /* returns 0 on success, -1 on re-open failure (with errno set) */
1443 : static int
1444 4727468 : FileAccess(File file)
1445 : {
1446 : int returnValue;
1447 :
1448 : DO_DB(elog(LOG, "FileAccess %d (%s)",
1449 : file, VfdCache[file].fileName));
1450 :
1451 : /*
1452 : * Is the file open? If not, open it and put it at the head of the LRU
1453 : * ring (possibly closing the least recently used file to get an FD).
1454 : */
1455 :
1456 4727468 : if (FileIsNotOpen(file))
1457 : {
1458 22748 : returnValue = LruInsert(file);
1459 22748 : if (returnValue != 0)
1460 0 : return returnValue;
1461 : }
1462 4704720 : else if (VfdCache[0].lruLessRecently != file)
1463 : {
1464 : /*
1465 : * We now know that the file is open and that it is not the last one
1466 : * accessed, so we need to move it to the head of the Lru ring.
1467 : */
1468 :
1469 1271796 : Delete(file);
1470 1271796 : Insert(file);
1471 : }
1472 :
1473 4727468 : return 0;
1474 : }
1475 :
1476 : /*
1477 : * Called whenever a temporary file is deleted to report its size.
1478 : */
1479 : static void
1480 5382 : ReportTemporaryFileUsage(const char *path, off_t size)
1481 : {
1482 5382 : pgstat_report_tempfile(size);
1483 :
1484 5382 : if (log_temp_files >= 0)
1485 : {
1486 1746 : if ((size / 1024) >= log_temp_files)
1487 240 : ereport(LOG,
1488 : (errmsg("temporary file: path \"%s\", size %lu",
1489 : path, (unsigned long) size)));
1490 : }
1491 5382 : }
1492 :
1493 : /*
1494 : * Called to register a temporary file for automatic close.
1495 : * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1496 : * before the file was opened.
1497 : */
1498 : static void
1499 8654 : RegisterTemporaryFile(File file)
1500 : {
1501 8654 : ResourceOwnerRememberFile(CurrentResourceOwner, file);
1502 8654 : VfdCache[file].resowner = CurrentResourceOwner;
1503 :
1504 : /* Backup mechanism for closing at end of xact. */
1505 8654 : VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1506 8654 : have_xact_temporary_files = true;
1507 8654 : }
1508 :
/*
 * Called when a shared invalidation message arrives for some relation.
 * If the VFD currently holds a kernel FD, that FD is closed via LruDelete.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	/* Nothing to do when there is no open kernel FD. */
	if (FileIsNotOpen(file))
		return;

	LruDelete(file);
}
#endif
1521 :
1522 : /*
1523 : * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1524 : * fileMode parameter.
1525 : */
1526 : File
1527 1837616 : PathNameOpenFile(const char *fileName, int fileFlags)
1528 : {
1529 1837616 : return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1530 : }
1531 :
1532 : /*
1533 : * open a file in an arbitrary directory
1534 : *
1535 : * NB: if the passed pathname is relative (which it usually is),
1536 : * it will be interpreted relative to the process' working directory
1537 : * (which should always be $PGDATA when this code is running).
1538 : */
1539 : File
1540 1837616 : PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1541 : {
1542 : char *fnamecopy;
1543 : File file;
1544 : Vfd *vfdP;
1545 :
1546 : DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1547 : fileName, fileFlags, fileMode));
1548 :
1549 : /*
1550 : * We need a malloc'd copy of the file name; fail cleanly if no room.
1551 : */
1552 1837616 : fnamecopy = strdup(fileName);
1553 1837616 : if (fnamecopy == NULL)
1554 0 : ereport(ERROR,
1555 : (errcode(ERRCODE_OUT_OF_MEMORY),
1556 : errmsg("out of memory")));
1557 :
1558 1837616 : file = AllocateVfd();
1559 1837616 : vfdP = &VfdCache[file];
1560 :
1561 : /* Close excess kernel FDs. */
1562 1837616 : ReleaseLruFiles();
1563 :
1564 : /*
1565 : * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1566 : * client shouldn't be expected to know which kernel descriptors are
1567 : * currently open, so it wouldn't make sense for them to be inherited by
1568 : * executed subprograms.
1569 : */
1570 1837616 : fileFlags |= O_CLOEXEC;
1571 :
1572 1837616 : vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1573 :
1574 1837616 : if (vfdP->fd < 0)
1575 : {
1576 594750 : int save_errno = errno;
1577 :
1578 594750 : FreeVfd(file);
1579 594750 : free(fnamecopy);
1580 594750 : errno = save_errno;
1581 594750 : return -1;
1582 : }
1583 1242866 : ++nfile;
1584 : DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1585 : vfdP->fd));
1586 :
1587 1242866 : vfdP->fileName = fnamecopy;
1588 : /* Saved flags are adjusted to be OK for re-opening file */
1589 1242866 : vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1590 1242866 : vfdP->fileMode = fileMode;
1591 1242866 : vfdP->fileSize = 0;
1592 1242866 : vfdP->fdstate = 0x0;
1593 1242866 : vfdP->resowner = NULL;
1594 :
1595 1242866 : Insert(file);
1596 :
1597 1242866 : return file;
1598 : }
1599 :
1600 : /*
1601 : * Create directory 'directory'. If necessary, create 'basedir', which must
1602 : * be the directory above it. This is designed for creating the top-level
1603 : * temporary directory on demand before creating a directory underneath it.
1604 : * Do nothing if the directory already exists.
1605 : *
1606 : * Directories created within the top-level temporary directory should begin
1607 : * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1608 : * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1609 : * that do not need any particular prefix.
1610 : */
1611 : void
1612 332 : PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1613 : {
1614 332 : if (MakePGDirectory(directory) < 0)
1615 : {
1616 26 : if (errno == EEXIST)
1617 8 : return;
1618 :
1619 : /*
1620 : * Failed. Try to create basedir first in case it's missing. Tolerate
1621 : * EEXIST to close a race against another process following the same
1622 : * algorithm.
1623 : */
1624 18 : if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1625 0 : ereport(ERROR,
1626 : (errcode_for_file_access(),
1627 : errmsg("cannot create temporary directory \"%s\": %m",
1628 : basedir)));
1629 :
1630 : /* Try again. */
1631 18 : if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1632 0 : ereport(ERROR,
1633 : (errcode_for_file_access(),
1634 : errmsg("cannot create temporary subdirectory \"%s\": %m",
1635 : directory)));
1636 : }
1637 : }
1638 :
1639 : /*
1640 : * Delete a directory and everything in it, if it exists.
1641 : */
1642 : void
1643 402 : PathNameDeleteTemporaryDir(const char *dirname)
1644 : {
1645 : struct stat statbuf;
1646 :
1647 : /* Silently ignore missing directory. */
1648 402 : if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1649 78 : return;
1650 :
1651 : /*
1652 : * Currently, walkdir doesn't offer a way for our passed in function to
1653 : * maintain state. Perhaps it should, so that we could tell the caller
1654 : * whether this operation succeeded or failed. Since this operation is
1655 : * used in a cleanup path, we wouldn't actually behave differently: we'll
1656 : * just log failures.
1657 : */
1658 324 : walkdir(dirname, unlink_if_exists_fname, false, LOG);
1659 : }
1660 :
1661 : /*
1662 : * Open a temporary file that will disappear when we close it.
1663 : *
1664 : * This routine takes care of generating an appropriate tempfile name.
1665 : * There's no need to pass in fileFlags or fileMode either, since only
1666 : * one setting makes any sense for a temp file.
1667 : *
1668 : * Unless interXact is true, the file is remembered by CurrentResourceOwner
1669 : * to ensure it's closed and deleted when it's no longer needed, typically at
1670 : * the end-of-transaction. In most cases, you don't want temporary files to
1671 : * outlive the transaction that created them, so this should be false -- but
1672 : * if you need "somewhat" temporary storage, this might be useful. In either
1673 : * case, the file is removed when the File is explicitly closed.
1674 : */
1675 : File
1676 2980 : OpenTemporaryFile(bool interXact)
1677 : {
1678 2980 : File file = 0;
1679 :
1680 : Assert(temporary_files_allowed); /* check temp file access is up */
1681 :
1682 : /*
1683 : * Make sure the current resource owner has space for this File before we
1684 : * open it, if we'll be registering it below.
1685 : */
1686 2980 : if (!interXact)
1687 2968 : ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1688 :
1689 : /*
1690 : * If some temp tablespace(s) have been given to us, try to use the next
1691 : * one. If a given tablespace can't be found, we silently fall back to
1692 : * the database's default tablespace.
1693 : *
1694 : * BUT: if the temp file is slated to outlive the current transaction,
1695 : * force it into the database's default tablespace, so that it will not
1696 : * pose a threat to possible tablespace drop attempts.
1697 : */
1698 2980 : if (numTempTableSpaces > 0 && !interXact)
1699 : {
1700 2 : Oid tblspcOid = GetNextTempTableSpace();
1701 :
1702 2 : if (OidIsValid(tblspcOid))
1703 2 : file = OpenTemporaryFileInTablespace(tblspcOid, false);
1704 : }
1705 :
1706 : /*
1707 : * If not, or if tablespace is bad, create in database's default
1708 : * tablespace. MyDatabaseTableSpace should normally be set before we get
1709 : * here, but just in case it isn't, fall back to pg_default tablespace.
1710 : */
1711 2980 : if (file <= 0)
1712 2978 : file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1713 : MyDatabaseTableSpace :
1714 : DEFAULTTABLESPACE_OID,
1715 : true);
1716 :
1717 : /* Mark it for deletion at close and temporary file size limit */
1718 2980 : VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1719 :
1720 : /* Register it with the current resource owner */
1721 2980 : if (!interXact)
1722 2968 : RegisterTemporaryFile(file);
1723 :
1724 2980 : return file;
1725 : }
1726 :
1727 : /*
1728 : * Return the path of the temp directory in a given tablespace.
1729 : */
1730 : void
1731 16746 : TempTablespacePath(char *path, Oid tablespace)
1732 : {
1733 : /*
1734 : * Identify the tempfile directory for this tablespace.
1735 : *
1736 : * If someone tries to specify pg_global, use pg_default instead.
1737 : */
1738 16746 : if (tablespace == InvalidOid ||
1739 2 : tablespace == DEFAULTTABLESPACE_OID ||
1740 : tablespace == GLOBALTABLESPACE_OID)
1741 16744 : snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1742 : else
1743 : {
1744 : /* All other tablespaces are accessed via symlinks */
1745 2 : snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1746 : tablespace, TABLESPACE_VERSION_DIRECTORY,
1747 : PG_TEMP_FILES_DIR);
1748 : }
1749 16746 : }
1750 :
1751 : /*
1752 : * Open a temporary file in a specific tablespace.
1753 : * Subroutine for OpenTemporaryFile, which see for details.
1754 : */
1755 : static File
1756 2980 : OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1757 : {
1758 : char tempdirpath[MAXPGPATH];
1759 : char tempfilepath[MAXPGPATH];
1760 : File file;
1761 :
1762 2980 : TempTablespacePath(tempdirpath, tblspcOid);
1763 :
1764 : /*
1765 : * Generate a tempfile name that should be unique within the current
1766 : * database instance.
1767 : */
1768 2980 : snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1769 : tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1770 :
1771 : /*
1772 : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1773 : * temp file that can be reused.
1774 : */
1775 2980 : file = PathNameOpenFile(tempfilepath,
1776 : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1777 2980 : if (file <= 0)
1778 : {
1779 : /*
1780 : * We might need to create the tablespace's tempfile directory, if no
1781 : * one has yet done so.
1782 : *
1783 : * Don't check for an error from MakePGDirectory; it could fail if
1784 : * someone else just did the same thing. If it doesn't work then
1785 : * we'll bomb out on the second create attempt, instead.
1786 : */
1787 140 : (void) MakePGDirectory(tempdirpath);
1788 :
1789 140 : file = PathNameOpenFile(tempfilepath,
1790 : O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1791 140 : if (file <= 0 && rejectError)
1792 0 : elog(ERROR, "could not create temporary file \"%s\": %m",
1793 : tempfilepath);
1794 : }
1795 :
1796 2980 : return file;
1797 : }
1798 :
1799 :
1800 : /*
1801 : * Create a new file. The directory containing it must already exist. Files
1802 : * created this way are subject to temp_file_limit and are automatically
1803 : * closed at end of transaction, but are not automatically deleted on close
1804 : * because they are intended to be shared between cooperating backends.
1805 : *
1806 : * If the file is inside the top-level temporary directory, its name should
1807 : * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1808 : * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1809 : * inside a directory created with PathNameCreateTemporaryDir(), in which case
1810 : * the prefix isn't needed.
1811 : */
1812 : File
1813 2734 : PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1814 : {
1815 : File file;
1816 :
1817 : Assert(temporary_files_allowed); /* check temp file access is up */
1818 :
1819 2734 : ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1820 :
1821 : /*
1822 : * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1823 : * temp file that can be reused.
1824 : */
1825 2734 : file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1826 2734 : if (file <= 0)
1827 : {
1828 332 : if (error_on_failure)
1829 0 : ereport(ERROR,
1830 : (errcode_for_file_access(),
1831 : errmsg("could not create temporary file \"%s\": %m",
1832 : path)));
1833 : else
1834 332 : return file;
1835 : }
1836 :
1837 : /* Mark it for temp_file_limit accounting. */
1838 2402 : VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1839 :
1840 : /* Register it for automatic close. */
1841 2402 : RegisterTemporaryFile(file);
1842 :
1843 2402 : return file;
1844 : }
1845 :
1846 : /*
1847 : * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1848 : * another backend. Files opened this way don't count against the
1849 : * temp_file_limit of the caller, are automatically closed at the end of the
1850 : * transaction but are not deleted on close.
1851 : */
1852 : File
1853 7114 : PathNameOpenTemporaryFile(const char *path, int mode)
1854 : {
1855 : File file;
1856 :
1857 : Assert(temporary_files_allowed); /* check temp file access is up */
1858 :
1859 7114 : ResourceOwnerEnlargeFiles(CurrentResourceOwner);
1860 :
1861 7114 : file = PathNameOpenFile(path, mode | PG_BINARY);
1862 :
1863 : /* If no such file, then we don't raise an error. */
1864 7114 : if (file <= 0 && errno != ENOENT)
1865 0 : ereport(ERROR,
1866 : (errcode_for_file_access(),
1867 : errmsg("could not open temporary file \"%s\": %m",
1868 : path)));
1869 :
1870 7114 : if (file > 0)
1871 : {
1872 : /* Register it for automatic close. */
1873 3284 : RegisterTemporaryFile(file);
1874 : }
1875 :
1876 7114 : return file;
1877 : }
1878 :
1879 : /*
1880 : * Delete a file by pathname. Return true if the file existed, false if
1881 : * didn't.
1882 : */
1883 : bool
1884 5508 : PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1885 : {
1886 : struct stat filestats;
1887 : int stat_errno;
1888 :
1889 : /* Get the final size for pgstat reporting. */
1890 5508 : if (stat(path, &filestats) != 0)
1891 3106 : stat_errno = errno;
1892 : else
1893 2402 : stat_errno = 0;
1894 :
1895 : /*
1896 : * Unlike FileClose's automatic file deletion code, we tolerate
1897 : * non-existence to support BufFileDeleteFileSet which doesn't know how
1898 : * many segments it has to delete until it runs out.
1899 : */
1900 5508 : if (stat_errno == ENOENT)
1901 3106 : return false;
1902 :
1903 2402 : if (unlink(path) < 0)
1904 : {
1905 0 : if (errno != ENOENT)
1906 0 : ereport(error_on_failure ? ERROR : LOG,
1907 : (errcode_for_file_access(),
1908 : errmsg("could not unlink temporary file \"%s\": %m",
1909 : path)));
1910 0 : return false;
1911 : }
1912 :
1913 2402 : if (stat_errno == 0)
1914 2402 : ReportTemporaryFileUsage(path, filestats.st_size);
1915 : else
1916 : {
1917 0 : errno = stat_errno;
1918 0 : ereport(LOG,
1919 : (errcode_for_file_access(),
1920 : errmsg("could not stat file \"%s\": %m", path)));
1921 : }
1922 :
1923 2402 : return true;
1924 : }
1925 :
1926 : /*
1927 : * close a file when done with it
1928 : */
1929 : void
1930 782374 : FileClose(File file)
1931 : {
1932 : Vfd *vfdP;
1933 :
1934 : Assert(FileIsValid(file));
1935 :
1936 : DO_DB(elog(LOG, "FileClose: %d (%s)",
1937 : file, VfdCache[file].fileName));
1938 :
1939 782374 : vfdP = &VfdCache[file];
1940 :
1941 782374 : if (!FileIsNotOpen(file))
1942 : {
1943 : /* close the file */
1944 759408 : if (close(vfdP->fd) != 0)
1945 : {
1946 : /*
1947 : * We may need to panic on failure to close non-temporary files;
1948 : * see LruDelete.
1949 : */
1950 0 : elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1951 : "could not close file \"%s\": %m", vfdP->fileName);
1952 : }
1953 :
1954 759408 : --nfile;
1955 759408 : vfdP->fd = VFD_CLOSED;
1956 :
1957 : /* remove the file from the lru ring */
1958 759408 : Delete(file);
1959 : }
1960 :
1961 782374 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1962 : {
1963 : /* Subtract its size from current usage (do first in case of error) */
1964 5382 : temporary_files_size -= vfdP->fileSize;
1965 5382 : vfdP->fileSize = 0;
1966 : }
1967 :
1968 : /*
1969 : * Delete the file if it was temporary, and make a log entry if wanted
1970 : */
1971 782374 : if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1972 : {
1973 : struct stat filestats;
1974 : int stat_errno;
1975 :
1976 : /*
1977 : * If we get an error, as could happen within the ereport/elog calls,
1978 : * we'll come right back here during transaction abort. Reset the
1979 : * flag to ensure that we can't get into an infinite loop. This code
1980 : * is arranged to ensure that the worst-case consequence is failing to
1981 : * emit log message(s), not failing to attempt the unlink.
1982 : */
1983 2980 : vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1984 :
1985 :
1986 : /* first try the stat() */
1987 2980 : if (stat(vfdP->fileName, &filestats))
1988 0 : stat_errno = errno;
1989 : else
1990 2980 : stat_errno = 0;
1991 :
1992 : /* in any case do the unlink */
1993 2980 : if (unlink(vfdP->fileName))
1994 0 : ereport(LOG,
1995 : (errcode_for_file_access(),
1996 : errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
1997 :
1998 : /* and last report the stat results */
1999 2980 : if (stat_errno == 0)
2000 2980 : ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2001 : else
2002 : {
2003 0 : errno = stat_errno;
2004 0 : ereport(LOG,
2005 : (errcode_for_file_access(),
2006 : errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2007 : }
2008 : }
2009 :
2010 : /* Unregister it from the resource owner */
2011 782374 : if (vfdP->resowner)
2012 8654 : ResourceOwnerForgetFile(vfdP->resowner, file);
2013 :
2014 : /*
2015 : * Return the Vfd slot to the free list
2016 : */
2017 782374 : FreeVfd(file);
2018 782374 : }
2019 :
2020 : /*
2021 : * FilePrefetch - initiate asynchronous read of a given range of the file.
2022 : *
2023 : * Currently the only implementation of this function is using posix_fadvise
2024 : * which is the simplest standardized interface that accomplishes this.
2025 : * We could add an implementation using libaio in the future; but note that
2026 : * this API is inappropriate for libaio, which wants to have a buffer provided
2027 : * to read into.
2028 : */
2029 : int
2030 247636 : FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
2031 : {
2032 : #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2033 : int returnCode;
2034 :
2035 : Assert(FileIsValid(file));
2036 :
2037 : DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2038 : file, VfdCache[file].fileName,
2039 : (int64) offset, (int64) amount));
2040 :
2041 247636 : returnCode = FileAccess(file);
2042 247636 : if (returnCode < 0)
2043 0 : return returnCode;
2044 :
2045 247636 : retry:
2046 247636 : pgstat_report_wait_start(wait_event_info);
2047 247636 : returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2048 : POSIX_FADV_WILLNEED);
2049 247636 : pgstat_report_wait_end();
2050 :
2051 247636 : if (returnCode == EINTR)
2052 0 : goto retry;
2053 :
2054 247636 : return returnCode;
2055 : #else
2056 : Assert(FileIsValid(file));
2057 : return 0;
2058 : #endif
2059 : }
2060 :
2061 : void
2062 120228 : FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2063 : {
2064 : int returnCode;
2065 :
2066 : Assert(FileIsValid(file));
2067 :
2068 : DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2069 : file, VfdCache[file].fileName,
2070 : (int64) offset, (int64) nbytes));
2071 :
2072 120228 : if (nbytes <= 0)
2073 0 : return;
2074 :
2075 120228 : if (VfdCache[file].fileFlags & PG_O_DIRECT)
2076 0 : return;
2077 :
2078 120228 : returnCode = FileAccess(file);
2079 120228 : if (returnCode < 0)
2080 0 : return;
2081 :
2082 120228 : pgstat_report_wait_start(wait_event_info);
2083 120228 : pg_flush_data(VfdCache[file].fd, offset, nbytes);
2084 120228 : pgstat_report_wait_end();
2085 : }
2086 :
2087 : int
2088 2811466 : FileRead(File file, void *buffer, size_t amount, off_t offset,
2089 : uint32 wait_event_info)
2090 : {
2091 : int returnCode;
2092 : Vfd *vfdP;
2093 :
2094 : Assert(FileIsValid(file));
2095 :
2096 : DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %zu %p",
2097 : file, VfdCache[file].fileName,
2098 : (int64) offset,
2099 : amount, buffer));
2100 :
2101 2811466 : returnCode = FileAccess(file);
2102 2811466 : if (returnCode < 0)
2103 0 : return returnCode;
2104 :
2105 2811466 : vfdP = &VfdCache[file];
2106 :
2107 2811466 : retry:
2108 2811466 : pgstat_report_wait_start(wait_event_info);
2109 2811466 : returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
2110 2811466 : pgstat_report_wait_end();
2111 :
2112 2811466 : if (returnCode < 0)
2113 : {
2114 : /*
2115 : * Windows may run out of kernel buffers and return "Insufficient
2116 : * system resources" error. Wait a bit and retry to solve it.
2117 : *
2118 : * It is rumored that EINTR is also possible on some Unix filesystems,
2119 : * in which case immediate retry is indicated.
2120 : */
2121 : #ifdef WIN32
2122 : DWORD error = GetLastError();
2123 :
2124 : switch (error)
2125 : {
2126 : case ERROR_NO_SYSTEM_RESOURCES:
2127 : pg_usleep(1000L);
2128 : errno = EINTR;
2129 : break;
2130 : default:
2131 : _dosmaperr(error);
2132 : break;
2133 : }
2134 : #endif
2135 : /* OK to retry if interrupted */
2136 0 : if (errno == EINTR)
2137 0 : goto retry;
2138 : }
2139 :
2140 2811466 : return returnCode;
2141 : }
2142 :
2143 : int
2144 1155452 : FileWrite(File file, const void *buffer, size_t amount, off_t offset,
2145 : uint32 wait_event_info)
2146 : {
2147 : int returnCode;
2148 : Vfd *vfdP;
2149 :
2150 : Assert(FileIsValid(file));
2151 :
2152 : DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %zu %p",
2153 : file, VfdCache[file].fileName,
2154 : (int64) offset,
2155 : amount, buffer));
2156 :
2157 1155452 : returnCode = FileAccess(file);
2158 1155452 : if (returnCode < 0)
2159 0 : return returnCode;
2160 :
2161 1155452 : vfdP = &VfdCache[file];
2162 :
2163 : /*
2164 : * If enforcing temp_file_limit and it's a temp file, check to see if the
2165 : * write would overrun temp_file_limit, and throw error if so. Note: it's
2166 : * really a modularity violation to throw error here; we should set errno
2167 : * and return -1. However, there's no way to report a suitable error
2168 : * message if we do that. All current callers would just throw error
2169 : * immediately anyway, so this is safe at present.
2170 : */
2171 1155452 : if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2172 : {
2173 0 : off_t past_write = offset + amount;
2174 :
2175 0 : if (past_write > vfdP->fileSize)
2176 : {
2177 0 : uint64 newTotal = temporary_files_size;
2178 :
2179 0 : newTotal += past_write - vfdP->fileSize;
2180 0 : if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2181 0 : ereport(ERROR,
2182 : (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2183 : errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2184 : temp_file_limit)));
2185 : }
2186 : }
2187 :
2188 1155452 : retry:
2189 1155452 : errno = 0;
2190 1155452 : pgstat_report_wait_start(wait_event_info);
2191 1155452 : returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
2192 1155452 : pgstat_report_wait_end();
2193 :
2194 : /* if write didn't set errno, assume problem is no disk space */
2195 1155452 : if (returnCode != amount && errno == 0)
2196 0 : errno = ENOSPC;
2197 :
2198 1155452 : if (returnCode >= 0)
2199 : {
2200 : /*
2201 : * Maintain fileSize and temporary_files_size if it's a temp file.
2202 : */
2203 1155452 : if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2204 : {
2205 113256 : off_t past_write = offset + amount;
2206 :
2207 113256 : if (past_write > vfdP->fileSize)
2208 : {
2209 78300 : temporary_files_size += past_write - vfdP->fileSize;
2210 78300 : vfdP->fileSize = past_write;
2211 : }
2212 : }
2213 : }
2214 : else
2215 : {
2216 : /*
2217 : * See comments in FileRead()
2218 : */
2219 : #ifdef WIN32
2220 : DWORD error = GetLastError();
2221 :
2222 : switch (error)
2223 : {
2224 : case ERROR_NO_SYSTEM_RESOURCES:
2225 : pg_usleep(1000L);
2226 : errno = EINTR;
2227 : break;
2228 : default:
2229 : _dosmaperr(error);
2230 : break;
2231 : }
2232 : #endif
2233 : /* OK to retry if interrupted */
2234 0 : if (errno == EINTR)
2235 0 : goto retry;
2236 : }
2237 :
2238 1155452 : return returnCode;
2239 : }
2240 :
2241 : int
2242 29268 : FileSync(File file, uint32 wait_event_info)
2243 : {
2244 : int returnCode;
2245 :
2246 : Assert(FileIsValid(file));
2247 :
2248 : DO_DB(elog(LOG, "FileSync: %d (%s)",
2249 : file, VfdCache[file].fileName));
2250 :
2251 29268 : returnCode = FileAccess(file);
2252 29268 : if (returnCode < 0)
2253 0 : return returnCode;
2254 :
2255 29268 : pgstat_report_wait_start(wait_event_info);
2256 29268 : returnCode = pg_fsync(VfdCache[file].fd);
2257 29268 : pgstat_report_wait_end();
2258 :
2259 29268 : return returnCode;
2260 : }
2261 :
2262 : /*
2263 : * Zero a region of the file.
2264 : *
2265 : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2266 : * appropriate error.
2267 : */
2268 : int
2269 345724 : FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
2270 : {
2271 : int returnCode;
2272 : ssize_t written;
2273 :
2274 : Assert(FileIsValid(file));
2275 :
2276 : DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2277 : file, VfdCache[file].fileName,
2278 : (int64) offset, (int64) amount));
2279 :
2280 345724 : returnCode = FileAccess(file);
2281 345724 : if (returnCode < 0)
2282 0 : return returnCode;
2283 :
2284 345724 : pgstat_report_wait_start(wait_event_info);
2285 345724 : written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2286 345724 : pgstat_report_wait_end();
2287 :
2288 345724 : if (written < 0)
2289 0 : return -1;
2290 345724 : else if (written != amount)
2291 : {
2292 : /* if errno is unset, assume problem is no disk space */
2293 0 : if (errno == 0)
2294 0 : errno = ENOSPC;
2295 0 : return -1;
2296 : }
2297 :
2298 345724 : return 0;
2299 : }
2300 :
2301 : /*
2302 : * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2303 : * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2304 : * use FileZero() instead.
2305 : *
2306 : * Note that at least glibc() implements posix_fallocate() in userspace if not
2307 : * implemented by the filesystem. That's not the case for all environments
2308 : * though.
2309 : *
2310 : * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2311 : * appropriate error.
2312 : */
2313 : int
2314 990 : FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
2315 : {
2316 : #ifdef HAVE_POSIX_FALLOCATE
2317 : int returnCode;
2318 :
2319 : Assert(FileIsValid(file));
2320 :
2321 : DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2322 : file, VfdCache[file].fileName,
2323 : (int64) offset, (int64) amount));
2324 :
2325 990 : returnCode = FileAccess(file);
2326 990 : if (returnCode < 0)
2327 0 : return -1;
2328 :
2329 990 : retry:
2330 990 : pgstat_report_wait_start(wait_event_info);
2331 990 : returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2332 990 : pgstat_report_wait_end();
2333 :
2334 990 : if (returnCode == 0)
2335 990 : return 0;
2336 0 : else if (returnCode == EINTR)
2337 0 : goto retry;
2338 :
2339 : /* for compatibility with %m printing etc */
2340 0 : errno = returnCode;
2341 :
2342 : /*
2343 : * Return in cases of a "real" failure, if fallocate is not supported,
2344 : * fall through to the FileZero() backed implementation.
2345 : */
2346 0 : if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
2347 0 : return -1;
2348 : #endif
2349 :
2350 0 : return FileZero(file, offset, amount, wait_event_info);
2351 : }
2352 :
2353 : off_t
2354 2991178 : FileSize(File file)
2355 : {
2356 : Assert(FileIsValid(file));
2357 :
2358 : DO_DB(elog(LOG, "FileSize %d (%s)",
2359 : file, VfdCache[file].fileName));
2360 :
2361 2991178 : if (FileIsNotOpen(file))
2362 : {
2363 15754 : if (FileAccess(file) < 0)
2364 0 : return (off_t) -1;
2365 : }
2366 :
2367 2991178 : return lseek(VfdCache[file].fd, 0, SEEK_END);
2368 : }
2369 :
2370 : int
2371 950 : FileTruncate(File file, off_t offset, uint32 wait_event_info)
2372 : {
2373 : int returnCode;
2374 :
2375 : Assert(FileIsValid(file));
2376 :
2377 : DO_DB(elog(LOG, "FileTruncate %d (%s)",
2378 : file, VfdCache[file].fileName));
2379 :
2380 950 : returnCode = FileAccess(file);
2381 950 : if (returnCode < 0)
2382 0 : return returnCode;
2383 :
2384 950 : pgstat_report_wait_start(wait_event_info);
2385 950 : returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2386 950 : pgstat_report_wait_end();
2387 :
2388 950 : if (returnCode == 0 && VfdCache[file].fileSize > offset)
2389 : {
2390 : /* adjust our state for truncation of a temp file */
2391 : Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2392 0 : temporary_files_size -= VfdCache[file].fileSize - offset;
2393 0 : VfdCache[file].fileSize = offset;
2394 : }
2395 :
2396 950 : return returnCode;
2397 : }
2398 :
2399 : /*
2400 : * Return the pathname associated with an open file.
2401 : *
2402 : * The returned string points to an internal buffer, which is valid until
2403 : * the file is closed.
2404 : */
2405 : char *
2406 0 : FilePathName(File file)
2407 : {
2408 : Assert(FileIsValid(file));
2409 :
2410 0 : return VfdCache[file].fileName;
2411 : }
2412 :
2413 : /*
2414 : * Return the raw file descriptor of an opened file.
2415 : *
2416 : * The returned file descriptor will be valid until the file is closed, but
2417 : * there are a lot of things that can make that happen. So the caller should
2418 : * be careful not to do much of anything else before it finishes using the
2419 : * returned file descriptor.
2420 : */
2421 : int
2422 0 : FileGetRawDesc(File file)
2423 : {
2424 : Assert(FileIsValid(file));
2425 0 : return VfdCache[file].fd;
2426 : }
2427 :
2428 : /*
2429 : * FileGetRawFlags - returns the file flags on open(2)
2430 : */
2431 : int
2432 0 : FileGetRawFlags(File file)
2433 : {
2434 : Assert(FileIsValid(file));
2435 0 : return VfdCache[file].fileFlags;
2436 : }
2437 :
2438 : /*
2439 : * FileGetRawMode - returns the mode bitmask passed to open(2)
2440 : */
2441 : mode_t
2442 0 : FileGetRawMode(File file)
2443 : {
2444 : Assert(FileIsValid(file));
2445 0 : return VfdCache[file].fileMode;
2446 : }
2447 :
2448 : /*
2449 : * Make room for another allocatedDescs[] array entry if needed and possible.
2450 : * Returns true if an array element is available.
2451 : */
2452 : static bool
2453 648078 : reserveAllocatedDesc(void)
2454 : {
2455 : AllocateDesc *newDescs;
2456 : int newMax;
2457 :
2458 : /* Quick out if array already has a free slot. */
2459 648078 : if (numAllocatedDescs < maxAllocatedDescs)
2460 646560 : return true;
2461 :
2462 : /*
2463 : * If the array hasn't yet been created in the current process, initialize
2464 : * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2465 : * we will ever need, anyway. We don't want to look at max_safe_fds
2466 : * immediately because set_max_safe_fds() may not have run yet.
2467 : */
2468 1518 : if (allocatedDescs == NULL)
2469 : {
2470 1518 : newMax = FD_MINFREE / 3;
2471 1518 : newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2472 : /* Out of memory already? Treat as fatal error. */
2473 1518 : if (newDescs == NULL)
2474 0 : ereport(ERROR,
2475 : (errcode(ERRCODE_OUT_OF_MEMORY),
2476 : errmsg("out of memory")));
2477 1518 : allocatedDescs = newDescs;
2478 1518 : maxAllocatedDescs = newMax;
2479 1518 : return true;
2480 : }
2481 :
2482 : /*
2483 : * Consider enlarging the array beyond the initial allocation used above.
2484 : * By the time this happens, max_safe_fds should be known accurately.
2485 : *
2486 : * We mustn't let allocated descriptors hog all the available FDs, and in
2487 : * practice we'd better leave a reasonable number of FDs for VFD use. So
2488 : * set the maximum to max_safe_fds / 3. (This should certainly be at
2489 : * least as large as the initial size, FD_MINFREE / 3, so we aren't
2490 : * tightening the restriction here.) Recall that "external" FDs are
2491 : * allowed to consume another third of max_safe_fds.
2492 : */
2493 0 : newMax = max_safe_fds / 3;
2494 0 : if (newMax > maxAllocatedDescs)
2495 : {
2496 0 : newDescs = (AllocateDesc *) realloc(allocatedDescs,
2497 : newMax * sizeof(AllocateDesc));
2498 : /* Treat out-of-memory as a non-fatal error. */
2499 0 : if (newDescs == NULL)
2500 0 : return false;
2501 0 : allocatedDescs = newDescs;
2502 0 : maxAllocatedDescs = newMax;
2503 0 : return true;
2504 : }
2505 :
2506 : /* Can't enlarge allocatedDescs[] any more. */
2507 0 : return false;
2508 : }
2509 :
2510 : /*
2511 : * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2512 : * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2513 : * necessary to open the file. When done, call FreeFile rather than fclose.
2514 : *
2515 : * Note that files that will be open for any significant length of time
2516 : * should NOT be handled this way, since they cannot share kernel file
2517 : * descriptors with other files; there is grave risk of running out of FDs
2518 : * if anyone locks down too many FDs. Most callers of this routine are
2519 : * simply reading a config file that they will read and close immediately.
2520 : *
2521 : * fd.c will automatically close all files opened with AllocateFile at
2522 : * transaction commit or abort; this prevents FD leakage if a routine
2523 : * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2524 : *
2525 : * Ideally this should be the *only* direct call of fopen() in the backend.
2526 : */
2527 : FILE *
2528 110728 : AllocateFile(const char *name, const char *mode)
2529 : {
2530 : FILE *file;
2531 :
2532 : DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2533 : numAllocatedDescs, name));
2534 :
2535 : /* Can we allocate another non-virtual FD? */
2536 110728 : if (!reserveAllocatedDesc())
2537 0 : ereport(ERROR,
2538 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2539 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2540 : maxAllocatedDescs, name)));
2541 :
2542 : /* Close excess kernel FDs. */
2543 110728 : ReleaseLruFiles();
2544 :
2545 110728 : TryAgain:
2546 110728 : if ((file = fopen(name, mode)) != NULL)
2547 : {
2548 102486 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2549 :
2550 102486 : desc->kind = AllocateDescFile;
2551 102486 : desc->desc.file = file;
2552 102486 : desc->create_subid = GetCurrentSubTransactionId();
2553 102486 : numAllocatedDescs++;
2554 102486 : return desc->desc.file;
2555 : }
2556 :
2557 8242 : if (errno == EMFILE || errno == ENFILE)
2558 : {
2559 0 : int save_errno = errno;
2560 :
2561 0 : ereport(LOG,
2562 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2563 : errmsg("out of file descriptors: %m; release and retry")));
2564 0 : errno = 0;
2565 0 : if (ReleaseLruFile())
2566 0 : goto TryAgain;
2567 0 : errno = save_errno;
2568 : }
2569 :
2570 8242 : return NULL;
2571 : }
2572 :
2573 : /*
2574 : * Open a file with OpenTransientFilePerm() and pass default file mode for
2575 : * the fileMode parameter.
2576 : */
2577 : int
2578 481716 : OpenTransientFile(const char *fileName, int fileFlags)
2579 : {
2580 481716 : return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2581 : }
2582 :
2583 : /*
2584 : * Like AllocateFile, but returns an unbuffered fd like open(2)
2585 : */
2586 : int
2587 481728 : OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2588 : {
2589 : int fd;
2590 :
2591 : DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2592 : numAllocatedDescs, fileName));
2593 :
2594 : /* Can we allocate another non-virtual FD? */
2595 481728 : if (!reserveAllocatedDesc())
2596 0 : ereport(ERROR,
2597 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2598 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2599 : maxAllocatedDescs, fileName)));
2600 :
2601 : /* Close excess kernel FDs. */
2602 481728 : ReleaseLruFiles();
2603 :
2604 481728 : fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2605 :
2606 481728 : if (fd >= 0)
2607 : {
2608 478668 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2609 :
2610 478668 : desc->kind = AllocateDescRawFD;
2611 478668 : desc->desc.fd = fd;
2612 478668 : desc->create_subid = GetCurrentSubTransactionId();
2613 478668 : numAllocatedDescs++;
2614 :
2615 478668 : return fd;
2616 : }
2617 :
2618 3060 : return -1; /* failure */
2619 : }
2620 :
2621 : /*
2622 : * Routines that want to initiate a pipe stream should use OpenPipeStream
2623 : * rather than plain popen(). This lets fd.c deal with freeing FDs if
2624 : * necessary. When done, call ClosePipeStream rather than pclose.
2625 : *
2626 : * This function also ensures that the popen'd program is run with default
2627 : * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2628 : * uses. This ensures desirable response to, eg, closing a read pipe early.
2629 : */
2630 : FILE *
2631 74 : OpenPipeStream(const char *command, const char *mode)
2632 : {
2633 : FILE *file;
2634 : int save_errno;
2635 :
2636 : DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2637 : numAllocatedDescs, command));
2638 :
2639 : /* Can we allocate another non-virtual FD? */
2640 74 : if (!reserveAllocatedDesc())
2641 0 : ereport(ERROR,
2642 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2643 : errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2644 : maxAllocatedDescs, command)));
2645 :
2646 : /* Close excess kernel FDs. */
2647 74 : ReleaseLruFiles();
2648 :
2649 74 : TryAgain:
2650 74 : fflush(NULL);
2651 74 : pqsignal(SIGPIPE, SIG_DFL);
2652 74 : errno = 0;
2653 74 : file = popen(command, mode);
2654 74 : save_errno = errno;
2655 74 : pqsignal(SIGPIPE, SIG_IGN);
2656 74 : errno = save_errno;
2657 74 : if (file != NULL)
2658 : {
2659 74 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2660 :
2661 74 : desc->kind = AllocateDescPipe;
2662 74 : desc->desc.file = file;
2663 74 : desc->create_subid = GetCurrentSubTransactionId();
2664 74 : numAllocatedDescs++;
2665 74 : return desc->desc.file;
2666 : }
2667 :
2668 0 : if (errno == EMFILE || errno == ENFILE)
2669 : {
2670 0 : ereport(LOG,
2671 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2672 : errmsg("out of file descriptors: %m; release and retry")));
2673 0 : if (ReleaseLruFile())
2674 0 : goto TryAgain;
2675 0 : errno = save_errno;
2676 : }
2677 :
2678 0 : return NULL;
2679 : }
2680 :
2681 : /*
2682 : * Free an AllocateDesc of any type.
2683 : *
2684 : * The argument *must* point into the allocatedDescs[] array.
2685 : */
2686 : static int
2687 635476 : FreeDesc(AllocateDesc *desc)
2688 : {
2689 : int result;
2690 :
2691 : /* Close the underlying object */
2692 635476 : switch (desc->kind)
2693 : {
2694 102486 : case AllocateDescFile:
2695 102486 : result = fclose(desc->desc.file);
2696 102486 : break;
2697 74 : case AllocateDescPipe:
2698 74 : result = pclose(desc->desc.file);
2699 74 : break;
2700 54248 : case AllocateDescDir:
2701 54248 : result = closedir(desc->desc.dir);
2702 54248 : break;
2703 478668 : case AllocateDescRawFD:
2704 478668 : result = close(desc->desc.fd);
2705 478668 : break;
2706 0 : default:
2707 0 : elog(ERROR, "AllocateDesc kind not recognized");
2708 : result = 0; /* keep compiler quiet */
2709 : break;
2710 : }
2711 :
2712 : /* Compact storage in the allocatedDescs array */
2713 635476 : numAllocatedDescs--;
2714 635476 : *desc = allocatedDescs[numAllocatedDescs];
2715 :
2716 635476 : return result;
2717 : }
2718 :
2719 : /*
2720 : * Close a file returned by AllocateFile.
2721 : *
2722 : * Note we do not check fclose's return value --- it is up to the caller
2723 : * to handle close errors.
2724 : */
2725 : int
2726 102468 : FreeFile(FILE *file)
2727 : {
2728 : int i;
2729 :
2730 : DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2731 :
2732 : /* Remove file from list of allocated files, if it's present */
2733 102470 : for (i = numAllocatedDescs; --i >= 0;)
2734 : {
2735 102470 : AllocateDesc *desc = &allocatedDescs[i];
2736 :
2737 102470 : if (desc->kind == AllocateDescFile && desc->desc.file == file)
2738 102468 : return FreeDesc(desc);
2739 : }
2740 :
2741 : /* Only get here if someone passes us a file not in allocatedDescs */
2742 0 : elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2743 :
2744 0 : return fclose(file);
2745 : }
2746 :
2747 : /*
2748 : * Close a file returned by OpenTransientFile.
2749 : *
2750 : * Note we do not check close's return value --- it is up to the caller
2751 : * to handle close errors.
2752 : */
2753 : int
2754 478666 : CloseTransientFile(int fd)
2755 : {
2756 : int i;
2757 :
2758 : DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2759 :
2760 : /* Remove fd from list of allocated files, if it's present */
2761 478666 : for (i = numAllocatedDescs; --i >= 0;)
2762 : {
2763 478666 : AllocateDesc *desc = &allocatedDescs[i];
2764 :
2765 478666 : if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2766 478666 : return FreeDesc(desc);
2767 : }
2768 :
2769 : /* Only get here if someone passes us a file not in allocatedDescs */
2770 0 : elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2771 :
2772 0 : return close(fd);
2773 : }
2774 :
2775 : /*
2776 : * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2777 : * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2778 : * necessary to open the directory, and with closing it after an elog.
2779 : * When done, call FreeDir rather than closedir.
2780 : *
2781 : * Returns NULL, with errno set, on failure. Note that failure detection
2782 : * is commonly left to the following call of ReadDir or ReadDirExtended;
2783 : * see the comments for ReadDir.
2784 : *
2785 : * Ideally this should be the *only* direct call of opendir() in the backend.
2786 : */
2787 : DIR *
2788 55548 : AllocateDir(const char *dirname)
2789 : {
2790 : DIR *dir;
2791 :
2792 : DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2793 : numAllocatedDescs, dirname));
2794 :
2795 : /* Can we allocate another non-virtual FD? */
2796 55548 : if (!reserveAllocatedDesc())
2797 0 : ereport(ERROR,
2798 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2799 : errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2800 : maxAllocatedDescs, dirname)));
2801 :
2802 : /* Close excess kernel FDs. */
2803 55548 : ReleaseLruFiles();
2804 :
2805 55548 : TryAgain:
2806 55548 : if ((dir = opendir(dirname)) != NULL)
2807 : {
2808 54248 : AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2809 :
2810 54248 : desc->kind = AllocateDescDir;
2811 54248 : desc->desc.dir = dir;
2812 54248 : desc->create_subid = GetCurrentSubTransactionId();
2813 54248 : numAllocatedDescs++;
2814 54248 : return desc->desc.dir;
2815 : }
2816 :
2817 1300 : if (errno == EMFILE || errno == ENFILE)
2818 : {
2819 0 : int save_errno = errno;
2820 :
2821 0 : ereport(LOG,
2822 : (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2823 : errmsg("out of file descriptors: %m; release and retry")));
2824 0 : errno = 0;
2825 0 : if (ReleaseLruFile())
2826 0 : goto TryAgain;
2827 0 : errno = save_errno;
2828 : }
2829 :
2830 1300 : return NULL;
2831 : }
2832 :
2833 : /*
2834 : * Read a directory opened with AllocateDir, ereport'ing any error.
2835 : *
2836 : * This is easier to use than raw readdir() since it takes care of some
2837 : * otherwise rather tedious and error-prone manipulation of errno. Also,
2838 : * if you are happy with a generic error message for AllocateDir failure,
2839 : * you can just do
2840 : *
2841 : * dir = AllocateDir(path);
2842 : * while ((dirent = ReadDir(dir, path)) != NULL)
2843 : * process dirent;
2844 : * FreeDir(dir);
2845 : *
2846 : * since a NULL dir parameter is taken as indicating AllocateDir failed.
2847 : * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2848 : * use this shortcut.)
2849 : *
2850 : * The pathname passed to AllocateDir must be passed to this routine too,
2851 : * but it is only used for error reporting.
2852 : */
2853 : struct dirent *
2854 1684432 : ReadDir(DIR *dir, const char *dirname)
2855 : {
2856 1684432 : return ReadDirExtended(dir, dirname, ERROR);
2857 : }
2858 :
2859 : /*
2860 : * Alternate version of ReadDir that allows caller to specify the elevel
2861 : * for any error report (whether it's reporting an initial failure of
2862 : * AllocateDir or a subsequent directory read failure).
2863 : *
2864 : * If elevel < ERROR, returns NULL after any error. With the normal coding
2865 : * pattern, this will result in falling out of the loop immediately as
2866 : * though the directory contained no (more) entries.
2867 : */
2868 : struct dirent *
2869 3435462 : ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2870 : {
2871 : struct dirent *dent;
2872 :
2873 : /* Give a generic message for AllocateDir failure, if caller didn't */
2874 3435462 : if (dir == NULL)
2875 : {
2876 6 : ereport(elevel,
2877 : (errcode_for_file_access(),
2878 : errmsg("could not open directory \"%s\": %m",
2879 : dirname)));
2880 0 : return NULL;
2881 : }
2882 :
2883 3435456 : errno = 0;
2884 3435456 : if ((dent = readdir(dir)) != NULL)
2885 3393778 : return dent;
2886 :
2887 41678 : if (errno)
2888 0 : ereport(elevel,
2889 : (errcode_for_file_access(),
2890 : errmsg("could not read directory \"%s\": %m",
2891 : dirname)));
2892 41678 : return NULL;
2893 : }
2894 :
2895 : /*
2896 : * Close a directory opened with AllocateDir.
2897 : *
2898 : * Returns closedir's return value (with errno set if it's not 0).
2899 : * Note we do not check the return value --- it is up to the caller
2900 : * to handle close errors if wanted.
2901 : *
2902 : * Does nothing if dir == NULL; we assume that directory open failure was
2903 : * already reported if desired.
2904 : */
2905 : int
2906 54072 : FreeDir(DIR *dir)
2907 : {
2908 : int i;
2909 :
2910 : /* Nothing to do if AllocateDir failed */
2911 54072 : if (dir == NULL)
2912 0 : return 0;
2913 :
2914 : DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2915 :
2916 : /* Remove dir from list of allocated dirs, if it's present */
2917 54072 : for (i = numAllocatedDescs; --i >= 0;)
2918 : {
2919 54072 : AllocateDesc *desc = &allocatedDescs[i];
2920 :
2921 54072 : if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2922 54072 : return FreeDesc(desc);
2923 : }
2924 :
2925 : /* Only get here if someone passes us a dir not in allocatedDescs */
2926 0 : elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2927 :
2928 0 : return closedir(dir);
2929 : }
2930 :
2931 :
2932 : /*
2933 : * Close a pipe stream returned by OpenPipeStream.
2934 : */
2935 : int
2936 74 : ClosePipeStream(FILE *file)
2937 : {
2938 : int i;
2939 :
2940 : DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2941 :
2942 : /* Remove file from list of allocated files, if it's present */
2943 74 : for (i = numAllocatedDescs; --i >= 0;)
2944 : {
2945 74 : AllocateDesc *desc = &allocatedDescs[i];
2946 :
2947 74 : if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2948 74 : return FreeDesc(desc);
2949 : }
2950 :
2951 : /* Only get here if someone passes us a file not in allocatedDescs */
2952 0 : elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2953 :
2954 0 : return pclose(file);
2955 : }
2956 :
2957 : /*
2958 : * closeAllVfds
2959 : *
2960 : * Force all VFDs into the physically-closed state, so that the fewest
2961 : * possible number of kernel file descriptors are in use. There is no
2962 : * change in the logical state of the VFDs.
2963 : */
2964 : void
2965 54 : closeAllVfds(void)
2966 : {
2967 : Index i;
2968 :
2969 54 : if (SizeVfdCache > 0)
2970 : {
2971 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2972 1728 : for (i = 1; i < SizeVfdCache; i++)
2973 : {
2974 1674 : if (!FileIsNotOpen(i))
2975 160 : LruDelete(i);
2976 : }
2977 : }
2978 54 : }
2979 :
2980 :
2981 : /*
2982 : * SetTempTablespaces
2983 : *
2984 : * Define a list (actually an array) of OIDs of tablespaces to use for
2985 : * temporary files. This list will be used until end of transaction,
2986 : * unless this function is called again before then. It is caller's
2987 : * responsibility that the passed-in array has adequate lifespan (typically
2988 : * it'd be allocated in TopTransactionContext).
2989 : *
2990 : * Some entries of the array may be InvalidOid, indicating that the current
2991 : * database's default tablespace should be used.
2992 : */
2993 : void
2994 5110 : SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2995 : {
2996 : Assert(numSpaces >= 0);
2997 5110 : tempTableSpaces = tableSpaces;
2998 5110 : numTempTableSpaces = numSpaces;
2999 :
3000 : /*
3001 : * Select a random starting point in the list. This is to minimize
3002 : * conflicts between backends that are most likely sharing the same list
3003 : * of temp tablespaces. Note that if we create multiple temp files in the
3004 : * same transaction, we'll advance circularly through the list --- this
3005 : * ensures that large temporary sort files are nicely spread across all
3006 : * available tablespaces.
3007 : */
3008 5110 : if (numSpaces > 1)
3009 0 : nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state,
3010 0 : 0, numSpaces - 1);
3011 : else
3012 5110 : nextTempTableSpace = 0;
3013 5110 : }
3014 :
3015 : /*
3016 : * TempTablespacesAreSet
3017 : *
3018 : * Returns true if SetTempTablespaces has been called in current transaction.
3019 : * (This is just so that tablespaces.c doesn't need its own per-transaction
3020 : * state.)
3021 : */
3022 : bool
3023 8106 : TempTablespacesAreSet(void)
3024 : {
3025 8106 : return (numTempTableSpaces >= 0);
3026 : }
3027 :
3028 : /*
3029 : * GetTempTablespaces
3030 : *
3031 : * Populate an array with the OIDs of the tablespaces that should be used for
3032 : * temporary files. (Some entries may be InvalidOid, indicating that the
3033 : * current database's default tablespace should be used.) At most numSpaces
3034 : * entries will be filled.
3035 : * Returns the number of OIDs that were copied into the output array.
3036 : */
3037 : int
3038 354 : GetTempTablespaces(Oid *tableSpaces, int numSpaces)
3039 : {
3040 : int i;
3041 :
3042 : Assert(TempTablespacesAreSet());
3043 354 : for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3044 0 : tableSpaces[i] = tempTableSpaces[i];
3045 :
3046 354 : return i;
3047 : }
3048 :
3049 : /*
3050 : * GetNextTempTableSpace
3051 : *
3052 : * Select the next temp tablespace to use. A result of InvalidOid means
3053 : * to use the current database's default tablespace.
3054 : */
3055 : Oid
3056 3826 : GetNextTempTableSpace(void)
3057 : {
3058 3826 : if (numTempTableSpaces > 0)
3059 : {
3060 : /* Advance nextTempTableSpace counter with wraparound */
3061 2 : if (++nextTempTableSpace >= numTempTableSpaces)
3062 2 : nextTempTableSpace = 0;
3063 2 : return tempTableSpaces[nextTempTableSpace];
3064 : }
3065 3824 : return InvalidOid;
3066 : }
3067 :
3068 :
3069 : /*
3070 : * AtEOSubXact_Files
3071 : *
3072 : * Take care of subtransaction commit/abort. At abort, we close temp files
3073 : * that the subtransaction may have opened. At commit, we reassign the
3074 : * files that were opened to the parent subtransaction.
3075 : */
3076 : void
3077 17608 : AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
3078 : SubTransactionId parentSubid)
3079 : {
3080 : Index i;
3081 :
3082 17608 : for (i = 0; i < numAllocatedDescs; i++)
3083 : {
3084 0 : if (allocatedDescs[i].create_subid == mySubid)
3085 : {
3086 0 : if (isCommit)
3087 0 : allocatedDescs[i].create_subid = parentSubid;
3088 : else
3089 : {
3090 : /* have to recheck the item after FreeDesc (ugly) */
3091 0 : FreeDesc(&allocatedDescs[i--]);
3092 : }
3093 : }
3094 : }
3095 17608 : }
3096 :
3097 : /*
3098 : * AtEOXact_Files
3099 : *
3100 : * This routine is called during transaction commit or abort. All still-open
3101 : * per-transaction temporary file VFDs are closed, which also causes the
3102 : * underlying files to be deleted (although they should've been closed already
3103 : * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3104 : * closed. We also forget any transaction-local temp tablespace list.
3105 : *
3106 : * The isCommit flag is used only to decide whether to emit warnings about
3107 : * unclosed files.
3108 : */
3109 : void
3110 498324 : AtEOXact_Files(bool isCommit)
3111 : {
3112 498324 : CleanupTempFiles(isCommit, false);
3113 498324 : tempTableSpaces = NULL;
3114 498324 : numTempTableSpaces = -1;
3115 498324 : }
3116 :
3117 : /*
3118 : * BeforeShmemExit_Files
3119 : *
3120 : * before_shmem_exit hook to clean up temp files during backend shutdown.
3121 : * Here, we want to clean up *all* temp files including interXact ones.
3122 : */
3123 : static void
3124 26920 : BeforeShmemExit_Files(int code, Datum arg)
3125 : {
3126 26920 : CleanupTempFiles(false, true);
3127 :
3128 : /* prevent further temp files from being created */
3129 : #ifdef USE_ASSERT_CHECKING
3130 : temporary_files_allowed = false;
3131 : #endif
3132 26920 : }
3133 :
3134 : /*
3135 : * Close temporary files and delete their underlying files.
3136 : *
3137 : * isCommit: if true, this is normal transaction commit, and we don't
3138 : * expect any remaining files; warn if there are some.
3139 : *
3140 : * isProcExit: if true, this is being called as the backend process is
3141 : * exiting. If that's the case, we should remove all temporary files; if
3142 : * that's not the case, we are being called for transaction commit/abort
3143 : * and should only remove transaction-local temp files. In either case,
3144 : * also clean up "allocated" stdio files, dirs and fds.
3145 : */
3146 : static void
3147 525244 : CleanupTempFiles(bool isCommit, bool isProcExit)
3148 : {
3149 : Index i;
3150 :
3151 : /*
3152 : * Careful here: at proc_exit we need extra cleanup, not just
3153 : * xact_temporary files.
3154 : */
3155 525244 : if (isProcExit || have_xact_temporary_files)
3156 : {
3157 : Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3158 1326390 : for (i = 1; i < SizeVfdCache; i++)
3159 : {
3160 1298040 : unsigned short fdstate = VfdCache[i].fdstate;
3161 :
3162 1298040 : if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3163 10 : VfdCache[i].fileName != NULL)
3164 : {
3165 : /*
3166 : * If we're in the process of exiting a backend process, close
3167 : * all temporary files. Otherwise, only close temporary files
3168 : * local to the current transaction. They should be closed by
3169 : * the ResourceOwner mechanism already, so this is just a
3170 : * debugging cross-check.
3171 : */
3172 10 : if (isProcExit)
3173 10 : FileClose(i);
3174 0 : else if (fdstate & FD_CLOSE_AT_EOXACT)
3175 : {
3176 0 : elog(WARNING,
3177 : "temporary file %s not closed at end-of-transaction",
3178 : VfdCache[i].fileName);
3179 0 : FileClose(i);
3180 : }
3181 : }
3182 : }
3183 :
3184 28350 : have_xact_temporary_files = false;
3185 : }
3186 :
3187 : /* Complain if any allocated files remain open at commit. */
3188 525244 : if (isCommit && numAllocatedDescs > 0)
3189 0 : elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3190 : numAllocatedDescs);
3191 :
3192 : /* Clean up "allocated" stdio files, dirs and fds. */
3193 525440 : while (numAllocatedDescs > 0)
3194 196 : FreeDesc(&allocatedDescs[0]);
3195 525244 : }
3196 :
3197 :
3198 : /*
3199 : * Remove temporary and temporary relation files left over from a prior
3200 : * postmaster session
3201 : *
3202 : * This should be called during postmaster startup. It will forcibly
3203 : * remove any leftover files created by OpenTemporaryFile and any leftover
3204 : * temporary relation files created by mdcreate.
3205 : *
3206 : * During post-backend-crash restart cycle, this routine is called when
3207 : * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3208 : * queries are using temp files could result in useless storage usage that can
3209 : * only be reclaimed by a service restart. The argument against enabling it is
3210 : * that someone might want to examine the temporary files for debugging
3211 : * purposes. This does however mean that OpenTemporaryFile had better allow for
3212 : * collision with an existing temp file name.
3213 : *
3214 : * NOTE: this function and its subroutines generally report syscall failures
3215 : * with ereport(LOG) and keep going. Removing temp files is not so critical
3216 : * that we should fail to start the database when we can't do it.
3217 : */
3218 : void
3219 1236 : RemovePgTempFiles(void)
3220 : {
3221 : char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3222 : DIR *spc_dir;
3223 : struct dirent *spc_de;
3224 :
3225 : /*
3226 : * First process temp files in pg_default ($PGDATA/base)
3227 : */
3228 1236 : snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3229 1236 : RemovePgTempFilesInDir(temp_path, true, false);
3230 1236 : RemovePgTempRelationFiles("base");
3231 :
3232 : /*
3233 : * Cycle through temp directories for all non-default tablespaces.
3234 : */
3235 1236 : spc_dir = AllocateDir("pg_tblspc");
3236 :
3237 3826 : while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3238 : {
3239 2590 : if (strcmp(spc_de->d_name, ".") == 0 ||
3240 1354 : strcmp(spc_de->d_name, "..") == 0)
3241 2472 : continue;
3242 :
3243 118 : snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3244 118 : spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
3245 118 : RemovePgTempFilesInDir(temp_path, true, false);
3246 :
3247 118 : snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3248 118 : spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
3249 118 : RemovePgTempRelationFiles(temp_path);
3250 : }
3251 :
3252 1236 : FreeDir(spc_dir);
3253 :
3254 : /*
3255 : * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3256 : * DataDir as well. However, that is *not* cleaned here because doing so
3257 : * would create a race condition. It's done separately, earlier in
3258 : * postmaster startup.
3259 : */
3260 1236 : }
3261 :
3262 : /*
3263 : * Process one pgsql_tmp directory for RemovePgTempFiles.
3264 : *
3265 : * If missing_ok is true, it's all right for the named directory to not exist.
3266 : * Any other problem results in a LOG message. (missing_ok should be true at
3267 : * the top level, since pgsql_tmp directories are not created until needed.)
3268 : *
3269 : * At the top level, this should be called with unlink_all = false, so that
3270 : * only files matching the temporary name prefix will be unlinked. When
3271 : * recursing it will be called with unlink_all = true to unlink everything
3272 : * under a top-level temporary directory.
3273 : *
3274 : * (These two flags could be replaced by one, but it seems clearer to keep
3275 : * them separate.)
3276 : */
3277 : void
3278 1356 : RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3279 : {
3280 : DIR *temp_dir;
3281 : struct dirent *temp_de;
3282 : char rm_path[MAXPGPATH * 2];
3283 :
3284 1356 : temp_dir = AllocateDir(tmpdirname);
3285 :
3286 1356 : if (temp_dir == NULL && errno == ENOENT && missing_ok)
3287 1274 : return;
3288 :
3289 252 : while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3290 : {
3291 170 : if (strcmp(temp_de->d_name, ".") == 0 ||
3292 88 : strcmp(temp_de->d_name, "..") == 0)
3293 164 : continue;
3294 :
3295 6 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
3296 6 : tmpdirname, temp_de->d_name);
3297 :
3298 6 : if (unlink_all ||
3299 6 : strncmp(temp_de->d_name,
3300 : PG_TEMP_FILE_PREFIX,
3301 : strlen(PG_TEMP_FILE_PREFIX)) == 0)
3302 6 : {
3303 6 : PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
3304 :
3305 6 : if (type == PGFILETYPE_ERROR)
3306 0 : continue;
3307 6 : else if (type == PGFILETYPE_DIR)
3308 : {
3309 : /* recursively remove contents, then directory itself */
3310 2 : RemovePgTempFilesInDir(rm_path, false, true);
3311 :
3312 2 : if (rmdir(rm_path) < 0)
3313 0 : ereport(LOG,
3314 : (errcode_for_file_access(),
3315 : errmsg("could not remove directory \"%s\": %m",
3316 : rm_path)));
3317 : }
3318 : else
3319 : {
3320 4 : if (unlink(rm_path) < 0)
3321 0 : ereport(LOG,
3322 : (errcode_for_file_access(),
3323 : errmsg("could not remove file \"%s\": %m",
3324 : rm_path)));
3325 : }
3326 : }
3327 : else
3328 0 : ereport(LOG,
3329 : (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3330 : rm_path)));
3331 : }
3332 :
3333 82 : FreeDir(temp_dir);
3334 : }
3335 :
3336 : /* Process one tablespace directory, look for per-DB subdirectories */
3337 : static void
3338 1354 : RemovePgTempRelationFiles(const char *tsdirname)
3339 : {
3340 : DIR *ts_dir;
3341 : struct dirent *de;
3342 : char dbspace_path[MAXPGPATH * 2];
3343 :
3344 1354 : ts_dir = AllocateDir(tsdirname);
3345 :
3346 8366 : while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3347 : {
3348 : /*
3349 : * We're only interested in the per-database directories, which have
3350 : * numeric names. Note that this code will also (properly) ignore "."
3351 : * and "..".
3352 : */
3353 7012 : if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3354 2788 : continue;
3355 :
3356 4224 : snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3357 4224 : tsdirname, de->d_name);
3358 4224 : RemovePgTempRelationFilesInDbspace(dbspace_path);
3359 : }
3360 :
3361 1354 : FreeDir(ts_dir);
3362 1354 : }
3363 :
3364 : /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3365 : static void
3366 4224 : RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3367 : {
3368 : DIR *dbspace_dir;
3369 : struct dirent *de;
3370 : char rm_path[MAXPGPATH * 2];
3371 :
3372 4224 : dbspace_dir = AllocateDir(dbspacedirname);
3373 :
3374 1269116 : while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3375 : {
3376 1264892 : if (!looks_like_temp_rel_name(de->d_name))
3377 1264884 : continue;
3378 :
3379 8 : snprintf(rm_path, sizeof(rm_path), "%s/%s",
3380 8 : dbspacedirname, de->d_name);
3381 :
3382 8 : if (unlink(rm_path) < 0)
3383 0 : ereport(LOG,
3384 : (errcode_for_file_access(),
3385 : errmsg("could not remove file \"%s\": %m",
3386 : rm_path)));
3387 : }
3388 :
3389 4224 : FreeDir(dbspace_dir);
3390 4224 : }
3391 :
/*
 * looks_like_temp_rel_name --- does "name" have the form of a temporary
 * relation file name?
 *
 * The accepted shape is t<digits>_<digits>, optionally followed by
 * _<forkname>, optionally followed by .<digits> (a segment number), with
 * nothing after that.
 */
bool
looks_like_temp_rel_name(const char *name)
{
    const char *cur = name;
    const char *run_start;

    /* Leading "t" is mandatory. */
    if (*cur != 't')
        return false;
    cur++;

    /* At least one digit, then an underscore. */
    run_start = cur;
    while (isdigit((unsigned char) *cur))
        cur++;
    if (cur == run_start || *cur != '_')
        return false;
    cur++;

    /* A second run of at least one digit. */
    run_start = cur;
    while (isdigit((unsigned char) *cur))
        cur++;
    if (cur == run_start)
        return false;

    /* Optional _forkname part. */
    if (*cur == '_')
    {
        int         forkchar = forkname_chars(cur + 1, NULL);

        if (forkchar <= 0)
            return false;
        cur += forkchar + 1;    /* skip the underscore and the fork name */
    }

    /* Optional .segment part; the dot needs at least one digit after it. */
    if (*cur == '.')
    {
        const char *seg = cur + 1;

        while (isdigit((unsigned char) *seg))
            seg++;
        if (seg == cur + 1)
            return false;
        cur = seg;
    }

    /* Anything left over means the name does not match. */
    return *cur == '\0';
}
3440 :
#ifdef HAVE_SYNCFS
/*
 * do_syncfs --- open "path" and call syncfs() on the resulting descriptor.
 *
 * Failure to open the path or to sync is reported at LOG level; nothing is
 * returned to the caller.
 */
static void
do_syncfs(const char *path)
{
    int         fd;

    ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
                             path);

    fd = OpenTransientFile(path, O_RDONLY);
    if (fd >= 0)
    {
        if (syncfs(fd) < 0)
            ereport(LOG,
                    (errcode_for_file_access(),
                     errmsg("could not synchronize file system for file \"%s\": %m", path)));
        CloseTransientFile(fd);
    }
    else
        ereport(LOG,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", path)));
}
#endif
3465 :
3466 : /*
3467 : * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3468 : * all potential filesystem, depending on recovery_init_sync_method setting.
3469 : *
3470 : * We fsync regular files and directories wherever they are, but we
3471 : * follow symlinks only for pg_wal and immediately under pg_tblspc.
3472 : * Other symlinks are presumed to point at files we're not responsible
3473 : * for fsyncing, and might not have privileges to write at all.
3474 : *
3475 : * Errors are logged but not considered fatal; that's because this is used
3476 : * only during database startup, to deal with the possibility that there are
3477 : * issued-but-unsynced writes pending against the data directory. We want to
3478 : * ensure that such writes reach disk before anything that's done in the new
3479 : * run. However, aborting on error would result in failure to start for
3480 : * harmless cases such as read-only files in the data directory, and that's
3481 : * not good either.
3482 : *
3483 : * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3484 : * rewriting all changes again during recovery.
3485 : *
3486 : * Note we assume we're chdir'd into PGDATA to begin with.
3487 : */
3488 : void
3489 300 : SyncDataDirectory(void)
3490 : {
3491 : bool xlog_is_symlink;
3492 :
3493 : /* We can skip this whole thing if fsync is disabled. */
3494 300 : if (!enableFsync)
3495 300 : return;
3496 :
3497 : /*
3498 : * If pg_wal is a symlink, we'll need to recurse into it separately,
3499 : * because the first walkdir below will ignore it.
3500 : */
3501 0 : xlog_is_symlink = false;
3502 :
3503 : {
3504 : struct stat st;
3505 :
3506 0 : if (lstat("pg_wal", &st) < 0)
3507 0 : ereport(LOG,
3508 : (errcode_for_file_access(),
3509 : errmsg("could not stat file \"%s\": %m",
3510 : "pg_wal")));
3511 0 : else if (S_ISLNK(st.st_mode))
3512 0 : xlog_is_symlink = true;
3513 : }
3514 :
3515 : #ifdef HAVE_SYNCFS
3516 0 : if (recovery_init_sync_method == DATA_DIR_SYNC_METHOD_SYNCFS)
3517 : {
3518 : DIR *dir;
3519 : struct dirent *de;
3520 :
3521 : /*
3522 : * On Linux, we don't have to open every single file one by one. We
3523 : * can use syncfs() to sync whole filesystems. We only expect
3524 : * filesystem boundaries to exist where we tolerate symlinks, namely
3525 : * pg_wal and the tablespaces, so we call syncfs() for each of those
3526 : * directories.
3527 : */
3528 :
3529 : /* Prepare to report progress syncing the data directory via syncfs. */
3530 0 : begin_startup_progress_phase();
3531 :
3532 : /* Sync the top level pgdata directory. */
3533 0 : do_syncfs(".");
3534 : /* If any tablespaces are configured, sync each of those. */
3535 0 : dir = AllocateDir("pg_tblspc");
3536 0 : while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
3537 : {
3538 : char path[MAXPGPATH];
3539 :
3540 0 : if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3541 0 : continue;
3542 :
3543 0 : snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
3544 0 : do_syncfs(path);
3545 : }
3546 0 : FreeDir(dir);
3547 : /* If pg_wal is a symlink, process that too. */
3548 0 : if (xlog_is_symlink)
3549 0 : do_syncfs("pg_wal");
3550 0 : return;
3551 : }
3552 : #endif /* !HAVE_SYNCFS */
3553 :
3554 : #ifdef PG_FLUSH_DATA_WORKS
3555 : /* Prepare to report progress of the pre-fsync phase. */
3556 0 : begin_startup_progress_phase();
3557 :
3558 : /*
3559 : * If possible, hint to the kernel that we're soon going to fsync the data
3560 : * directory and its contents. Errors in this step are even less
3561 : * interesting than normal, so log them only at DEBUG1.
3562 : */
3563 0 : walkdir(".", pre_sync_fname, false, DEBUG1);
3564 0 : if (xlog_is_symlink)
3565 0 : walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3566 0 : walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3567 : #endif
3568 :
3569 : /* Prepare to report progress syncing the data directory via fsync. */
3570 0 : begin_startup_progress_phase();
3571 :
3572 : /*
3573 : * Now we do the fsync()s in the same order.
3574 : *
3575 : * The main call ignores symlinks, so in addition to specially processing
3576 : * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3577 : * process_symlinks = true. Note that if there are any plain directories
3578 : * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3579 : * so we don't worry about optimizing it.
3580 : */
3581 0 : walkdir(".", datadir_fsync_fname, false, LOG);
3582 0 : if (xlog_is_symlink)
3583 0 : walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3584 0 : walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3585 : }
3586 :
3587 : /*
3588 : * walkdir: recursively walk a directory, applying the action to each
3589 : * regular file and directory (including the named directory itself).
3590 : *
3591 : * If process_symlinks is true, the action and recursion are also applied
3592 : * to regular files and directories that are pointed to by symlinks in the
3593 : * given directory; otherwise symlinks are ignored. Symlinks are always
3594 : * ignored in subdirectories, ie we intentionally don't pass down the
3595 : * process_symlinks flag to recursive calls.
3596 : *
3597 : * Errors are reported at level elevel, which might be ERROR or less.
3598 : *
3599 : * See also walkdir in file_utils.c, which is a frontend version of this
3600 : * logic.
3601 : */
3602 : static void
3603 324 : walkdir(const char *path,
3604 : void (*action) (const char *fname, bool isdir, int elevel),
3605 : bool process_symlinks,
3606 : int elevel)
3607 : {
3608 : DIR *dir;
3609 : struct dirent *de;
3610 :
3611 324 : dir = AllocateDir(path);
3612 :
3613 3296 : while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3614 : {
3615 : char subpath[MAXPGPATH * 2];
3616 :
3617 2972 : CHECK_FOR_INTERRUPTS();
3618 :
3619 2972 : if (strcmp(de->d_name, ".") == 0 ||
3620 2648 : strcmp(de->d_name, "..") == 0)
3621 648 : continue;
3622 :
3623 2324 : snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3624 :
3625 2324 : switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3626 : {
3627 2324 : case PGFILETYPE_REG:
3628 2324 : (*action) (subpath, false, elevel);
3629 2324 : break;
3630 0 : case PGFILETYPE_DIR:
3631 0 : walkdir(subpath, action, false, elevel);
3632 0 : break;
3633 0 : default:
3634 :
3635 : /*
3636 : * Errors are already reported directly by get_dirent_type(),
3637 : * and any remaining symlinks and unknown file types are
3638 : * ignored.
3639 : */
3640 0 : break;
3641 : }
3642 : }
3643 :
3644 324 : FreeDir(dir); /* we ignore any error here */
3645 :
3646 : /*
3647 : * It's important to fsync the destination directory itself as individual
3648 : * file fsyncs don't guarantee that the directory entry for the file is
3649 : * synced. However, skip this if AllocateDir failed; the action function
3650 : * might not be robust against that.
3651 : */
3652 324 : if (dir)
3653 324 : (*action) (path, true, elevel);
3654 324 : }
3655 :
3656 :
/*
 * pre_sync_fname --- hint to the OS that it should get ready to fsync()
 * this file.
 *
 * Directories are skipped.  Failure to open an unreadable file (EACCES) is
 * ignored; other errors are logged at the caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
    int         fd;

    /* Flushing a directory would likely just fail, so don't try */
    if (isdir)
        return;

    ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
                             fname);

    fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

    if (fd < 0)
    {
        /* permission problems are silently tolerated */
        if (errno != EACCES)
            ereport(elevel,
                    (errcode_for_file_access(),
                     errmsg("could not open file \"%s\": %m", fname)));
        return;
    }

    /*
     * pg_flush_data() ignores errors, which is ok because this is only a
     * hint.
     */
    pg_flush_data(fd, 0, 0);

    if (CloseTransientFile(fd) != 0)
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not close file \"%s\": %m", fname)));
}

#endif                          /* PG_FLUSH_DATA_WORKS */
3702 :
/*
 * datadir_fsync_fname --- walkdir action that fsyncs one file or directory
 * of the data directory, after reporting startup progress for it.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
    ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
                             fname);

    /*
     * Errors about unreadable files should be silently ignored, so ask
     * fsync_fname_ext() for that by passing ignore_perm = true.
     */
    (void) fsync_fname_ext(fname, isdir, true, elevel);
}
3715 :
/*
 * unlink_if_exists_fname --- walkdir action that removes one file or
 * directory.
 *
 * For a directory, rmdir is used; a failure other than ENOENT is reported
 * at elevel.  For anything else, removal is delegated to
 * PathNameDeleteTemporaryFile.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
    if (!isdir)
    {
        /* Use PathNameDeleteTemporaryFile to report filesize */
        PathNameDeleteTemporaryFile(fname, false);
        return;
    }

    /* an already-missing directory is not an error */
    if (rmdir(fname) != 0 && errno != ENOENT)
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not remove directory \"%s\": %m", fname)));
}
3732 :
3733 : /*
3734 : * fsync_fname_ext -- Try to fsync a file or directory
3735 : *
3736 : * If ignore_perm is true, ignore errors upon trying to open unreadable
3737 : * files. Logs other errors at a caller-specified level.
3738 : *
3739 : * Returns 0 if the operation succeeded, -1 otherwise.
3740 : */
3741 : int
3742 37374 : fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3743 : {
3744 : int fd;
3745 : int flags;
3746 : int returncode;
3747 :
3748 : /*
3749 : * Some OSs require directories to be opened read-only whereas other
3750 : * systems don't allow us to fsync files opened read-only; so we need both
3751 : * cases here. Using O_RDWR will cause us to fail to fsync files that are
3752 : * not writable by our userid, but we assume that's OK.
3753 : */
3754 37374 : flags = PG_BINARY;
3755 37374 : if (!isdir)
3756 12576 : flags |= O_RDWR;
3757 : else
3758 24798 : flags |= O_RDONLY;
3759 :
3760 37374 : fd = OpenTransientFile(fname, flags);
3761 :
3762 : /*
3763 : * Some OSs don't allow us to open directories at all (Windows returns
3764 : * EACCES), just ignore the error in that case. If desired also silently
3765 : * ignoring errors about unreadable files. Log others.
3766 : */
3767 37374 : if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3768 0 : return 0;
3769 37374 : else if (fd < 0 && ignore_perm && errno == EACCES)
3770 0 : return 0;
3771 37374 : else if (fd < 0)
3772 : {
3773 0 : ereport(elevel,
3774 : (errcode_for_file_access(),
3775 : errmsg("could not open file \"%s\": %m", fname)));
3776 0 : return -1;
3777 : }
3778 :
3779 37374 : returncode = pg_fsync(fd);
3780 :
3781 : /*
3782 : * Some OSes don't allow us to fsync directories at all, so we can ignore
3783 : * those errors. Anything else needs to be logged.
3784 : */
3785 37374 : if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3786 : {
3787 : int save_errno;
3788 :
3789 : /* close file upon error, might not be in transaction context */
3790 0 : save_errno = errno;
3791 0 : (void) CloseTransientFile(fd);
3792 0 : errno = save_errno;
3793 :
3794 0 : ereport(elevel,
3795 : (errcode_for_file_access(),
3796 : errmsg("could not fsync file \"%s\": %m", fname)));
3797 0 : return -1;
3798 : }
3799 :
3800 37374 : if (CloseTransientFile(fd) != 0)
3801 : {
3802 0 : ereport(elevel,
3803 : (errcode_for_file_access(),
3804 : errmsg("could not close file \"%s\": %m", fname)));
3805 0 : return -1;
3806 : }
3807 :
3808 37374 : return 0;
3809 : }
3810 :
3811 : /*
3812 : * fsync_parent_path -- fsync the parent path of a file or directory
3813 : *
3814 : * This is aimed at making file operations persistent on disk in case of
3815 : * an OS crash or power failure.
3816 : */
3817 : static int
3818 4690 : fsync_parent_path(const char *fname, int elevel)
3819 : {
3820 : char parentpath[MAXPGPATH];
3821 :
3822 4690 : strlcpy(parentpath, fname, MAXPGPATH);
3823 4690 : get_parent_directory(parentpath);
3824 :
3825 : /*
3826 : * get_parent_directory() returns an empty string if the input argument is
3827 : * just a file name (see comments in path.c), so handle that as being the
3828 : * current directory.
3829 : */
3830 4690 : if (strlen(parentpath) == 0)
3831 302 : strlcpy(parentpath, ".", MAXPGPATH);
3832 :
3833 4690 : if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3834 0 : return -1;
3835 :
3836 4690 : return 0;
3837 : }
3838 :
3839 : /*
3840 : * Create a PostgreSQL data sub-directory
3841 : *
3842 : * The data directory itself, and most of its sub-directories, are created at
3843 : * initdb time, but we do have some occasions when we create directories in
3844 : * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3845 : * make sure that those directories are created consistently. Today, that means
3846 : * making sure that the created directory has the correct permissions, which is
3847 : * what pg_dir_create_mode tracks for us.
3848 : *
3849 : * Note that we also set the umask() based on what we understand the correct
3850 : * permissions to be (see file_perm.c).
3851 : *
3852 : * For permissions other than the default, mkdir() can be used directly, but
3853 : * be sure to consider carefully such cases -- a sub-directory with incorrect
3854 : * permissions in a PostgreSQL data directory could cause backups and other
3855 : * processes to fail.
3856 : */
3857 : int
3858 2236 : MakePGDirectory(const char *directoryName)
3859 : {
3860 2236 : return mkdir(directoryName, pg_dir_create_mode);
3861 : }
3862 :
3863 : /*
3864 : * Return the passed-in error level, or PANIC if data_sync_retry is off.
3865 : *
3866 : * Failure to fsync any data file is cause for immediate panic, unless
3867 : * data_sync_retry is enabled. Data may have been written to the operating
3868 : * system and removed from our buffer pool already, and if we are running on
3869 : * an operating system that forgets dirty data on write-back failure, there
3870 : * may be only one copy of the data remaining: in the WAL. A later attempt to
3871 : * fsync again might falsely report success. Therefore we must not allow any
3872 : * further checkpoints to be attempted. data_sync_retry can in theory be
3873 : * enabled on systems known not to drop dirty buffered data on write-back
3874 : * failure (with the likely outcome that checkpoints will continue to fail
3875 : * until the underlying problem is fixed).
3876 : *
3877 : * Any code that reports a failure from fsync() or related functions should
3878 : * filter the error level with this function.
3879 : */
3880 : int
3881 23692 : data_sync_elevel(int elevel)
3882 : {
3883 23692 : return data_sync_retry ? elevel : PANIC;
3884 : }
3885 :
3886 : bool
3887 1570 : check_debug_io_direct(char **newval, void **extra, GucSource source)
3888 : {
3889 1570 : bool result = true;
3890 : int flags;
3891 :
3892 : #if PG_O_DIRECT == 0
3893 : if (strcmp(*newval, "") != 0)
3894 : {
3895 : GUC_check_errdetail("debug_io_direct is not supported on this platform.");
3896 : result = false;
3897 : }
3898 : flags = 0;
3899 : #else
3900 : List *elemlist;
3901 : ListCell *l;
3902 : char *rawstring;
3903 :
3904 : /* Need a modifiable copy of string */
3905 1570 : rawstring = pstrdup(*newval);
3906 :
3907 1570 : if (!SplitGUCList(rawstring, ',', &elemlist))
3908 : {
3909 0 : GUC_check_errdetail("invalid list syntax in parameter \"%s\"",
3910 : "debug_io_direct");
3911 0 : pfree(rawstring);
3912 0 : list_free(elemlist);
3913 0 : return false;
3914 : }
3915 :
3916 1570 : flags = 0;
3917 1582 : foreach(l, elemlist)
3918 : {
3919 12 : char *item = (char *) lfirst(l);
3920 :
3921 12 : if (pg_strcasecmp(item, "data") == 0)
3922 4 : flags |= IO_DIRECT_DATA;
3923 8 : else if (pg_strcasecmp(item, "wal") == 0)
3924 4 : flags |= IO_DIRECT_WAL;
3925 4 : else if (pg_strcasecmp(item, "wal_init") == 0)
3926 4 : flags |= IO_DIRECT_WAL_INIT;
3927 : else
3928 : {
3929 0 : GUC_check_errdetail("invalid option \"%s\"", item);
3930 0 : result = false;
3931 0 : break;
3932 : }
3933 : }
3934 :
3935 : /*
3936 : * It's possible to configure block sizes smaller than our assumed I/O
3937 : * alignment size, which could result in invalid I/O requests.
3938 : */
3939 : #if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
3940 : if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
3941 : {
3942 : GUC_check_errdetail("debug_io_direct is not supported for WAL because XLOG_BLCKSZ is too small");
3943 : result = false;
3944 : }
3945 : #endif
3946 : #if BLCKSZ < PG_IO_ALIGN_SIZE
3947 : if (result && (flags & IO_DIRECT_DATA))
3948 : {
3949 : GUC_check_errdetail("debug_io_direct is not supported for data because BLCKSZ is too small");
3950 : result = false;
3951 : }
3952 : #endif
3953 :
3954 1570 : pfree(rawstring);
3955 1570 : list_free(elemlist);
3956 : #endif
3957 :
3958 1570 : if (!result)
3959 0 : return result;
3960 :
3961 : /* Save the flags in *extra, for use by assign_debug_io_direct */
3962 1570 : *extra = guc_malloc(ERROR, sizeof(int));
3963 1570 : *((int *) *extra) = flags;
3964 :
3965 1570 : return result;
3966 : }
3967 :
3968 : extern void
3969 1570 : assign_debug_io_direct(const char *newval, void *extra)
3970 : {
3971 1570 : int *flags = (int *) extra;
3972 :
3973 1570 : io_direct_flags = *flags;
3974 1570 : }
|