Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * dsm_impl.c
4 : * manage dynamic shared memory segments
5 : *
6 : * This file provides low-level APIs for creating and destroying shared
7 : * memory segments using several different possible techniques. We refer
8 : * to these segments as dynamic because they can be created, altered, and
9 : * destroyed at any point during the server life cycle. This is unlike
10 : * the main shared memory segment, of which there is always exactly one
11 : * and which is always mapped at a fixed address in every PostgreSQL
12 : * background process.
13 : *
14 : * Because not all systems provide the same primitives in this area, nor
15 : * do all primitives behave the same way on all systems, we provide
16 : * several implementations of this facility. Many systems implement
17 : * POSIX shared memory (shm_open etc.), which is well-suited to our needs
18 : * in this area, with the exception that shared memory identifiers live
19 : * in a flat system-wide namespace, raising the uncomfortable prospect of
20 : * name collisions with other processes (including other copies of
21 : * PostgreSQL) running on the same system. Some systems only support
22 : * the older System V shared memory interface (shmget etc.) which is
23 : * also usable; however, the default allocation limits are often quite
24 : * small, and the namespace is even more restricted.
25 : *
26 : * We also provide an mmap-based shared memory implementation. This may
27 : * be useful on systems that provide shared memory via a special-purpose
28 : * filesystem; by opting for this implementation, the user can even
29 : * control precisely where their shared memory segments are placed. It
30 : * can also be used as a fallback for systems where shm_open and shmget
31 : * are not available or can't be used for some reason. Of course,
32 : * mapping a file residing on an actual spinning disk is a fairly poor
33 : * approximation for shared memory because writeback may hurt performance
34 : * substantially, but there should be few systems where we must make do
35 : * with such poor tools.
36 : *
37 : * As ever, Windows requires its own implementation.
38 : *
39 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
40 : * Portions Copyright (c) 1994, Regents of the University of California
41 : *
42 : *
43 : * IDENTIFICATION
44 : * src/backend/storage/ipc/dsm_impl.c
45 : *
46 : *-------------------------------------------------------------------------
47 : */
48 :
49 : #include "postgres.h"
50 :
51 : #include <fcntl.h>
52 : #include <signal.h>
53 : #include <unistd.h>
54 : #ifndef WIN32
55 : #include <sys/mman.h>
56 : #include <sys/ipc.h>
57 : #include <sys/shm.h>
58 : #include <sys/stat.h>
59 : #endif
60 :
61 : #include "common/file_perm.h"
62 : #include "libpq/pqsignal.h"
63 : #include "miscadmin.h"
64 : #include "pgstat.h"
65 : #include "portability/mem.h"
66 : #include "postmaster/postmaster.h"
67 : #include "storage/dsm_impl.h"
68 : #include "storage/fd.h"
69 : #include "utils/guc.h"
70 : #include "utils/memutils.h"
71 :
/*
 * Forward declarations for the per-implementation workers defined later in
 * this file.  Each worker is declared only when the matching USE_DSM_*
 * symbol is defined, and all of them share the argument list of
 * dsm_impl_op().
 */
#ifdef USE_DSM_POSIX
static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
						   void **impl_private, void **mapped_address,
						   Size *mapped_size, int elevel);
/* Helper used by dsm_impl_posix() to size a newly created segment. */
static int	dsm_impl_posix_resize(int fd, off_t size);
#endif
#ifdef USE_DSM_SYSV
static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
						  void **impl_private, void **mapped_address,
						  Size *mapped_size, int elevel);
#endif
#ifdef USE_DSM_WINDOWS
static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
							 void **impl_private, void **mapped_address,
							 Size *mapped_size, int elevel);
#endif
#ifdef USE_DSM_MMAP
static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
						  void **impl_private, void **mapped_address,
						  Size *mapped_size, int elevel);
#endif
/* Chooses the error code reported alongside a failed DSM operation. */
static int	errcode_for_dynamic_shared_memory(void);
94 :
/*
 * Table pairing each implementation's name string with its DSM_IMPL_*
 * constant.  Only implementations compiled into this build (per the
 * USE_DSM_* symbols) are listed; the all-NULL/zero entry ends the table.
 */
const struct config_enum_entry dynamic_shared_memory_options[] = {
#ifdef USE_DSM_POSIX
	{"posix", DSM_IMPL_POSIX, false},
#endif
#ifdef USE_DSM_SYSV
	{"sysv", DSM_IMPL_SYSV, false},
#endif
#ifdef USE_DSM_WINDOWS
	{"windows", DSM_IMPL_WINDOWS, false},
#endif
#ifdef USE_DSM_MMAP
	{"mmap", DSM_IMPL_MMAP, false},
#endif
	{NULL, 0, false}
};
110 :
/*
 * Implementation selector: one of the DSM_IMPL_* values, consulted by
 * dsm_impl_op() and the pin/unpin routines to pick the code path.
 */
int			dynamic_shared_memory_type = DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE;

/* Amount of space reserved for DSM segments in the main area. */
int			min_dynamic_shared_memory;

/*
 * Size, in bytes, of the zero-filled buffer that dsm_impl_mmap() writes
 * repeatedly when it creates a segment file.
 */
#define ZBUFFER_SIZE				8192

/* Prefix of the file mapping object names used by the Windows code paths. */
#define SEGMENT_NAME_PREFIX			"Global/PostgreSQL"
121 :
122 : /*------
123 : * Perform a low-level shared memory operation in a platform-specific way,
124 : * as dictated by the selected implementation. Each implementation is
125 : * required to implement the following primitives.
126 : *
127 : * DSM_OP_CREATE. Create a segment whose size is the request_size and
128 : * map it.
129 : *
130 : * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
131 : *
132 : * DSM_OP_DETACH. Unmap the segment.
133 : *
134 : * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
135 : * segment.
136 : *
137 : * Arguments:
138 : * op: The operation to be performed.
139 : * handle: The handle of an existing object, or for DSM_OP_CREATE, the
140 : * identifier for the new handle the caller wants created.
141 : * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0.
142 : * impl_private: Private, implementation-specific data. Will be a pointer
143 : * to NULL for the first operation on a shared memory segment within this
144 : * backend; thereafter, it will point to the value to which it was set
145 : * on the previous call.
146 : * mapped_address: Pointer to start of current mapping; pointer to NULL
147 : * if none. Updated with new mapping address.
148 : * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
149 : * Updated with new mapped size.
150 : * elevel: Level at which to log errors.
151 : *
152 : * Return value: true on success, false on failure. When false is returned,
153 : * a message should first be logged at the specified elevel, except in the
154 : * case where DSM_OP_CREATE experiences a name collision, which should
155 : * silently return false.
156 : *-----
157 : */
158 : bool
159 93898 : dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
160 : void **impl_private, void **mapped_address, Size *mapped_size,
161 : int elevel)
162 : {
163 : Assert(op == DSM_OP_CREATE || request_size == 0);
164 : Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
165 : (*mapped_address == NULL && *mapped_size == 0));
166 :
167 93898 : switch (dynamic_shared_memory_type)
168 : {
169 : #ifdef USE_DSM_POSIX
170 93898 : case DSM_IMPL_POSIX:
171 93898 : return dsm_impl_posix(op, handle, request_size, impl_private,
172 : mapped_address, mapped_size, elevel);
173 : #endif
174 : #ifdef USE_DSM_SYSV
175 0 : case DSM_IMPL_SYSV:
176 0 : return dsm_impl_sysv(op, handle, request_size, impl_private,
177 : mapped_address, mapped_size, elevel);
178 : #endif
179 : #ifdef USE_DSM_WINDOWS
180 : case DSM_IMPL_WINDOWS:
181 : return dsm_impl_windows(op, handle, request_size, impl_private,
182 : mapped_address, mapped_size, elevel);
183 : #endif
184 : #ifdef USE_DSM_MMAP
185 0 : case DSM_IMPL_MMAP:
186 0 : return dsm_impl_mmap(op, handle, request_size, impl_private,
187 : mapped_address, mapped_size, elevel);
188 : #endif
189 0 : default:
190 0 : elog(ERROR, "unexpected dynamic shared memory type: %d",
191 : dynamic_shared_memory_type);
192 : return false;
193 : }
194 : }
195 :
196 : #ifdef USE_DSM_POSIX
197 : /*
198 : * Operating system primitives to support POSIX shared memory.
199 : *
200 : * POSIX shared memory segments are created and attached using shm_open()
201 : * and shm_unlink(); other operations, such as sizing or mapping the
202 : * segment, are performed as if the shared memory segments were files.
203 : *
204 : * Indeed, on some platforms, they may be implemented that way. While
205 : * POSIX shared memory segments seem intended to exist in a flat namespace,
206 : * some operating systems may implement them as files, even going so far
207 : * to treat a request for /xyz as a request to create a file by that name
208 : * in the root directory. Users of such broken platforms should select
209 : * a different shared memory implementation.
210 : */
211 : static bool
212 93898 : dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
213 : void **impl_private, void **mapped_address, Size *mapped_size,
214 : int elevel)
215 : {
216 : char name[64];
217 : int flags;
218 : int fd;
219 : char *address;
220 :
221 93898 : snprintf(name, 64, "/PostgreSQL.%u", handle);
222 :
223 : /* Handle teardown cases. */
224 93898 : if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
225 : {
226 48562 : if (*mapped_address != NULL
227 45332 : && munmap(*mapped_address, *mapped_size) != 0)
228 : {
229 0 : ereport(elevel,
230 : (errcode_for_dynamic_shared_memory(),
231 : errmsg("could not unmap shared memory segment \"%s\": %m",
232 : name)));
233 0 : return false;
234 : }
235 48562 : *mapped_address = NULL;
236 48562 : *mapped_size = 0;
237 48562 : if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
238 : {
239 0 : ereport(elevel,
240 : (errcode_for_dynamic_shared_memory(),
241 : errmsg("could not remove shared memory segment \"%s\": %m",
242 : name)));
243 0 : return false;
244 : }
245 48562 : return true;
246 : }
247 :
248 : /*
249 : * Create new segment or open an existing one for attach.
250 : *
251 : * Even though we will close the FD before returning, it seems desirable
252 : * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
253 : * failure. The fact that we won't hold the FD open long justifies using
254 : * ReserveExternalFD rather than AcquireExternalFD, though.
255 : */
256 45336 : ReserveExternalFD();
257 :
258 45336 : flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
259 45336 : if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
260 : {
261 0 : ReleaseExternalFD();
262 0 : if (op == DSM_OP_ATTACH || errno != EEXIST)
263 0 : ereport(elevel,
264 : (errcode_for_dynamic_shared_memory(),
265 : errmsg("could not open shared memory segment \"%s\": %m",
266 : name)));
267 0 : return false;
268 : }
269 :
270 : /*
271 : * If we're attaching the segment, determine the current size; if we are
272 : * creating the segment, set the size to the requested value.
273 : */
274 45336 : if (op == DSM_OP_ATTACH)
275 : {
276 : struct stat st;
277 :
278 40424 : if (fstat(fd, &st) != 0)
279 : {
280 : int save_errno;
281 :
282 : /* Back out what's already been done. */
283 0 : save_errno = errno;
284 0 : close(fd);
285 0 : ReleaseExternalFD();
286 0 : errno = save_errno;
287 :
288 0 : ereport(elevel,
289 : (errcode_for_dynamic_shared_memory(),
290 : errmsg("could not stat shared memory segment \"%s\": %m",
291 : name)));
292 0 : return false;
293 : }
294 40424 : request_size = st.st_size;
295 : }
296 4912 : else if (dsm_impl_posix_resize(fd, request_size) != 0)
297 : {
298 : int save_errno;
299 :
300 : /* Back out what's already been done. */
301 0 : save_errno = errno;
302 0 : close(fd);
303 0 : ReleaseExternalFD();
304 0 : shm_unlink(name);
305 0 : errno = save_errno;
306 :
307 0 : ereport(elevel,
308 : (errcode_for_dynamic_shared_memory(),
309 : errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
310 : name, request_size)));
311 0 : return false;
312 : }
313 :
314 : /* Map it. */
315 45336 : address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
316 : MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
317 45336 : if (address == MAP_FAILED)
318 : {
319 : int save_errno;
320 :
321 : /* Back out what's already been done. */
322 0 : save_errno = errno;
323 0 : close(fd);
324 0 : ReleaseExternalFD();
325 0 : if (op == DSM_OP_CREATE)
326 0 : shm_unlink(name);
327 0 : errno = save_errno;
328 :
329 0 : ereport(elevel,
330 : (errcode_for_dynamic_shared_memory(),
331 : errmsg("could not map shared memory segment \"%s\": %m",
332 : name)));
333 0 : return false;
334 : }
335 45336 : *mapped_address = address;
336 45336 : *mapped_size = request_size;
337 45336 : close(fd);
338 45336 : ReleaseExternalFD();
339 :
340 45336 : return true;
341 : }
342 :
343 : /*
344 : * Set the size of a virtual memory region associated with a file descriptor.
345 : * If necessary, also ensure that virtual memory is actually allocated by the
346 : * operating system, to avoid nasty surprises later.
347 : *
348 : * Returns non-zero if either truncation or allocation fails, and sets errno.
349 : */
350 : static int
351 4912 : dsm_impl_posix_resize(int fd, off_t size)
352 : {
353 : int rc;
354 : int save_errno;
355 : sigset_t save_sigmask;
356 :
357 : /*
358 : * Block all blockable signals, except SIGQUIT. posix_fallocate() can run
359 : * for quite a long time, and is an all-or-nothing operation. If we
360 : * allowed SIGUSR1 to interrupt us repeatedly (for example, due to
361 : * recovery conflicts), the retry loop might never succeed.
362 : */
363 4912 : if (IsUnderPostmaster)
364 2796 : sigprocmask(SIG_SETMASK, &BlockSig, &save_sigmask);
365 :
366 4912 : pgstat_report_wait_start(WAIT_EVENT_DSM_ALLOCATE);
367 : #if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
368 :
369 : /*
370 : * On Linux, a shm_open fd is backed by a tmpfs file. If we were to use
371 : * ftruncate, the file would contain a hole. Accessing memory backed by a
372 : * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
373 : * is no more tmpfs space available. So we ask tmpfs to allocate pages
374 : * here, so we can fail gracefully with ENOSPC now rather than risking
375 : * SIGBUS later.
376 : *
377 : * We still use a traditional EINTR retry loop to handle SIGCONT.
378 : * posix_fallocate() doesn't restart automatically, and we don't want this
379 : * to fail if you attach a debugger.
380 : */
381 : do
382 : {
383 4912 : rc = posix_fallocate(fd, 0, size);
384 4912 : } while (rc == EINTR);
385 :
386 : /*
387 : * The caller expects errno to be set, but posix_fallocate() doesn't set
388 : * it. Instead it returns error numbers directly. So set errno, even
389 : * though we'll also return rc to indicate success or failure.
390 : */
391 4912 : errno = rc;
392 : #else
393 : /* Extend the file to the requested size. */
394 : do
395 : {
396 : rc = ftruncate(fd, size);
397 : } while (rc < 0 && errno == EINTR);
398 : #endif
399 4912 : pgstat_report_wait_end();
400 :
401 4912 : if (IsUnderPostmaster)
402 : {
403 2796 : save_errno = errno;
404 2796 : sigprocmask(SIG_SETMASK, &save_sigmask, NULL);
405 2796 : errno = save_errno;
406 : }
407 :
408 4912 : return rc;
409 : }
410 :
411 : #endif /* USE_DSM_POSIX */
412 :
413 : #ifdef USE_DSM_SYSV
414 : /*
415 : * Operating system primitives to support System V shared memory.
416 : *
417 : * System V shared memory segments are manipulated using shmget(), shmat(),
418 : * shmdt(), and shmctl(). As the default allocation limits for System V
419 : * shared memory are usually quite low, the POSIX facilities may be
420 : * preferable; but those are not supported everywhere.
421 : */
422 : static bool
423 0 : dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
424 : void **impl_private, void **mapped_address, Size *mapped_size,
425 : int elevel)
426 : {
427 : key_t key;
428 : int ident;
429 : char *address;
430 : char name[64];
431 : int *ident_cache;
432 :
433 : /*
434 : * POSIX shared memory and mmap-based shared memory identify segments with
435 : * names. To avoid needless error message variation, we use the handle as
436 : * the name.
437 : */
438 0 : snprintf(name, 64, "%u", handle);
439 :
440 : /*
441 : * The System V shared memory namespace is very restricted; names are of
442 : * type key_t, which is expected to be some sort of integer data type, but
443 : * not necessarily the same one as dsm_handle. Since we use dsm_handle to
444 : * identify shared memory segments across processes, this might seem like
445 : * a problem, but it's really not. If dsm_handle is bigger than key_t,
446 : * the cast below might truncate away some bits from the handle the
447 : * user-provided, but it'll truncate exactly the same bits away in exactly
448 : * the same fashion every time we use that handle, which is all that
449 : * really matters. Conversely, if dsm_handle is smaller than key_t, we
450 : * won't use the full range of available key space, but that's no big deal
451 : * either.
452 : *
453 : * We do make sure that the key isn't negative, because that might not be
454 : * portable.
455 : */
456 0 : key = (key_t) handle;
457 0 : if (key < 1) /* avoid compiler warning if type is unsigned */
458 0 : key = -key;
459 :
460 : /*
461 : * There's one special key, IPC_PRIVATE, which can't be used. If we end
462 : * up with that value by chance during a create operation, just pretend it
463 : * already exists, so that caller will retry. If we run into it anywhere
464 : * else, the caller has passed a handle that doesn't correspond to
465 : * anything we ever created, which should not happen.
466 : */
467 0 : if (key == IPC_PRIVATE)
468 : {
469 0 : if (op != DSM_OP_CREATE)
470 0 : elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
471 0 : errno = EEXIST;
472 0 : return false;
473 : }
474 :
475 : /*
476 : * Before we can do anything with a shared memory segment, we have to map
477 : * the shared memory key to a shared memory identifier using shmget(). To
478 : * avoid repeated lookups, we store the key using impl_private.
479 : */
480 0 : if (*impl_private != NULL)
481 : {
482 0 : ident_cache = *impl_private;
483 0 : ident = *ident_cache;
484 : }
485 : else
486 : {
487 0 : int flags = IPCProtection;
488 : size_t segsize;
489 :
490 : /*
491 : * Allocate the memory BEFORE acquiring the resource, so that we don't
492 : * leak the resource if memory allocation fails.
493 : */
494 0 : ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));
495 :
496 : /*
497 : * When using shmget to find an existing segment, we must pass the
498 : * size as 0. Passing a non-zero size which is greater than the
499 : * actual size will result in EINVAL.
500 : */
501 0 : segsize = 0;
502 :
503 0 : if (op == DSM_OP_CREATE)
504 : {
505 0 : flags |= IPC_CREAT | IPC_EXCL;
506 0 : segsize = request_size;
507 : }
508 :
509 0 : if ((ident = shmget(key, segsize, flags)) == -1)
510 : {
511 0 : if (op == DSM_OP_ATTACH || errno != EEXIST)
512 : {
513 0 : int save_errno = errno;
514 :
515 0 : pfree(ident_cache);
516 0 : errno = save_errno;
517 0 : ereport(elevel,
518 : (errcode_for_dynamic_shared_memory(),
519 : errmsg("could not get shared memory segment: %m")));
520 : }
521 0 : return false;
522 : }
523 :
524 0 : *ident_cache = ident;
525 0 : *impl_private = ident_cache;
526 : }
527 :
528 : /* Handle teardown cases. */
529 0 : if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
530 : {
531 0 : pfree(ident_cache);
532 0 : *impl_private = NULL;
533 0 : if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
534 : {
535 0 : ereport(elevel,
536 : (errcode_for_dynamic_shared_memory(),
537 : errmsg("could not unmap shared memory segment \"%s\": %m",
538 : name)));
539 0 : return false;
540 : }
541 0 : *mapped_address = NULL;
542 0 : *mapped_size = 0;
543 0 : if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
544 : {
545 0 : ereport(elevel,
546 : (errcode_for_dynamic_shared_memory(),
547 : errmsg("could not remove shared memory segment \"%s\": %m",
548 : name)));
549 0 : return false;
550 : }
551 0 : return true;
552 : }
553 :
554 : /* If we're attaching it, we must use IPC_STAT to determine the size. */
555 0 : if (op == DSM_OP_ATTACH)
556 : {
557 : struct shmid_ds shm;
558 :
559 0 : if (shmctl(ident, IPC_STAT, &shm) != 0)
560 : {
561 0 : ereport(elevel,
562 : (errcode_for_dynamic_shared_memory(),
563 : errmsg("could not stat shared memory segment \"%s\": %m",
564 : name)));
565 0 : return false;
566 : }
567 0 : request_size = shm.shm_segsz;
568 : }
569 :
570 : /* Map it. */
571 0 : address = shmat(ident, NULL, PG_SHMAT_FLAGS);
572 0 : if (address == (void *) -1)
573 : {
574 : int save_errno;
575 :
576 : /* Back out what's already been done. */
577 0 : save_errno = errno;
578 0 : if (op == DSM_OP_CREATE)
579 0 : shmctl(ident, IPC_RMID, NULL);
580 0 : errno = save_errno;
581 :
582 0 : ereport(elevel,
583 : (errcode_for_dynamic_shared_memory(),
584 : errmsg("could not map shared memory segment \"%s\": %m",
585 : name)));
586 0 : return false;
587 : }
588 0 : *mapped_address = address;
589 0 : *mapped_size = request_size;
590 :
591 0 : return true;
592 : }
593 : #endif
594 :
595 : #ifdef USE_DSM_WINDOWS
596 : /*
597 : * Operating system primitives to support Windows shared memory.
598 : *
599 : * Windows shared memory implementation is done using file mapping
600 : * which can be backed by either physical file or system paging file.
601 : * Current implementation uses system paging file as other effects
602 : * like performance are not clear for physical file and it is used in similar
603 : * way for main shared memory in windows.
604 : *
605 : * A memory mapping object is a kernel object - they always get deleted when
606 : * the last reference to them goes away, either explicitly via a CloseHandle or
607 : * when the process containing the reference exits.
608 : */
609 : static bool
610 : dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
611 : void **impl_private, void **mapped_address,
612 : Size *mapped_size, int elevel)
613 : {
614 : char *address;
615 : HANDLE hmap;
616 : char name[64];
617 : MEMORY_BASIC_INFORMATION info;
618 :
619 : /*
620 : * Storing the shared memory segment in the Global\ namespace, can allow
621 : * any process running in any session to access that file mapping object
622 : * provided that the caller has the required access rights. But to avoid
623 : * issues faced in main shared memory, we are using the naming convention
624 : * similar to main shared memory. We can change here once issue mentioned
625 : * in GetSharedMemName is resolved.
626 : */
627 : snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
628 :
629 : /*
630 : * Handle teardown cases. Since Windows automatically destroys the object
631 : * when no references remain, we can treat it the same as detach.
632 : */
633 : if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
634 : {
635 : if (*mapped_address != NULL
636 : && UnmapViewOfFile(*mapped_address) == 0)
637 : {
638 : _dosmaperr(GetLastError());
639 : ereport(elevel,
640 : (errcode_for_dynamic_shared_memory(),
641 : errmsg("could not unmap shared memory segment \"%s\": %m",
642 : name)));
643 : return false;
644 : }
645 : if (*impl_private != NULL
646 : && CloseHandle(*impl_private) == 0)
647 : {
648 : _dosmaperr(GetLastError());
649 : ereport(elevel,
650 : (errcode_for_dynamic_shared_memory(),
651 : errmsg("could not remove shared memory segment \"%s\": %m",
652 : name)));
653 : return false;
654 : }
655 :
656 : *impl_private = NULL;
657 : *mapped_address = NULL;
658 : *mapped_size = 0;
659 : return true;
660 : }
661 :
662 : /* Create new segment or open an existing one for attach. */
663 : if (op == DSM_OP_CREATE)
664 : {
665 : DWORD size_high;
666 : DWORD size_low;
667 : DWORD errcode;
668 :
669 : /* Shifts >= the width of the type are undefined. */
670 : #ifdef _WIN64
671 : size_high = request_size >> 32;
672 : #else
673 : size_high = 0;
674 : #endif
675 : size_low = (DWORD) request_size;
676 :
677 : /* CreateFileMapping might not clear the error code on success */
678 : SetLastError(0);
679 :
680 : hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */
681 : NULL, /* Default security attrs */
682 : PAGE_READWRITE, /* Memory is read/write */
683 : size_high, /* Upper 32 bits of size */
684 : size_low, /* Lower 32 bits of size */
685 : name);
686 :
687 : errcode = GetLastError();
688 : if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
689 : {
690 : /*
691 : * On Windows, when the segment already exists, a handle for the
692 : * existing segment is returned. We must close it before
693 : * returning. However, if the existing segment is created by a
694 : * service, then it returns ERROR_ACCESS_DENIED. We don't do
695 : * _dosmaperr here, so errno won't be modified.
696 : */
697 : if (hmap)
698 : CloseHandle(hmap);
699 : return false;
700 : }
701 :
702 : if (!hmap)
703 : {
704 : _dosmaperr(errcode);
705 : ereport(elevel,
706 : (errcode_for_dynamic_shared_memory(),
707 : errmsg("could not create shared memory segment \"%s\": %m",
708 : name)));
709 : return false;
710 : }
711 : }
712 : else
713 : {
714 : hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
715 : FALSE, /* do not inherit the name */
716 : name); /* name of mapping object */
717 : if (!hmap)
718 : {
719 : _dosmaperr(GetLastError());
720 : ereport(elevel,
721 : (errcode_for_dynamic_shared_memory(),
722 : errmsg("could not open shared memory segment \"%s\": %m",
723 : name)));
724 : return false;
725 : }
726 : }
727 :
728 : /* Map it. */
729 : address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
730 : 0, 0, 0);
731 : if (!address)
732 : {
733 : int save_errno;
734 :
735 : _dosmaperr(GetLastError());
736 : /* Back out what's already been done. */
737 : save_errno = errno;
738 : CloseHandle(hmap);
739 : errno = save_errno;
740 :
741 : ereport(elevel,
742 : (errcode_for_dynamic_shared_memory(),
743 : errmsg("could not map shared memory segment \"%s\": %m",
744 : name)));
745 : return false;
746 : }
747 :
748 : /*
749 : * VirtualQuery gives size in page_size units, which is 4K for Windows. We
750 : * need size only when we are attaching, but it's better to get the size
751 : * when creating new segment to keep size consistent both for
752 : * DSM_OP_CREATE and DSM_OP_ATTACH.
753 : */
754 : if (VirtualQuery(address, &info, sizeof(info)) == 0)
755 : {
756 : int save_errno;
757 :
758 : _dosmaperr(GetLastError());
759 : /* Back out what's already been done. */
760 : save_errno = errno;
761 : UnmapViewOfFile(address);
762 : CloseHandle(hmap);
763 : errno = save_errno;
764 :
765 : ereport(elevel,
766 : (errcode_for_dynamic_shared_memory(),
767 : errmsg("could not stat shared memory segment \"%s\": %m",
768 : name)));
769 : return false;
770 : }
771 :
772 : *mapped_address = address;
773 : *mapped_size = info.RegionSize;
774 : *impl_private = hmap;
775 :
776 : return true;
777 : }
778 : #endif
779 :
780 : #ifdef USE_DSM_MMAP
781 : /*
782 : * Operating system primitives to support mmap-based shared memory.
783 : *
784 : * Calling this "shared memory" is somewhat of a misnomer, because what
785 : * we're really doing is creating a bunch of files and mapping them into
786 : * our address space. The operating system may feel obliged to
787 : * synchronize the contents to disk even if nothing is being paged out,
788 : * which will not serve us well. The user can relocate the pg_dynshmem
789 : * directory to a ramdisk to avoid this problem, if available.
790 : */
791 : static bool
792 0 : dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
793 : void **impl_private, void **mapped_address, Size *mapped_size,
794 : int elevel)
795 : {
796 : char name[64];
797 : int flags;
798 : int fd;
799 : char *address;
800 :
801 0 : snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
802 : handle);
803 :
804 : /* Handle teardown cases. */
805 0 : if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
806 : {
807 0 : if (*mapped_address != NULL
808 0 : && munmap(*mapped_address, *mapped_size) != 0)
809 : {
810 0 : ereport(elevel,
811 : (errcode_for_dynamic_shared_memory(),
812 : errmsg("could not unmap shared memory segment \"%s\": %m",
813 : name)));
814 0 : return false;
815 : }
816 0 : *mapped_address = NULL;
817 0 : *mapped_size = 0;
818 0 : if (op == DSM_OP_DESTROY && unlink(name) != 0)
819 : {
820 0 : ereport(elevel,
821 : (errcode_for_dynamic_shared_memory(),
822 : errmsg("could not remove shared memory segment \"%s\": %m",
823 : name)));
824 0 : return false;
825 : }
826 0 : return true;
827 : }
828 :
829 : /* Create new segment or open an existing one for attach. */
830 0 : flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
831 0 : if ((fd = OpenTransientFile(name, flags)) == -1)
832 : {
833 0 : if (op == DSM_OP_ATTACH || errno != EEXIST)
834 0 : ereport(elevel,
835 : (errcode_for_dynamic_shared_memory(),
836 : errmsg("could not open shared memory segment \"%s\": %m",
837 : name)));
838 0 : return false;
839 : }
840 :
841 : /*
842 : * If we're attaching the segment, determine the current size; if we are
843 : * creating the segment, set the size to the requested value.
844 : */
845 0 : if (op == DSM_OP_ATTACH)
846 : {
847 : struct stat st;
848 :
849 0 : if (fstat(fd, &st) != 0)
850 : {
851 : int save_errno;
852 :
853 : /* Back out what's already been done. */
854 0 : save_errno = errno;
855 0 : CloseTransientFile(fd);
856 0 : errno = save_errno;
857 :
858 0 : ereport(elevel,
859 : (errcode_for_dynamic_shared_memory(),
860 : errmsg("could not stat shared memory segment \"%s\": %m",
861 : name)));
862 0 : return false;
863 : }
864 0 : request_size = st.st_size;
865 : }
866 : else
867 : {
868 : /*
869 : * Allocate a buffer full of zeros.
870 : *
871 : * Note: palloc zbuffer, instead of just using a local char array, to
872 : * ensure it is reasonably well-aligned; this may save a few cycles
873 : * transferring data to the kernel.
874 : */
875 0 : char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
876 0 : Size remaining = request_size;
877 0 : bool success = true;
878 :
879 : /*
880 : * Zero-fill the file. We have to do this the hard way to ensure that
881 : * all the file space has really been allocated, so that we don't
882 : * later seg fault when accessing the memory mapping. This is pretty
883 : * pessimal.
884 : */
885 0 : while (success && remaining > 0)
886 : {
887 0 : Size goal = remaining;
888 :
889 0 : if (goal > ZBUFFER_SIZE)
890 0 : goal = ZBUFFER_SIZE;
891 0 : pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
892 0 : if (write(fd, zbuffer, goal) == goal)
893 0 : remaining -= goal;
894 : else
895 0 : success = false;
896 0 : pgstat_report_wait_end();
897 : }
898 :
899 0 : if (!success)
900 : {
901 : int save_errno;
902 :
903 : /* Back out what's already been done. */
904 0 : save_errno = errno;
905 0 : CloseTransientFile(fd);
906 0 : unlink(name);
907 0 : errno = save_errno ? save_errno : ENOSPC;
908 :
909 0 : ereport(elevel,
910 : (errcode_for_dynamic_shared_memory(),
911 : errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
912 : name, request_size)));
913 0 : return false;
914 : }
915 : }
916 :
917 : /* Map it. */
918 0 : address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
919 : MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
920 0 : if (address == MAP_FAILED)
921 : {
922 : int save_errno;
923 :
924 : /* Back out what's already been done. */
925 0 : save_errno = errno;
926 0 : CloseTransientFile(fd);
927 0 : if (op == DSM_OP_CREATE)
928 0 : unlink(name);
929 0 : errno = save_errno;
930 :
931 0 : ereport(elevel,
932 : (errcode_for_dynamic_shared_memory(),
933 : errmsg("could not map shared memory segment \"%s\": %m",
934 : name)));
935 0 : return false;
936 : }
937 0 : *mapped_address = address;
938 0 : *mapped_size = request_size;
939 :
940 0 : if (CloseTransientFile(fd) != 0)
941 : {
942 0 : ereport(elevel,
943 : (errcode_for_file_access(),
944 : errmsg("could not close shared memory segment \"%s\": %m",
945 : name)));
946 0 : return false;
947 : }
948 :
949 0 : return true;
950 : }
951 : #endif
952 :
953 : /*
954 : * Implementation-specific actions that must be performed when a segment is to
955 : * be preserved even when no backend has it attached.
956 : *
957 : * Except on Windows, we don't need to do anything at all. But since Windows
958 : * cleans up segments automatically when no references remain, we duplicate
959 : * the segment handle into the postmaster process. The postmaster needn't
960 : * do anything to receive the handle; Windows transfers it automatically.
961 : */
962 : void
963 1932 : dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
964 : void **impl_private_pm_handle)
965 : {
966 1932 : switch (dynamic_shared_memory_type)
967 : {
968 : #ifdef USE_DSM_WINDOWS
969 : case DSM_IMPL_WINDOWS:
970 : if (IsUnderPostmaster)
971 : {
972 : HANDLE hmap;
973 :
974 : if (!DuplicateHandle(GetCurrentProcess(), impl_private,
975 : PostmasterHandle, &hmap, 0, FALSE,
976 : DUPLICATE_SAME_ACCESS))
977 : {
978 : char name[64];
979 :
980 : snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
981 : _dosmaperr(GetLastError());
982 : ereport(ERROR,
983 : (errcode_for_dynamic_shared_memory(),
984 : errmsg("could not duplicate handle for \"%s\": %m",
985 : name)));
986 : }
987 :
988 : /*
989 : * Here, we remember the handle that we created in the
990 : * postmaster process. This handle isn't actually usable in
991 : * any process other than the postmaster, but that doesn't
992 : * matter. We're just holding onto it so that, if the segment
993 : * is unpinned, dsm_impl_unpin_segment can close it.
994 : */
995 : *impl_private_pm_handle = hmap;
996 : }
997 : break;
998 : #endif
999 : default:
1000 1932 : break;
1001 : }
1002 1932 : }
1003 :
1004 : /*
1005 : * Implementation-specific actions that must be performed when a segment is no
1006 : * longer to be preserved, so that it will be cleaned up when all backends
1007 : * have detached from it.
1008 : *
1009 : * Except on Windows, we don't need to do anything at all. For Windows, we
1010 : * close the extra handle that dsm_impl_pin_segment created in the
1011 : * postmaster's process space.
1012 : */
1013 : void
1014 296 : dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
1015 : {
1016 296 : switch (dynamic_shared_memory_type)
1017 : {
1018 : #ifdef USE_DSM_WINDOWS
1019 : case DSM_IMPL_WINDOWS:
1020 : if (IsUnderPostmaster)
1021 : {
1022 : if (*impl_private &&
1023 : !DuplicateHandle(PostmasterHandle, *impl_private,
1024 : NULL, NULL, 0, FALSE,
1025 : DUPLICATE_CLOSE_SOURCE))
1026 : {
1027 : char name[64];
1028 :
1029 : snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1030 : _dosmaperr(GetLastError());
1031 : ereport(ERROR,
1032 : (errcode_for_dynamic_shared_memory(),
1033 : errmsg("could not duplicate handle for \"%s\": %m",
1034 : name)));
1035 : }
1036 :
1037 : *impl_private = NULL;
1038 : }
1039 : break;
1040 : #endif
1041 : default:
1042 296 : break;
1043 : }
1044 296 : }
1045 :
1046 : static int
1047 0 : errcode_for_dynamic_shared_memory(void)
1048 : {
1049 0 : if (errno == EFBIG || errno == ENOMEM)
1050 0 : return errcode(ERRCODE_OUT_OF_MEMORY);
1051 : else
1052 0 : return errcode_for_file_access();
1053 : }
|