Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * dsm_impl.c
4 : * manage dynamic shared memory segments
5 : *
6 : * This file provides low-level APIs for creating and destroying shared
7 : * memory segments using several different possible techniques. We refer
8 : * to these segments as dynamic because they can be created, altered, and
9 : * destroyed at any point during the server life cycle. This is unlike
10 : * the main shared memory segment, of which there is always exactly one
11 : * and which is always mapped at a fixed address in every PostgreSQL
12 : * background process.
13 : *
14 : * Because not all systems provide the same primitives in this area, nor
15 : * do all primitives behave the same way on all systems, we provide
16 : * several implementations of this facility. Many systems implement
17 : * POSIX shared memory (shm_open etc.), which is well-suited to our needs
18 : * in this area, with the exception that shared memory identifiers live
19 : * in a flat system-wide namespace, raising the uncomfortable prospect of
20 : * name collisions with other processes (including other copies of
21 : * PostgreSQL) running on the same system. Some systems only support
22 : * the older System V shared memory interface (shmget etc.) which is
23 : * also usable; however, the default allocation limits are often quite
24 : * small, and the namespace is even more restricted.
25 : *
26 : * We also provide an mmap-based shared memory implementation. This may
27 : * be useful on systems that provide shared memory via a special-purpose
28 : * filesystem; by opting for this implementation, the user can even
29 : * control precisely where their shared memory segments are placed. It
30 : * can also be used as a fallback for systems where shm_open and shmget
31 : * are not available or can't be used for some reason. Of course,
32 : * mapping a file residing on an actual spinning disk is a fairly poor
33 : * approximation for shared memory because writeback may hurt performance
34 : * substantially, but there should be few systems where we must make do
35 : * with such poor tools.
36 : *
37 : * As ever, Windows requires its own implementation.
38 : *
39 : * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
40 : * Portions Copyright (c) 1994, Regents of the University of California
41 : *
42 : *
43 : * IDENTIFICATION
44 : * src/backend/storage/ipc/dsm_impl.c
45 : *
46 : *-------------------------------------------------------------------------
47 : */
48 :
49 : #include "postgres.h"
50 :
51 : #include <fcntl.h>
52 : #include <unistd.h>
53 : #ifndef WIN32
54 : #include <sys/mman.h>
55 : #endif
56 : #include <sys/stat.h>
57 : #ifdef HAVE_SYS_IPC_H
58 : #include <sys/ipc.h>
59 : #endif
60 : #ifdef HAVE_SYS_SHM_H
61 : #include <sys/shm.h>
62 : #endif
63 :
64 : #include "common/file_perm.h"
65 : #include "miscadmin.h"
66 : #include "pgstat.h"
67 : #include "portability/mem.h"
68 : #include "postmaster/postmaster.h"
69 : #include "storage/dsm_impl.h"
70 : #include "storage/fd.h"
71 : #include "utils/guc.h"
72 : #include "utils/memutils.h"
73 :
74 : #ifdef USE_DSM_POSIX
75 : static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
76 : void **impl_private, void **mapped_address,
77 : Size *mapped_size, int elevel);
78 : static int dsm_impl_posix_resize(int fd, off_t size);
79 : #endif
80 : #ifdef USE_DSM_SYSV
81 : static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
82 : void **impl_private, void **mapped_address,
83 : Size *mapped_size, int elevel);
84 : #endif
85 : #ifdef USE_DSM_WINDOWS
86 : static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
87 : void **impl_private, void **mapped_address,
88 : Size *mapped_size, int elevel);
89 : #endif
90 : #ifdef USE_DSM_MMAP
91 : static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
92 : void **impl_private, void **mapped_address,
93 : Size *mapped_size, int elevel);
94 : #endif
95 : static int errcode_for_dynamic_shared_memory(void);
96 :
/*
 * Valid values for the dynamic_shared_memory_type GUC; only the
 * implementations compiled into this build are offered.
 */
const struct config_enum_entry dynamic_shared_memory_options[] = {
#ifdef USE_DSM_POSIX
	{"posix", DSM_IMPL_POSIX, false},
#endif
#ifdef USE_DSM_SYSV
	{"sysv", DSM_IMPL_SYSV, false},
#endif
#ifdef USE_DSM_WINDOWS
	{"windows", DSM_IMPL_WINDOWS, false},
#endif
#ifdef USE_DSM_MMAP
	{"mmap", DSM_IMPL_MMAP, false},
#endif
	{NULL, 0, false}
};

/* Implementation selector. */
int			dynamic_shared_memory_type;

/* Amount of space reserved for DSM segments in the main area. */
int			min_dynamic_shared_memory;

/* Size of buffer to be used for zero-filling (mmap implementation). */
#define ZBUFFER_SIZE				8192

/* Name prefix for segments in the Windows implementation. */
#define SEGMENT_NAME_PREFIX			"Global/PostgreSQL"
123 :
124 : /*------
125 : * Perform a low-level shared memory operation in a platform-specific way,
126 : * as dictated by the selected implementation. Each implementation is
127 : * required to implement the following primitives.
128 : *
129 : * DSM_OP_CREATE. Create a segment whose size is the request_size and
130 : * map it.
131 : *
132 : * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
133 : *
134 : * DSM_OP_DETACH. Unmap the segment.
135 : *
136 : * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
137 : * segment.
138 : *
139 : * Arguments:
140 : * op: The operation to be performed.
 141 :  * handle: The handle of an existing object, or for DSM_OP_CREATE,
 142 :  *		the new handle the caller wants created.
143 : * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0.
144 : * impl_private: Private, implementation-specific data. Will be a pointer
145 : * to NULL for the first operation on a shared memory segment within this
146 : * backend; thereafter, it will point to the value to which it was set
147 : * on the previous call.
148 : * mapped_address: Pointer to start of current mapping; pointer to NULL
149 : * if none. Updated with new mapping address.
150 : * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
151 : * Updated with new mapped size.
152 : * elevel: Level at which to log errors.
153 : *
154 : * Return value: true on success, false on failure. When false is returned,
155 : * a message should first be logged at the specified elevel, except in the
156 : * case where DSM_OP_CREATE experiences a name collision, which should
157 : * silently return false.
158 : *-----
159 : */
160 : bool
161 14912 : dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
162 : void **impl_private, void **mapped_address, Size *mapped_size,
163 : int elevel)
164 : {
165 : Assert(op == DSM_OP_CREATE || request_size == 0);
166 : Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
167 : (*mapped_address == NULL && *mapped_size == 0));
168 :
169 14912 : switch (dynamic_shared_memory_type)
170 : {
171 : #ifdef USE_DSM_POSIX
172 14912 : case DSM_IMPL_POSIX:
173 14912 : return dsm_impl_posix(op, handle, request_size, impl_private,
174 : mapped_address, mapped_size, elevel);
175 : #endif
176 : #ifdef USE_DSM_SYSV
177 0 : case DSM_IMPL_SYSV:
178 0 : return dsm_impl_sysv(op, handle, request_size, impl_private,
179 : mapped_address, mapped_size, elevel);
180 : #endif
181 : #ifdef USE_DSM_WINDOWS
182 : case DSM_IMPL_WINDOWS:
183 : return dsm_impl_windows(op, handle, request_size, impl_private,
184 : mapped_address, mapped_size, elevel);
185 : #endif
186 : #ifdef USE_DSM_MMAP
187 0 : case DSM_IMPL_MMAP:
188 0 : return dsm_impl_mmap(op, handle, request_size, impl_private,
189 : mapped_address, mapped_size, elevel);
190 : #endif
191 0 : default:
192 0 : elog(ERROR, "unexpected dynamic shared memory type: %d",
193 : dynamic_shared_memory_type);
194 : return false;
195 : }
196 : }
197 :
198 : #ifdef USE_DSM_POSIX
/*
 * Operating system primitives to support POSIX shared memory.
 *
 * POSIX shared memory segments are created and attached using shm_open()
 * and shm_unlink(); other operations, such as sizing or mapping the
 * segment, are performed as if the shared memory segments were files.
 *
 * Indeed, on some platforms, they may be implemented that way.  While
 * POSIX shared memory segments seem intended to exist in a flat namespace,
 * some operating systems may implement them as files, even going so far
 * to treat a request for /xyz as a request to create a file by that name
 * in the root directory.  Users of such broken platforms should select
 * a different shared memory implementation.
 */

/*
 * Implement all four DSM primitives on top of shm_open/shm_unlink plus
 * mmap/munmap.  See dsm_impl_op for argument meanings and the return
 * convention.
 */
static bool
dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
			   void **impl_private, void **mapped_address, Size *mapped_size,
			   int elevel)
{
	char		name[64];
	int			flags;
	int			fd;
	char	   *address;

	/* Segment names live in a flat namespace; embed the handle. */
	snprintf(name, 64, "/PostgreSQL.%u", handle);

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& munmap(*mapped_address, *mapped_size) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/*
	 * Create new segment or open an existing one for attach.
	 *
	 * Even though we will close the FD before returning, it seems desirable
	 * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
	 * failure.  The fact that we won't hold the FD open long justifies using
	 * ReserveExternalFD rather than AcquireExternalFD, though.
	 */
	ReserveExternalFD();

	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
	if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
	{
		ReleaseExternalFD();
		/* EEXIST on create is a silent name collision, per the contract. */
		if (errno != EEXIST)
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
		return false;
	}

	/*
	 * If we're attaching the segment, determine the current size; if we are
	 * creating the segment, set the size to the requested value.
	 */
	if (op == DSM_OP_ATTACH)
	{
		struct stat st;

		if (fstat(fd, &st) != 0)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			close(fd);
			ReleaseExternalFD();
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = st.st_size;
	}
	else if (dsm_impl_posix_resize(fd, request_size) != 0)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		ReleaseExternalFD();
		shm_unlink(name);
		errno = save_errno;

		/*
		 * If we received a query cancel or termination signal, we will have
		 * EINTR set here.  If the caller said that errors are OK here, check
		 * for interrupts immediately.
		 */
		if (errno == EINTR && elevel >= ERROR)
			CHECK_FOR_INTERRUPTS();

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
						name, request_size)));
		return false;
	}

	/* Map it. */
	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
	if (address == MAP_FAILED)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		close(fd);
		ReleaseExternalFD();
		/* Only unlink a segment we ourselves just created. */
		if (op == DSM_OP_CREATE)
			shm_unlink(name);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;
	/* The mapping survives the close; we keep no FD open. */
	close(fd);
	ReleaseExternalFD();

	return true;
}
352 :
/*
 * Set the size of a virtual memory region associated with a file descriptor.
 * If necessary, also ensure that virtual memory is actually allocated by the
 * operating system, to avoid nasty surprises later.
 *
 * fd: descriptor returned by shm_open for the segment.
 * size: desired segment size in bytes.
 *
 * Returns non-zero if either truncation or allocation fails, and sets errno.
 * (ftruncate sets errno itself on failure; the posix_fallocate branch copies
 * its return value into errno to preserve that convention.)
 */
static int
dsm_impl_posix_resize(int fd, off_t size)
{
	int			rc;

	/* Truncate (or extend) the file to the requested size. */
	rc = ftruncate(fd, size);

	/*
	 * On Linux, a shm_open fd is backed by a tmpfs file.  After resizing with
	 * ftruncate, the file may contain a hole.  Accessing memory backed by a
	 * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
	 * is no more tmpfs space available.  So we ask tmpfs to allocate pages
	 * here, so we can fail gracefully with ENOSPC now rather than risking
	 * SIGBUS later.
	 */
#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
	if (rc == 0)
	{
		/*
		 * We may get interrupted.  If so, just retry unless there is an
		 * interrupt pending.  This avoids the possibility of looping forever
		 * if another backend is repeatedly trying to interrupt us.
		 */
		pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
		do
		{
			rc = posix_fallocate(fd, 0, size);
		} while (rc == EINTR && !(ProcDiePending || QueryCancelPending));
		pgstat_report_wait_end();

		/*
		 * The caller expects errno to be set, but posix_fallocate() doesn't
		 * set it.  Instead it returns error numbers directly.  So set errno,
		 * even though we'll also return rc to indicate success or failure.
		 */
		errno = rc;
	}
#endif							/* HAVE_POSIX_FALLOCATE && __linux__ */

	return rc;
}
402 :
403 : #endif /* USE_DSM_POSIX */
404 :
405 : #ifdef USE_DSM_SYSV
/*
 * Operating system primitives to support System V shared memory.
 *
 * System V shared memory segments are manipulated using shmget(), shmat(),
 * shmdt(), and shmctl().  As the default allocation limits for System V
 * shared memory are usually quite low, the POSIX facilities may be
 * preferable; but those are not supported everywhere.
 */

/*
 * Implement all four DSM primitives using System V shared memory.  See
 * dsm_impl_op for argument meanings and the return convention.
 */
static bool
dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address, Size *mapped_size,
			  int elevel)
{
	key_t		key;
	int			ident;			/* shm identifier obtained from shmget() */
	char	   *address;
	char		name[64];
	int		   *ident_cache;	/* per-backend cache of ident, via impl_private */

	/*
	 * POSIX shared memory and mmap-based shared memory identify segments with
	 * names.  To avoid needless error message variation, we use the handle as
	 * the name.
	 */
	snprintf(name, 64, "%u", handle);

	/*
	 * The System V shared memory namespace is very restricted; names are of
	 * type key_t, which is expected to be some sort of integer data type, but
	 * not necessarily the same one as dsm_handle.  Since we use dsm_handle to
	 * identify shared memory segments across processes, this might seem like
	 * a problem, but it's really not.  If dsm_handle is bigger than key_t,
	 * the cast below might truncate away some bits from the handle the
	 * user-provided, but it'll truncate exactly the same bits away in exactly
	 * the same fashion every time we use that handle, which is all that
	 * really matters.  Conversely, if dsm_handle is smaller than key_t, we
	 * won't use the full range of available key space, but that's no big deal
	 * either.
	 *
	 * We do make sure that the key isn't negative, because that might not be
	 * portable.
	 */
	key = (key_t) handle;
	if (key < 1)				/* avoid compiler warning if type is unsigned */
		key = -key;

	/*
	 * There's one special key, IPC_PRIVATE, which can't be used.  If we end
	 * up with that value by chance during a create operation, just pretend it
	 * already exists, so that caller will retry.  If we run into it anywhere
	 * else, the caller has passed a handle that doesn't correspond to
	 * anything we ever created, which should not happen.
	 */
	if (key == IPC_PRIVATE)
	{
		if (op != DSM_OP_CREATE)
			elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
		errno = EEXIST;
		return false;
	}

	/*
	 * Before we can do anything with a shared memory segment, we have to map
	 * the shared memory key to a shared memory identifier using shmget().  To
	 * avoid repeated lookups, we store the key using impl_private.
	 */
	if (*impl_private != NULL)
	{
		ident_cache = *impl_private;
		ident = *ident_cache;
	}
	else
	{
		int			flags = IPCProtection;
		size_t		segsize;

		/*
		 * Allocate the memory BEFORE acquiring the resource, so that we don't
		 * leak the resource if memory allocation fails.
		 */
		ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));

		/*
		 * When using shmget to find an existing segment, we must pass the
		 * size as 0.  Passing a non-zero size which is greater than the
		 * actual size will result in EINVAL.
		 */
		segsize = 0;

		if (op == DSM_OP_CREATE)
		{
			flags |= IPC_CREAT | IPC_EXCL;
			segsize = request_size;
		}

		if ((ident = shmget(key, segsize, flags)) == -1)
		{
			/* EEXIST on create is a silent name collision, per the contract. */
			if (errno != EEXIST)
			{
				int			save_errno = errno;

				pfree(ident_cache);
				errno = save_errno;
				ereport(elevel,
						(errcode_for_dynamic_shared_memory(),
						 errmsg("could not get shared memory segment: %m")));
			}
			return false;
		}

		*ident_cache = ident;
		*impl_private = ident_cache;
	}

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		/* ident was already copied out of the cache above. */
		pfree(ident_cache);
		*impl_private = NULL;
		if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/* If we're attaching it, we must use IPC_STAT to determine the size. */
	if (op == DSM_OP_ATTACH)
	{
		struct shmid_ds shm;

		if (shmctl(ident, IPC_STAT, &shm) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = shm.shm_segsz;
	}

	/* Map it. */
	address = shmat(ident, NULL, PG_SHMAT_FLAGS);
	if (address == (void *) -1)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		if (op == DSM_OP_CREATE)
			shmctl(ident, IPC_RMID, NULL);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;

	return true;
}
585 : #endif
586 :
587 : #ifdef USE_DSM_WINDOWS
/*
 * Operating system primitives to support Windows shared memory.
 *
 * Windows shared memory implementation is done using file mapping
 * which can be backed by either physical file or system paging file.
 * Current implementation uses system paging file as other effects
 * like performance are not clear for physical file and it is used in similar
 * way for main shared memory in windows.
 *
 * A memory mapping object is a kernel object - they always get deleted when
 * the last reference to them goes away, either explicitly via a CloseHandle or
 * when the process containing the reference exits.
 */

/*
 * Implement all four DSM primitives using Windows file-mapping objects.
 * See dsm_impl_op for argument meanings and the return convention.
 * impl_private holds the file-mapping HANDLE for this backend.
 */
static bool
dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
				 void **impl_private, void **mapped_address,
				 Size *mapped_size, int elevel)
{
	char	   *address;
	HANDLE		hmap;
	char		name[64];
	MEMORY_BASIC_INFORMATION info;

	/*
	 * Storing the shared memory segment in the Global\ namespace, can allow
	 * any process running in any session to access that file mapping object
	 * provided that the caller has the required access rights.  But to avoid
	 * issues faced in main shared memory, we are using the naming convention
	 * similar to main shared memory.  We can change here once issue mentioned
	 * in GetSharedMemName is resolved.
	 */
	snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);

	/*
	 * Handle teardown cases.  Since Windows automatically destroys the object
	 * when no references remain, we can treat it the same as detach.
	 */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& UnmapViewOfFile(*mapped_address) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		if (*impl_private != NULL
			&& CloseHandle(*impl_private) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}

		*impl_private = NULL;
		*mapped_address = NULL;
		*mapped_size = 0;
		return true;
	}

	/* Create new segment or open an existing one for attach. */
	if (op == DSM_OP_CREATE)
	{
		DWORD		size_high;
		DWORD		size_low;
		DWORD		errcode;

		/* Shifts >= the width of the type are undefined. */
#ifdef _WIN64
		size_high = request_size >> 32;
#else
		size_high = 0;
#endif
		size_low = (DWORD) request_size;

		/* CreateFileMapping might not clear the error code on success */
		SetLastError(0);

		hmap = CreateFileMapping(INVALID_HANDLE_VALUE,	/* Use the pagefile */
								 NULL,	/* Default security attrs */
								 PAGE_READWRITE,	/* Memory is read/write */
								 size_high, /* Upper 32 bits of size */
								 size_low,	/* Lower 32 bits of size */
								 name);

		errcode = GetLastError();
		if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
		{
			/*
			 * On Windows, when the segment already exists, a handle for the
			 * existing segment is returned.  We must close it before
			 * returning.  However, if the existing segment is created by a
			 * service, then it returns ERROR_ACCESS_DENIED. We don't do
			 * _dosmaperr here, so errno won't be modified.
			 */
			if (hmap)
				CloseHandle(hmap);
			return false;
		}

		if (!hmap)
		{
			_dosmaperr(errcode);
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not create shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}
	else
	{
		hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
							   FALSE,	/* do not inherit the name */
							   name);	/* name of mapping object */
		if (!hmap)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}

	/* Map it. */
	address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
							0, 0, 0);
	if (!address)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	/*
	 * VirtualQuery gives size in page_size units, which is 4K for Windows. We
	 * need size only when we are attaching, but it's better to get the size
	 * when creating new segment to keep size consistent both for
	 * DSM_OP_CREATE and DSM_OP_ATTACH.
	 */
	if (VirtualQuery(address, &info, sizeof(info)) == 0)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		UnmapViewOfFile(address);
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not stat shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	*mapped_address = address;
	*mapped_size = info.RegionSize;
	/* Keep the mapping handle alive for the lifetime of the attachment. */
	*impl_private = hmap;

	return true;
}
770 : #endif
771 :
772 : #ifdef USE_DSM_MMAP
773 : /*
774 : * Operating system primitives to support mmap-based shared memory.
775 : *
776 : * Calling this "shared memory" is somewhat of a misnomer, because what
777 : * we're really doing is creating a bunch of files and mapping them into
778 : * our address space. The operating system may feel obliged to
779 : * synchronize the contents to disk even if nothing is being paged out,
780 : * which will not serve us well. The user can relocate the pg_dynshmem
781 : * directory to a ramdisk to avoid this problem, if available.
782 : */
783 : static bool
784 0 : dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
785 : void **impl_private, void **mapped_address, Size *mapped_size,
786 : int elevel)
787 : {
788 : char name[64];
789 : int flags;
790 : int fd;
791 : char *address;
792 :
793 0 : snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
794 : handle);
795 :
796 : /* Handle teardown cases. */
797 0 : if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
798 : {
799 0 : if (*mapped_address != NULL
800 0 : && munmap(*mapped_address, *mapped_size) != 0)
801 : {
802 0 : ereport(elevel,
803 : (errcode_for_dynamic_shared_memory(),
804 : errmsg("could not unmap shared memory segment \"%s\": %m",
805 : name)));
806 0 : return false;
807 : }
808 0 : *mapped_address = NULL;
809 0 : *mapped_size = 0;
810 0 : if (op == DSM_OP_DESTROY && unlink(name) != 0)
811 : {
812 0 : ereport(elevel,
813 : (errcode_for_dynamic_shared_memory(),
814 : errmsg("could not remove shared memory segment \"%s\": %m",
815 : name)));
816 0 : return false;
817 : }
818 0 : return true;
819 : }
820 :
821 : /* Create new segment or open an existing one for attach. */
822 0 : flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
823 0 : if ((fd = OpenTransientFile(name, flags)) == -1)
824 : {
825 0 : if (errno != EEXIST)
826 0 : ereport(elevel,
827 : (errcode_for_dynamic_shared_memory(),
828 : errmsg("could not open shared memory segment \"%s\": %m",
829 : name)));
830 0 : return false;
831 : }
832 :
833 : /*
834 : * If we're attaching the segment, determine the current size; if we are
835 : * creating the segment, set the size to the requested value.
836 : */
837 0 : if (op == DSM_OP_ATTACH)
838 : {
839 : struct stat st;
840 :
841 0 : if (fstat(fd, &st) != 0)
842 : {
843 : int save_errno;
844 :
845 : /* Back out what's already been done. */
846 0 : save_errno = errno;
847 0 : CloseTransientFile(fd);
848 0 : errno = save_errno;
849 :
850 0 : ereport(elevel,
851 : (errcode_for_dynamic_shared_memory(),
852 : errmsg("could not stat shared memory segment \"%s\": %m",
853 : name)));
854 0 : return false;
855 : }
856 0 : request_size = st.st_size;
857 : }
858 : else
859 : {
860 : /*
861 : * Allocate a buffer full of zeros.
862 : *
863 : * Note: palloc zbuffer, instead of just using a local char array, to
864 : * ensure it is reasonably well-aligned; this may save a few cycles
865 : * transferring data to the kernel.
866 : */
867 0 : char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
868 0 : uint32 remaining = request_size;
869 0 : bool success = true;
870 :
871 : /*
872 : * Zero-fill the file. We have to do this the hard way to ensure that
873 : * all the file space has really been allocated, so that we don't
874 : * later seg fault when accessing the memory mapping. This is pretty
875 : * pessimal.
876 : */
877 0 : while (success && remaining > 0)
878 : {
879 0 : Size goal = remaining;
880 :
881 0 : if (goal > ZBUFFER_SIZE)
882 0 : goal = ZBUFFER_SIZE;
883 0 : pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
884 0 : if (write(fd, zbuffer, goal) == goal)
885 0 : remaining -= goal;
886 : else
887 0 : success = false;
888 0 : pgstat_report_wait_end();
889 : }
890 :
891 0 : if (!success)
892 : {
893 : int save_errno;
894 :
895 : /* Back out what's already been done. */
896 0 : save_errno = errno;
897 0 : CloseTransientFile(fd);
898 0 : unlink(name);
899 0 : errno = save_errno ? save_errno : ENOSPC;
900 :
901 0 : ereport(elevel,
902 : (errcode_for_dynamic_shared_memory(),
903 : errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
904 : name, request_size)));
905 0 : return false;
906 : }
907 : }
908 :
909 : /* Map it. */
910 0 : address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
911 : MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
912 0 : if (address == MAP_FAILED)
913 : {
914 : int save_errno;
915 :
916 : /* Back out what's already been done. */
917 0 : save_errno = errno;
918 0 : CloseTransientFile(fd);
919 0 : if (op == DSM_OP_CREATE)
920 0 : unlink(name);
921 0 : errno = save_errno;
922 :
923 0 : ereport(elevel,
924 : (errcode_for_dynamic_shared_memory(),
925 : errmsg("could not map shared memory segment \"%s\": %m",
926 : name)));
927 0 : return false;
928 : }
929 0 : *mapped_address = address;
930 0 : *mapped_size = request_size;
931 :
932 0 : if (CloseTransientFile(fd) != 0)
933 : {
934 0 : ereport(elevel,
935 : (errcode_for_file_access(),
936 : errmsg("could not close shared memory segment \"%s\": %m",
937 : name)));
938 0 : return false;
939 : }
940 :
941 0 : return true;
942 : }
943 : #endif
944 :
/*
 * Implementation-specific actions that must be performed when a segment is to
 * be preserved even when no backend has it attached.
 *
 * Except on Windows, we don't need to do anything at all.  But since Windows
 * cleans up segments automatically when no references remain, we duplicate
 * the segment handle into the postmaster process.  The postmaster needn't
 * do anything to receive the handle; Windows transfers it automatically.
 *
 * handle: segment handle (used only to build the name for error messages).
 * impl_private: this backend's implementation-private state for the segment
 *		(on Windows, the file-mapping HANDLE).
 * impl_private_pm_handle: output; receives the postmaster-side duplicate
 *		handle that dsm_impl_unpin_segment will later close (Windows only).
 */
void
dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
					 void **impl_private_pm_handle)
{
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			{
				HANDLE		hmap;

				if (!DuplicateHandle(GetCurrentProcess(), impl_private,
									 PostmasterHandle, &hmap, 0, FALSE,
									 DUPLICATE_SAME_ACCESS))
				{
					char		name[64];

					snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
					_dosmaperr(GetLastError());
					ereport(ERROR,
							(errcode_for_dynamic_shared_memory(),
							 errmsg("could not duplicate handle for \"%s\": %m",
									name)));
				}

				/*
				 * Here, we remember the handle that we created in the
				 * postmaster process.  This handle isn't actually usable in
				 * any process other than the postmaster, but that doesn't
				 * matter.  We're just holding onto it so that, if the segment
				 * is unpinned, dsm_impl_unpin_segment can close it.
				 */
				*impl_private_pm_handle = hmap;
				break;
			}
#endif
		default:
			/* Other implementations need no per-pin action. */
			break;
	}
}
994 :
/*
 * Implementation-specific actions that must be performed when a segment is no
 * longer to be preserved, so that it will be cleaned up when all backends
 * have detached from it.
 *
 * Except on Windows, we don't need to do anything at all.  For Windows, we
 * close the extra handle that dsm_impl_pin_segment created in the
 * postmaster's process space.
 *
 * handle: segment handle (used only to build the name for error messages).
 * impl_private: in/out; holds the postmaster-side handle saved by
 *		dsm_impl_pin_segment, and is reset to NULL on success (Windows only).
 */
void
dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
{
	switch (dynamic_shared_memory_type)
	{
#ifdef USE_DSM_WINDOWS
		case DSM_IMPL_WINDOWS:
			{
				/*
				 * DUPLICATE_CLOSE_SOURCE closes the handle inside the
				 * postmaster process without needing to run code there.
				 */
				if (*impl_private &&
					!DuplicateHandle(PostmasterHandle, *impl_private,
									 NULL, NULL, 0, FALSE,
									 DUPLICATE_CLOSE_SOURCE))
				{
					char		name[64];

					snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
					_dosmaperr(GetLastError());
					ereport(ERROR,
							(errcode_for_dynamic_shared_memory(),
							 errmsg("could not duplicate handle for \"%s\": %m",
									name)));
				}

				*impl_private = NULL;
				break;
			}
#endif
		default:
			/* Other implementations need no per-unpin action. */
			break;
	}
}
1035 :
1036 : static int
1037 0 : errcode_for_dynamic_shared_memory(void)
1038 : {
1039 0 : if (errno == EFBIG || errno == ENOMEM)
1040 0 : return errcode(ERRCODE_OUT_OF_MEMORY);
1041 : else
1042 0 : return errcode_for_file_access();
1043 : }
|