Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * sysv_shmem.c
4 : * Implement shared memory using SysV facilities
5 : *
6 : * These routines used to be a fairly thin layer on top of SysV shared
7 : * memory functionality. With the addition of anonymous-shmem logic,
8 : * they're a bit fatter now. We still require a SysV shmem block to
9 : * exist, though, because mmap'd shmem provides no way to find out how
10 : * many processes are attached, which we need for interlocking purposes.
11 : *
12 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
13 : * Portions Copyright (c) 1994, Regents of the University of California
14 : *
15 : * IDENTIFICATION
16 : * src/backend/port/sysv_shmem.c
17 : *
18 : *-------------------------------------------------------------------------
19 : */
20 : #include "postgres.h"
21 :
22 : #include <signal.h>
23 : #include <unistd.h>
24 : #include <sys/file.h>
25 : #include <sys/ipc.h>
26 : #include <sys/mman.h>
27 : #include <sys/shm.h>
28 : #include <sys/stat.h>
29 :
30 : #include "miscadmin.h"
31 : #include "port/pg_bitutils.h"
32 : #include "portability/mem.h"
33 : #include "storage/dsm.h"
34 : #include "storage/fd.h"
35 : #include "storage/ipc.h"
36 : #include "storage/pg_shmem.h"
37 : #include "utils/guc.h"
38 : #include "utils/guc_hooks.h"
39 : #include "utils/pidfile.h"
40 :
41 :
42 : /*
43 : * As of PostgreSQL 9.3, we normally allocate only a very small amount of
44 : * System V shared memory, and only for the purposes of providing an
45 : * interlock to protect the data directory. The real shared memory block
46 : * is allocated using mmap(). This works around the problem that many
47 : * systems have very low limits on the amount of System V shared memory
48 : * that can be allocated. Even a limit of a few megabytes will be enough
49 : * to run many copies of PostgreSQL without needing to adjust system settings.
50 : *
51 : * We assume that no one will attempt to run PostgreSQL 9.3 or later on
52 : * systems that are ancient enough that anonymous shared memory is not
53 : * supported, such as pre-2.4 versions of Linux. If that turns out to be
54 : * false, we might need to add compile and/or run-time tests here and do this
55 : * only if the running kernel supports it.
56 : *
57 : * However, we must always disable this logic in the EXEC_BACKEND case, and
58 : * fall back to the old method of allocating the entire segment using System V
59 : * shared memory, because there's no way to attach an anonymous mmap'd segment
60 : * to a process after exec(). Since EXEC_BACKEND is intended only for
61 : * developer use, this shouldn't be a big problem. Because of this, we do
62 : * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below.
63 : *
64 : * As of PostgreSQL 12, we regained the ability to use a large System V shared
65 : * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set
66 : * to sysv (though this is not the default).
67 : */
68 :
69 :
70 : typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */
71 : typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */
72 :
73 : /*
74 : * How does a given IpcMemoryId relate to this PostgreSQL process?
75 : *
76 : * One could recycle unattached segments of different data directories if we
77 : * distinguished that case from other SHMSTATE_FOREIGN cases. Doing so would
78 : * cause us to visit less of the key space, making us less likely to detect a
79 : * SHMSTATE_ATTACHED key. It would also complicate the concurrency analysis,
80 : * in that postmasters of different data directories could simultaneously
81 : * attempt to recycle a given key. We'll waste keys longer in some cases, but
82 : * avoiding the problems of the alternative justifies that loss.
83 : */
84 : typedef enum
85 : {
86 : SHMSTATE_ANALYSIS_FAILURE, /* unexpected failure to analyze the ID */
87 : SHMSTATE_ATTACHED, /* pertinent to DataDir, has attached PIDs */
88 : SHMSTATE_ENOENT, /* no segment of that ID */
89 : SHMSTATE_FOREIGN, /* exists, but not pertinent to DataDir */
90 : SHMSTATE_UNATTACHED, /* pertinent to DataDir, no attached PIDs */
91 : } IpcMemoryState;
92 :
93 :
94 : unsigned long UsedShmemSegID = 0;
95 : void *UsedShmemSegAddr = NULL;
96 :
97 : static Size AnonymousShmemSize;
98 : static void *AnonymousShmem = NULL;
99 :
100 : static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
101 : static void IpcMemoryDetach(int status, Datum shmaddr);
102 : static void IpcMemoryDelete(int status, Datum shmId);
103 : static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId,
104 : void *attachAt,
105 : PGShmemHeader **addr);
106 :
107 :
108 : /*
109 : * InternalIpcMemoryCreate(memKey, size)
110 : *
111 : * Attempt to create a new shared memory segment with the specified key.
112 : * Will fail (return NULL) if such a segment already exists. If successful,
113 : * attach the segment to the current process and return its attached address.
114 : * On success, callbacks are registered with on_shmem_exit to detach and
115 : * delete the segment when on_shmem_exit is called.
116 : *
117 : * If we fail with a failure code other than collision-with-existing-segment,
118 : * print out an error and abort. Other types of errors are not recoverable.
119 : */
120 : static void *
121 1936 : InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
122 : {
123 : IpcMemoryId shmid;
124 1936 : void *requestedAddress = NULL;
125 : void *memAddress;
126 :
127 : /*
128 : * Normally we just pass requestedAddress = NULL to shmat(), allowing the
129 : * system to choose where the segment gets mapped. But in an EXEC_BACKEND
130 : * build, it's possible for whatever is chosen in the postmaster to not
131 : * work for backends, due to variations in address space layout. As a
132 : * rather klugy workaround, allow the user to specify the address to use
133 : * via setting the environment variable PG_SHMEM_ADDR. (If this were of
134 : * interest for anything except debugging, we'd probably create a cleaner
135 : * and better-documented way to set it, such as a GUC.)
136 : */
137 : #ifdef EXEC_BACKEND
138 : {
139 : char *pg_shmem_addr = getenv("PG_SHMEM_ADDR");
140 :
141 : if (pg_shmem_addr)
142 : requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0);
143 : else
144 : {
145 : #if defined(__darwin__) && SIZEOF_VOID_P == 8
146 : /*
147 : * Provide a default value that is believed to avoid problems with
148 : * ASLR on the current macOS release.
149 : */
150 : requestedAddress = (void *) 0x80000000000;
151 : #endif
152 : }
153 : }
154 : #endif
155 :
156 1936 : shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
157 :
158 1936 : if (shmid < 0)
159 : {
160 18 : int shmget_errno = errno;
161 :
162 : /*
163 : * Fail quietly if error indicates a collision with existing segment.
164 : * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
165 : * we could get a permission violation instead? Also, EIDRM might
166 : * occur if an old seg is slated for destruction but not gone yet.
167 : */
168 18 : if (shmget_errno == EEXIST || shmget_errno == EACCES
169 : #ifdef EIDRM
170 0 : || shmget_errno == EIDRM
171 : #endif
172 : )
173 18 : return NULL;
174 :
175 : /*
176 : * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
177 : * there is an existing segment but it's smaller than "size" (this is
178 : * a result of poorly-thought-out ordering of error tests). To
179 : * distinguish between collision and invalid size in such cases, we
180 : * make a second try with size = 0. These kernels do not test size
181 : * against SHMMIN in the preexisting-segment case, so we will not get
182 : * EINVAL a second time if there is such a segment.
183 : */
184 0 : if (shmget_errno == EINVAL)
185 : {
186 0 : shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);
187 :
188 0 : if (shmid < 0)
189 : {
190 : /* As above, fail quietly if we verify a collision */
191 0 : if (errno == EEXIST || errno == EACCES
192 : #ifdef EIDRM
193 0 : || errno == EIDRM
194 : #endif
195 : )
196 0 : return NULL;
197 : /* Otherwise, fall through to report the original error */
198 : }
199 : else
200 : {
201 : /*
202 : * On most platforms we cannot get here because SHMMIN is
203 : * greater than zero. However, if we do succeed in creating a
204 : * zero-size segment, free it and then fall through to report
205 : * the original error.
206 : */
207 0 : if (shmctl(shmid, IPC_RMID, NULL) < 0)
208 0 : elog(LOG, "shmctl(%d, %d, 0) failed: %m",
209 : (int) shmid, IPC_RMID);
210 : }
211 : }
212 :
213 : /*
214 : * Else complain and abort.
215 : *
216 : * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
217 : * is violated. SHMALL violation might be reported as either ENOMEM
218 : * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
219 : * it should be. SHMMNI violation is ENOSPC, per spec. Just plain
220 : * not-enough-RAM is ENOMEM.
221 : */
222 0 : errno = shmget_errno;
223 0 : ereport(FATAL,
224 : (errmsg("could not create shared memory segment: %m"),
225 : errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).",
226 : (unsigned long) memKey, size,
227 : IPC_CREAT | IPC_EXCL | IPCProtection),
228 : (shmget_errno == EINVAL) ?
229 : errhint("This error usually means that PostgreSQL's request for a shared memory "
230 : "segment exceeded your kernel's SHMMAX parameter, or possibly that "
231 : "it is less than "
232 : "your kernel's SHMMIN parameter.\n"
233 : "The PostgreSQL documentation contains more information about shared "
234 : "memory configuration.") : 0,
235 : (shmget_errno == ENOMEM) ?
236 : errhint("This error usually means that PostgreSQL's request for a shared "
237 : "memory segment exceeded your kernel's SHMALL parameter. You might need "
238 : "to reconfigure the kernel with larger SHMALL.\n"
239 : "The PostgreSQL documentation contains more information about shared "
240 : "memory configuration.") : 0,
241 : (shmget_errno == ENOSPC) ?
242 : errhint("This error does *not* mean that you have run out of disk space. "
243 : "It occurs either if all available shared memory IDs have been taken, "
244 : "in which case you need to raise the SHMMNI parameter in your kernel, "
245 : "or because the system's overall limit for shared memory has been "
246 : "reached.\n"
247 : "The PostgreSQL documentation contains more information about shared "
248 : "memory configuration.") : 0));
249 : }
250 :
251 : /* Register on-exit routine to delete the new segment */
252 1918 : on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
253 :
254 : /* OK, should be able to attach to the segment */
255 1918 : memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS);
256 :
257 1918 : if (memAddress == (void *) -1)
258 0 : elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m",
259 : shmid, requestedAddress, PG_SHMAT_FLAGS);
260 :
261 : /* Register on-exit routine to detach new segment before deleting */
262 1918 : on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
263 :
264 : /*
265 : * Store shmem key and ID in data directory lockfile. Format to try to
266 : * keep it the same length always (trailing junk in the lockfile won't
267 : * hurt, but might confuse humans).
268 : */
269 : {
270 : char line[64];
271 :
272 1918 : sprintf(line, "%9lu %9lu",
273 : (unsigned long) memKey, (unsigned long) shmid);
274 1918 : AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
275 : }
276 :
277 1918 : return memAddress;
278 : }
279 :
280 : /****************************************************************************/
281 : /* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
282 : /* from process' address space */
283 : /* (called as an on_shmem_exit callback, hence funny argument list) */
284 : /****************************************************************************/
285 : static void
286 1912 : IpcMemoryDetach(int status, Datum shmaddr)
287 : {
288 : /* Detach System V shared memory block. */
289 1912 : if (shmdt(DatumGetPointer(shmaddr)) < 0)
290 0 : elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr));
291 1912 : }
292 :
293 : /****************************************************************************/
294 : /* IpcMemoryDelete(status, shmId) deletes a shared memory segment */
295 : /* (called as an on_shmem_exit callback, hence funny argument list) */
296 : /****************************************************************************/
297 : static void
298 1912 : IpcMemoryDelete(int status, Datum shmId)
299 : {
300 1912 : if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0)
301 0 : elog(LOG, "shmctl(%d, %d, 0) failed: %m",
302 : DatumGetInt32(shmId), IPC_RMID);
303 1912 : }
304 :
305 : /*
306 : * PGSharedMemoryIsInUse
307 : *
308 : * Is a previously-existing shmem segment still existing and in use?
309 : *
310 : * The point of this exercise is to detect the case where a prior postmaster
311 : * crashed, but it left child backends that are still running. Therefore
312 : * we only care about shmem segments that are associated with the intended
313 : * DataDir. This is an important consideration since accidental matches of
314 : * shmem segment IDs are reasonably common.
315 : */
316 : bool
317 4 : PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
318 : {
319 : PGShmemHeader *memAddress;
320 : IpcMemoryState state;
321 :
322 4 : state = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &memAddress);
323 4 : if (memAddress && shmdt(memAddress) < 0)
324 0 : elog(LOG, "shmdt(%p) failed: %m", memAddress);
325 4 : switch (state)
326 : {
327 4 : case SHMSTATE_ENOENT:
328 : case SHMSTATE_FOREIGN:
329 : case SHMSTATE_UNATTACHED:
330 4 : return false;
331 0 : case SHMSTATE_ANALYSIS_FAILURE:
332 : case SHMSTATE_ATTACHED:
333 0 : return true;
334 : }
335 0 : return true;
336 : }
337 :
338 : /*
339 : * Test for a segment with id shmId; see comment at IpcMemoryState.
340 : *
341 : * If the segment exists, we'll attempt to attach to it, using attachAt
342 : * if that's not NULL (but it's best to pass NULL if possible).
343 : *
344 : * *addr is set to the segment memory address if we attached to it, else NULL.
345 : */
346 : static IpcMemoryState
347 22 : PGSharedMemoryAttach(IpcMemoryId shmId,
348 : void *attachAt,
349 : PGShmemHeader **addr)
350 : {
351 : struct shmid_ds shmStat;
352 : struct stat statbuf;
353 : PGShmemHeader *hdr;
354 :
355 22 : *addr = NULL;
356 :
357 : /*
358 : * First, try to stat the shm segment ID, to see if it exists at all.
359 : */
360 22 : if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
361 : {
362 : /*
363 : * EINVAL actually has multiple possible causes documented in the
364 : * shmctl man page, but we assume it must mean the segment no longer
365 : * exists.
366 : */
367 0 : if (errno == EINVAL)
368 0 : return SHMSTATE_ENOENT;
369 :
370 : /*
371 : * EACCES implies we have no read permission, which means it is not a
372 : * Postgres shmem segment (or at least, not one that is relevant to
373 : * our data directory).
374 : */
375 0 : if (errno == EACCES)
376 0 : return SHMSTATE_FOREIGN;
377 :
378 : /*
379 : * Some Linux kernel versions (in fact, all of them as of July 2007)
380 : * sometimes return EIDRM when EINVAL is correct. The Linux kernel
381 : * actually does not have any internal state that would justify
382 : * returning EIDRM, so we can get away with assuming that EIDRM is
383 : * equivalent to EINVAL on that platform.
384 : */
385 : #ifdef HAVE_LINUX_EIDRM_BUG
386 0 : if (errno == EIDRM)
387 0 : return SHMSTATE_ENOENT;
388 : #endif
389 :
390 : /*
391 : * Otherwise, we had better assume that the segment is in use. The
392 : * only likely case is (non-Linux, assumed spec-compliant) EIDRM,
393 : * which implies that the segment has been IPC_RMID'd but there are
394 : * still processes attached to it.
395 : */
396 0 : return SHMSTATE_ANALYSIS_FAILURE;
397 : }
398 :
399 : /*
400 : * Try to attach to the segment and see if it matches our data directory.
401 : * This avoids any risk of duplicate-shmem-key conflicts on machines that
402 : * are running several postmasters under the same userid.
403 : *
404 : * (When we're called from PGSharedMemoryCreate, this stat call is
405 : * duplicative; but since this isn't a high-traffic case it's not worth
406 : * trying to optimize.)
407 : */
408 22 : if (stat(DataDir, &statbuf) < 0)
409 0 : return SHMSTATE_ANALYSIS_FAILURE; /* can't stat; be conservative */
410 :
411 22 : hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS);
412 22 : if (hdr == (PGShmemHeader *) -1)
413 : {
414 : /*
415 : * Attachment failed. The cases we're interested in are the same as
416 : * for the shmctl() call above. In particular, note that the owning
417 : * postmaster could have terminated and removed the segment between
418 : * shmctl() and shmat().
419 : *
420 : * If attachAt isn't NULL, it's possible that EINVAL reflects a
421 : * problem with that address not a vanished segment, so it's best to
422 : * pass NULL when probing for conflicting segments.
423 : */
424 0 : if (errno == EINVAL)
425 0 : return SHMSTATE_ENOENT; /* segment disappeared */
426 0 : if (errno == EACCES)
427 0 : return SHMSTATE_FOREIGN; /* must be non-Postgres */
428 : #ifdef HAVE_LINUX_EIDRM_BUG
429 0 : if (errno == EIDRM)
430 0 : return SHMSTATE_ENOENT; /* segment disappeared */
431 : #endif
432 : /* Otherwise, be conservative. */
433 0 : return SHMSTATE_ANALYSIS_FAILURE;
434 : }
435 22 : *addr = hdr;
436 :
437 22 : if (hdr->magic != PGShmemMagic ||
438 14 : hdr->device != statbuf.st_dev ||
439 14 : hdr->inode != statbuf.st_ino)
440 : {
441 : /*
442 : * It's either not a Postgres segment, or not one for my data
443 : * directory.
444 : */
445 8 : return SHMSTATE_FOREIGN;
446 : }
447 :
448 : /*
449 : * It does match our data directory, so now test whether any processes are
450 : * still attached to it. (We are, now, but the shm_nattch result is from
451 : * before we attached to it.)
452 : */
453 14 : return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED;
454 : }
455 :
456 : /*
457 : * Identify the huge page size to use, and compute the related mmap flags.
458 : *
459 : * Some Linux kernel versions have a bug causing mmap() to fail on requests
460 : * that are not a multiple of the hugepage size. Versions without that bug
461 : * instead silently round the request up to the next hugepage multiple ---
462 : * and then munmap() fails when we give it a size different from that.
463 : * So we have to round our request up to a multiple of the actual hugepage
464 : * size to avoid trouble.
465 : *
466 : * Doing the round-up ourselves also lets us make use of the extra memory,
467 : * rather than just wasting it. Currently, we just increase the available
468 : * space recorded in the shmem header, which will make the extra usable for
469 : * purposes such as additional locktable entries. Someday, for very large
470 : * hugepage sizes, we might want to think about more invasive strategies,
471 : * such as increasing shared_buffers to absorb the extra space.
472 : *
473 : * Returns the (real, assumed or config provided) page size into
474 : * *hugepagesize, and the hugepage-related mmap flags to use into
475 : * *mmap_flags if requested by the caller. If huge pages are not supported,
476 : * *hugepagesize and *mmap_flags are set to 0.
477 : */
478 : void
479 3566 : GetHugePageSize(Size *hugepagesize, int *mmap_flags)
480 : {
481 : #ifdef MAP_HUGETLB
482 :
483 3566 : Size default_hugepagesize = 0;
484 3566 : Size hugepagesize_local = 0;
485 3566 : int mmap_flags_local = 0;
486 :
487 : /*
488 : * System-dependent code to find out the default huge page size.
489 : *
490 : * On Linux, read /proc/meminfo looking for a line like "Hugepagesize:
491 : * nnnn kB". Ignore any failures, falling back to the preset default.
492 : */
493 : #ifdef __linux__
494 :
495 : {
496 3566 : FILE *fp = AllocateFile("/proc/meminfo", "r");
497 : char buf[128];
498 : unsigned int sz;
499 : char ch;
500 :
501 3566 : if (fp)
502 : {
503 167602 : while (fgets(buf, sizeof(buf), fp))
504 : {
505 167602 : if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2)
506 : {
507 3566 : if (ch == 'k')
508 : {
509 3566 : default_hugepagesize = sz * (Size) 1024;
510 3566 : break;
511 : }
512 : /* We could accept other units besides kB, if needed */
513 : }
514 : }
515 3566 : FreeFile(fp);
516 : }
517 : }
518 : #endif /* __linux__ */
519 :
520 3566 : if (huge_page_size != 0)
521 : {
522 : /* If huge page size is requested explicitly, use that. */
523 0 : hugepagesize_local = (Size) huge_page_size * 1024;
524 : }
525 3566 : else if (default_hugepagesize != 0)
526 : {
527 : /* Otherwise use the system default, if we have it. */
528 3566 : hugepagesize_local = default_hugepagesize;
529 : }
530 : else
531 : {
532 : /*
533 : * If we fail to find out the system's default huge page size, or no
534 : * huge page size is requested explicitly, assume it is 2MB. This will
535 : * work fine when the actual size is less. If it's more, we might get
536 : * mmap() or munmap() failures due to unaligned requests; but at this
537 : * writing, there are no reports of any non-Linux systems being picky
538 : * about that.
539 : */
540 0 : hugepagesize_local = 2 * 1024 * 1024;
541 : }
542 :
543 3566 : mmap_flags_local = MAP_HUGETLB;
544 :
545 : /*
546 : * On recent enough Linux, also include the explicit page size, if
547 : * necessary.
548 : */
549 : #if defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT)
550 3566 : if (hugepagesize_local != default_hugepagesize)
551 : {
552 0 : int shift = pg_ceil_log2_64(hugepagesize_local);
553 :
554 0 : mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
555 : }
556 : #endif
557 :
558 : /* assign the results found */
559 3566 : if (mmap_flags)
560 1922 : *mmap_flags = mmap_flags_local;
561 3566 : if (hugepagesize)
562 3566 : *hugepagesize = hugepagesize_local;
563 :
564 : #else
565 :
566 : if (hugepagesize)
567 : *hugepagesize = 0;
568 : if (mmap_flags)
569 : *mmap_flags = 0;
570 :
571 : #endif /* MAP_HUGETLB */
572 3566 : }
573 :
574 : /*
575 : * GUC check_hook for huge_page_size
576 : */
577 : bool
578 1982 : check_huge_page_size(int *newval, void **extra, GucSource source)
579 : {
580 : #if !(defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT))
581 : /* Recent enough Linux only, for now. See GetHugePageSize(). */
582 : if (*newval != 0)
583 : {
584 : GUC_check_errdetail("\"huge_page_size\" must be 0 on this platform.");
585 : return false;
586 : }
587 : #endif
588 1982 : return true;
589 : }
590 :
591 : /*
592 : * Creates an anonymous mmap()ed shared memory segment.
593 : *
594 : * Pass the requested size in *size. This function will modify *size to the
595 : * actual size of the allocation, if it ends up allocating a segment that is
596 : * larger than requested.
597 : */
598 : static void *
599 1922 : CreateAnonymousSegment(Size *size)
600 : {
601 1922 : Size allocsize = *size;
602 1922 : void *ptr = MAP_FAILED;
603 1922 : int mmap_errno = 0;
604 :
605 : #ifndef MAP_HUGETLB
606 : /* PGSharedMemoryCreate should have dealt with this case */
607 : Assert(huge_pages != HUGE_PAGES_ON);
608 : #else
609 1922 : if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
610 : {
611 : /*
612 : * Round up the request size to a suitable large value.
613 : */
614 : Size hugepagesize;
615 : int mmap_flags;
616 :
617 1922 : GetHugePageSize(&hugepagesize, &mmap_flags);
618 :
619 1922 : if (allocsize % hugepagesize != 0)
620 1922 : allocsize += hugepagesize - (allocsize % hugepagesize);
621 :
622 1922 : ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
623 : PG_MMAP_FLAGS | mmap_flags, -1, 0);
624 1922 : mmap_errno = errno;
625 1922 : if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
626 1922 : elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
627 : allocsize);
628 : }
629 : #endif
630 :
631 : /*
632 : * Report whether huge pages are in use. This needs to be tracked before
633 : * the second mmap() call if attempting to use huge pages failed
634 : * previously.
635 : */
636 1922 : SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? "off" : "on",
637 : PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
638 :
639 1922 : if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
640 : {
641 : /*
642 : * Use the original size, not the rounded-up value, when falling back
643 : * to non-huge pages.
644 : */
645 1922 : allocsize = *size;
646 1922 : ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
647 : PG_MMAP_FLAGS, -1, 0);
648 1922 : mmap_errno = errno;
649 : }
650 :
651 1922 : if (ptr == MAP_FAILED)
652 : {
653 0 : errno = mmap_errno;
654 0 : ereport(FATAL,
655 : (errmsg("could not map anonymous shared memory: %m"),
656 : (mmap_errno == ENOMEM) ?
657 : errhint("This error usually means that PostgreSQL's request "
658 : "for a shared memory segment exceeded available memory, "
659 : "swap space, or huge pages. To reduce the request size "
660 : "(currently %zu bytes), reduce PostgreSQL's shared "
661 : "memory usage, perhaps by reducing \"shared_buffers\" or "
662 : "\"max_connections\".",
663 : allocsize) : 0));
664 : }
665 :
666 1922 : *size = allocsize;
667 1922 : return ptr;
668 : }
669 :
670 : /*
671 : * AnonymousShmemDetach --- detach from an anonymous mmap'd block
672 : * (called as an on_shmem_exit callback, hence funny argument list)
673 : */
674 : static void
675 1916 : AnonymousShmemDetach(int status, Datum arg)
676 : {
677 : /* Release anonymous shared memory block, if any. */
678 1916 : if (AnonymousShmem != NULL)
679 : {
680 1916 : if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
681 0 : elog(LOG, "munmap(%p, %zu) failed: %m",
682 : AnonymousShmem, AnonymousShmemSize);
683 1916 : AnonymousShmem = NULL;
684 : }
685 1916 : }
686 :
687 : /*
688 : * PGSharedMemoryCreate
689 : *
690 : * Create a shared memory segment of the given size and initialize its
691 : * standard header. Also, register an on_shmem_exit callback to release
692 : * the storage.
693 : *
694 : * Dead Postgres segments pertinent to this DataDir are recycled if found, but
695 : * we do not fail upon collision with foreign shmem segments. The idea here
696 : * is to detect and re-use keys that may have been assigned by a crashed
697 : * postmaster or backend.
698 : */
699 : PGShmemHeader *
700 1922 : PGSharedMemoryCreate(Size size,
701 : PGShmemHeader **shim)
702 : {
703 : IpcMemoryKey NextShmemSegID;
704 : void *memAddress;
705 : PGShmemHeader *hdr;
706 : struct stat statbuf;
707 : Size sysvsize;
708 :
709 : /*
710 : * We use the data directory's ID info (inode and device numbers) to
711 : * positively identify shmem segments associated with this data dir, and
712 : * also as seeds for searching for a free shmem key.
713 : */
714 1922 : if (stat(DataDir, &statbuf) < 0)
715 0 : ereport(FATAL,
716 : (errcode_for_file_access(),
717 : errmsg("could not stat data directory \"%s\": %m",
718 : DataDir)));
719 :
720 : /* Complain if hugepages demanded but we can't possibly support them */
721 : #if !defined(MAP_HUGETLB)
722 : if (huge_pages == HUGE_PAGES_ON)
723 : ereport(ERROR,
724 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
725 : errmsg("huge pages not supported on this platform")));
726 : #endif
727 :
728 : /* For now, we don't support huge pages in SysV memory */
729 1922 : if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP)
730 0 : ereport(ERROR,
731 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
732 : errmsg("huge pages not supported with the current \"shared_memory_type\" setting")));
733 :
734 : /* Room for a header? */
735 : Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
736 :
737 1922 : if (shared_memory_type == SHMEM_TYPE_MMAP)
738 : {
739 1922 : AnonymousShmem = CreateAnonymousSegment(&size);
740 1922 : AnonymousShmemSize = size;
741 :
742 : /* Register on-exit routine to unmap the anonymous segment */
743 1922 : on_shmem_exit(AnonymousShmemDetach, (Datum) 0);
744 :
745 : /* Now we need only allocate a minimal-sized SysV shmem block. */
746 1922 : sysvsize = sizeof(PGShmemHeader);
747 : }
748 : else
749 : {
750 0 : sysvsize = size;
751 :
752 : /* huge pages are only available with mmap */
753 0 : SetConfigOption("huge_pages_status", "off",
754 : PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
755 : }
756 :
757 : /*
758 : * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to
759 : * ensure no more than one postmaster per data directory can enter this
760 : * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure
761 : * that, but prefer fixing it over coping here.)
762 : */
763 1922 : NextShmemSegID = statbuf.st_ino;
764 :
765 : for (;;)
766 14 : {
767 : IpcMemoryId shmid;
768 : PGShmemHeader *oldhdr;
769 : IpcMemoryState state;
770 :
771 : /* Try to create new segment */
772 1936 : memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
773 1936 : if (memAddress)
774 1918 : break; /* successful create and attach */
775 :
776 : /* Check shared memory and possibly remove and recreate */
777 :
778 : /*
779 : * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN.
780 : * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can
781 : * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN.
782 : */
783 18 : shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0);
784 18 : if (shmid < 0)
785 : {
786 0 : oldhdr = NULL;
787 0 : state = SHMSTATE_FOREIGN;
788 : }
789 : else
790 18 : state = PGSharedMemoryAttach(shmid, NULL, &oldhdr);
791 :
792 18 : switch (state)
793 : {
794 4 : case SHMSTATE_ANALYSIS_FAILURE:
795 : case SHMSTATE_ATTACHED:
796 4 : ereport(FATAL,
797 : (errcode(ERRCODE_LOCK_FILE_EXISTS),
798 : errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use",
799 : (unsigned long) NextShmemSegID,
800 : (unsigned long) shmid),
801 : errhint("Terminate any old server processes associated with data directory \"%s\".",
802 : DataDir)));
803 : break;
804 0 : case SHMSTATE_ENOENT:
805 :
806 : /*
807 : * To our surprise, some other process deleted since our last
808 : * InternalIpcMemoryCreate(). Moments earlier, we would have
809 : * seen SHMSTATE_FOREIGN. Try that same ID again.
810 : */
811 0 : elog(LOG,
812 : "shared memory block (key %lu, ID %lu) deleted during startup",
813 : (unsigned long) NextShmemSegID,
814 : (unsigned long) shmid);
815 0 : break;
816 8 : case SHMSTATE_FOREIGN:
817 8 : NextShmemSegID++;
818 8 : break;
819 6 : case SHMSTATE_UNATTACHED:
820 :
821 : /*
822 : * The segment pertains to DataDir, and every process that had
823 : * used it has died or detached. Zap it, if possible, and any
824 : * associated dynamic shared memory segments, as well. This
825 : * shouldn't fail, but if it does, assume the segment belongs
826 : * to someone else after all, and try the next candidate.
827 : * Otherwise, try again to create the segment. That may fail
828 : * if some other process creates the same shmem key before we
829 : * do, in which case we'll try the next key.
830 : */
831 6 : if (oldhdr->dsm_control != 0)
832 6 : dsm_cleanup_using_control_segment(oldhdr->dsm_control);
833 6 : if (shmctl(shmid, IPC_RMID, NULL) < 0)
834 0 : NextShmemSegID++;
835 6 : break;
836 : }
837 :
838 14 : if (oldhdr && shmdt(oldhdr) < 0)
839 0 : elog(LOG, "shmdt(%p) failed: %m", oldhdr);
840 : }
841 :
842 : /* Initialize new segment. */
843 1918 : hdr = (PGShmemHeader *) memAddress;
844 1918 : hdr->creatorPID = getpid();
845 1918 : hdr->magic = PGShmemMagic;
846 1918 : hdr->dsm_control = 0;
847 :
848 : /* Fill in the data directory ID info, too */
849 1918 : hdr->device = statbuf.st_dev;
850 1918 : hdr->inode = statbuf.st_ino;
851 :
852 : /*
853 : * Initialize space allocation status for segment.
854 : */
855 1918 : hdr->totalsize = size;
856 1918 : hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
857 1918 : *shim = hdr;
858 :
859 : /* Save info for possible future use */
860 1918 : UsedShmemSegAddr = memAddress;
861 1918 : UsedShmemSegID = (unsigned long) NextShmemSegID;
862 :
863 : /*
864 : * If AnonymousShmem is NULL here, then we're not using anonymous shared
865 : * memory, and should return a pointer to the System V shared memory
866 : * block. Otherwise, the System V shared memory block is only a shim, and
867 : * we must return a pointer to the real block.
868 : */
869 1918 : if (AnonymousShmem == NULL)
870 0 : return hdr;
871 1918 : memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
872 1918 : return (PGShmemHeader *) AnonymousShmem;
873 : }
874 :
875 : #ifdef EXEC_BACKEND
876 :
877 : /*
878 : * PGSharedMemoryReAttach
879 : *
880 : * This is called during startup of a postmaster child process to re-attach to
881 : * an already existing shared memory segment. This is needed only in the
882 : * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory
883 : * segment attachment via fork().
884 : *
885 : * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
886 : * routine. The caller must have already restored them to the postmaster's
887 : * values.
888 : */
889 : void
890 : PGSharedMemoryReAttach(void)
891 : {
892 : IpcMemoryId shmid;
893 : PGShmemHeader *hdr;
894 : IpcMemoryState state;
895 : void *origUsedShmemSegAddr = UsedShmemSegAddr;
896 :
897 : Assert(UsedShmemSegAddr != NULL);
898 : Assert(IsUnderPostmaster);
899 :
900 : #ifdef __CYGWIN__
901 : /* cygipc (currently) appears to not detach on exec. */
902 : PGSharedMemoryDetach();
903 : UsedShmemSegAddr = origUsedShmemSegAddr;
904 : #endif
905 :
906 : elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
907 : shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0);
908 : if (shmid < 0)
909 : state = SHMSTATE_FOREIGN;
910 : else
911 : state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr);
912 : if (state != SHMSTATE_ATTACHED)
913 : elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
914 : (int) UsedShmemSegID, UsedShmemSegAddr);
915 : if (hdr != origUsedShmemSegAddr)
916 : elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
917 : hdr, origUsedShmemSegAddr);
918 : dsm_set_control_handle(hdr->dsm_control);
919 :
920 : UsedShmemSegAddr = hdr; /* probably redundant */
921 : }
922 :
923 : /*
924 : * PGSharedMemoryNoReAttach
925 : *
926 : * This is called during startup of a postmaster child process when we choose
927 : * *not* to re-attach to the existing shared memory segment. We must clean up
928 : * to leave things in the appropriate state. This is not used in the non
929 : * EXEC_BACKEND case, either.
930 : *
931 : * The child process startup logic might or might not call PGSharedMemoryDetach
932 : * after this; make sure that it will be a no-op if called.
933 : *
934 : * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
935 : * routine. The caller must have already restored them to the postmaster's
936 : * values.
937 : */
938 : void
939 : PGSharedMemoryNoReAttach(void)
940 : {
941 : Assert(UsedShmemSegAddr != NULL);
942 : Assert(IsUnderPostmaster);
943 :
944 : #ifdef __CYGWIN__
945 : /* cygipc (currently) appears to not detach on exec. */
946 : PGSharedMemoryDetach();
947 : #endif
948 :
949 : /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */
950 : UsedShmemSegAddr = NULL;
951 : /* And the same for UsedShmemSegID. */
952 : UsedShmemSegID = 0;
953 : }
954 :
955 : #endif /* EXEC_BACKEND */
956 :
957 : /*
958 : * PGSharedMemoryDetach
959 : *
960 : * Detach from the shared memory segment, if still attached. This is not
961 : * intended to be called explicitly by the process that originally created the
962 : * segment (it will have on_shmem_exit callback(s) registered to do that).
963 : * Rather, this is for subprocesses that have inherited an attachment and want
964 : * to get rid of it.
965 : *
966 : * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
967 : * routine, also AnonymousShmem and AnonymousShmemSize.
968 : */
969 : void
970 2 : PGSharedMemoryDetach(void)
971 : {
972 2 : if (UsedShmemSegAddr != NULL)
973 : {
974 2 : if ((shmdt(UsedShmemSegAddr) < 0)
975 : #if defined(EXEC_BACKEND) && defined(__CYGWIN__)
976 : /* Work-around for cygipc exec bug */
977 : && shmdt(NULL) < 0
978 : #endif
979 : )
980 0 : elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
981 2 : UsedShmemSegAddr = NULL;
982 : }
983 :
984 2 : if (AnonymousShmem != NULL)
985 : {
986 2 : if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
987 0 : elog(LOG, "munmap(%p, %zu) failed: %m",
988 : AnonymousShmem, AnonymousShmemSize);
989 2 : AnonymousShmem = NULL;
990 : }
991 2 : }
|