Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * sysv_shmem.c
4 : * Implement shared memory using SysV facilities
5 : *
6 : * These routines used to be a fairly thin layer on top of SysV shared
7 : * memory functionality. With the addition of anonymous-shmem logic,
8 : * they're a bit fatter now. We still require a SysV shmem block to
9 : * exist, though, because mmap'd shmem provides no way to find out how
10 : * many processes are attached, which we need for interlocking purposes.
11 : *
12 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
13 : * Portions Copyright (c) 1994, Regents of the University of California
14 : *
15 : * IDENTIFICATION
16 : * src/backend/port/sysv_shmem.c
17 : *
18 : *-------------------------------------------------------------------------
19 : */
20 : #include "postgres.h"
21 :
22 : #include <signal.h>
23 : #include <unistd.h>
24 : #include <sys/file.h>
25 : #include <sys/ipc.h>
26 : #include <sys/mman.h>
27 : #include <sys/shm.h>
28 : #include <sys/stat.h>
29 :
30 : #include "miscadmin.h"
31 : #include "port/pg_bitutils.h"
32 : #include "portability/mem.h"
33 : #include "storage/dsm.h"
34 : #include "storage/fd.h"
35 : #include "storage/ipc.h"
36 : #include "storage/pg_shmem.h"
37 : #include "storage/shmem.h"
38 : #include "utils/guc.h"
39 : #include "utils/guc_hooks.h"
40 : #include "utils/pidfile.h"
41 :
42 :
43 : /*
44 : * As of PostgreSQL 9.3, we normally allocate only a very small amount of
45 : * System V shared memory, and only for the purposes of providing an
46 : * interlock to protect the data directory. The real shared memory block
47 : * is allocated using mmap(). This works around the problem that many
48 : * systems have very low limits on the amount of System V shared memory
49 : * that can be allocated. Even a limit of a few megabytes will be enough
50 : * to run many copies of PostgreSQL without needing to adjust system settings.
51 : *
52 : * We assume that no one will attempt to run PostgreSQL 9.3 or later on
53 : * systems that are ancient enough that anonymous shared memory is not
54 : * supported, such as pre-2.4 versions of Linux. If that turns out to be
55 : * false, we might need to add compile and/or run-time tests here and do this
56 : * only if the running kernel supports it.
57 : *
58 : * However, we must always disable this logic in the EXEC_BACKEND case, and
59 : * fall back to the old method of allocating the entire segment using System V
60 : * shared memory, because there's no way to attach an anonymous mmap'd segment
61 : * to a process after exec(). Since EXEC_BACKEND is intended only for
62 : * developer use, this shouldn't be a big problem. Because of this, we do
63 : * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below.
64 : *
65 : * As of PostgreSQL 12, we regained the ability to use a large System V shared
66 : * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set
67 : * to sysv (though this is not the default).
68 : */
69 :
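/*
 * Editor's sketch (not part of the PostgreSQL sources): a minimal,
 * self-contained illustration of the strategy described above, i.e. a tiny
 * System V segment kept only for its shm_nattch-based interlock, with the
 * real shared memory coming from an anonymous mmap().  The guard macro
 * PG_SYSV_SHMEM_EXAMPLE is hypothetical and never defined; cleanup
 * registration and retry-on-collision logic are omitted for brevity.
 */
#ifdef PG_SYSV_SHMEM_EXAMPLE
static void *
example_hybrid_create(key_t key, Size realsize, int *interlock_shmid)
{
    void       *real;

    /* Tiny System V block: only its attach count (shm_nattch) matters. */
    *interlock_shmid = shmget(key, sizeof(PGShmemHeader),
                              IPC_CREAT | IPC_EXCL | IPCProtection);
    if (*interlock_shmid < 0)
        return NULL;            /* key collision or kernel limit */

    /* The real shared memory block comes from an anonymous mmap(). */
    real = mmap(NULL, realsize, PROT_READ | PROT_WRITE,
                MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (real == MAP_FAILED)
    {
        (void) shmctl(*interlock_shmid, IPC_RMID, NULL);
        return NULL;
    }
    return real;
}
#endif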
70 :
71 : typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */
72 : typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */
73 :
74 : /*
75 : * How does a given IpcMemoryId relate to this PostgreSQL process?
76 : *
77 : * One could recycle unattached segments of different data directories if we
78 : * distinguished that case from other SHMSTATE_FOREIGN cases. Doing so would
79 : * cause us to visit less of the key space, making us less likely to detect a
80 : * SHMSTATE_ATTACHED key. It would also complicate the concurrency analysis,
81 : * in that postmasters of different data directories could simultaneously
82 : * attempt to recycle a given key. We'll waste keys longer in some cases, but
83 : * avoiding the problems of the alternative justifies that loss.
84 : */
85 : typedef enum
86 : {
87 : SHMSTATE_ANALYSIS_FAILURE, /* unexpected failure to analyze the ID */
88 : SHMSTATE_ATTACHED, /* pertinent to DataDir, has attached PIDs */
89 : SHMSTATE_ENOENT, /* no segment of that ID */
90 : SHMSTATE_FOREIGN, /* exists, but not pertinent to DataDir */
91 : SHMSTATE_UNATTACHED, /* pertinent to DataDir, no attached PIDs */
92 : } IpcMemoryState;
93 :
94 :
95 : unsigned long UsedShmemSegID = 0;
96 : void *UsedShmemSegAddr = NULL;
97 :
98 : static Size AnonymousShmemSize;
99 : static void *AnonymousShmem = NULL;
100 :
101 : static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
102 : static void IpcMemoryDetach(int status, Datum shmaddr);
103 : static void IpcMemoryDelete(int status, Datum shmId);
104 : static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId,
105 : void *attachAt,
106 : PGShmemHeader **addr);
107 :
108 :
109 : /*
110 : * InternalIpcMemoryCreate(memKey, size)
111 : *
112 : * Attempt to create a new shared memory segment with the specified key.
113 : * Will fail (return NULL) if such a segment already exists. If successful,
114 : * attach the segment to the current process and return its attached address.
115 : * On success, on_shmem_exit callbacks are registered to detach and
116 : * delete the segment at process exit.
117 : *
118 : * If we fail with a failure code other than collision-with-existing-segment,
119 : * print out an error and abort. Other types of errors are not recoverable.
120 : */
121 : static void *
122 2292 : InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
123 : {
124 : IpcMemoryId shmid;
125 2292 : void *requestedAddress = NULL;
126 : void *memAddress;
127 :
128 : /*
129 : * Normally we just pass requestedAddress = NULL to shmat(), allowing the
130 : * system to choose where the segment gets mapped. But in an EXEC_BACKEND
131 : * build, it's possible for whatever is chosen in the postmaster to not
132 : * work for backends, due to variations in address space layout. As a
133 : * rather klugy workaround, allow the user to specify the address to use
134 : * via setting the environment variable PG_SHMEM_ADDR. (If this were of
135 : * interest for anything except debugging, we'd probably create a cleaner
136 : * and better-documented way to set it, such as a GUC.)
137 : */
138 : #ifdef EXEC_BACKEND
139 : {
140 : char *pg_shmem_addr = getenv("PG_SHMEM_ADDR");
141 :
142 : if (pg_shmem_addr)
143 : requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0);
144 : else
145 : {
146 : #if defined(__darwin__) && SIZEOF_VOID_P == 8
147 : /*
148 : * Provide a default value that is believed to avoid problems with
149 : * ASLR on the current macOS release.
150 : */
151 : requestedAddress = (void *) 0x80000000000;
152 : #endif
153 : }
154 : }
155 : #endif
156 :
157 2292 : shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
158 :
159 2292 : if (shmid < 0)
160 : {
161 12 : int shmget_errno = errno;
162 :
163 : /*
164 : * Fail quietly if error indicates a collision with existing segment.
165 : * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
166 : * we could get a permission violation instead? Also, EIDRM might
167 : * occur if an old seg is slated for destruction but not gone yet.
168 : */
169 12 : if (shmget_errno == EEXIST || shmget_errno == EACCES
170 : #ifdef EIDRM
171 0 : || shmget_errno == EIDRM
172 : #endif
173 : )
174 12 : return NULL;
175 :
176 : /*
177 : * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
178 : * there is an existing segment but it's smaller than "size" (this is
179 : * a result of poorly-thought-out ordering of error tests). To
180 : * distinguish between collision and invalid size in such cases, we
181 : * make a second try with size = 0. These kernels do not test size
182 : * against SHMMIN in the preexisting-segment case, so we will not get
183 : * EINVAL a second time if there is such a segment.
184 : */
185 0 : if (shmget_errno == EINVAL)
186 : {
187 0 : shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);
188 :
189 0 : if (shmid < 0)
190 : {
191 : /* As above, fail quietly if we verify a collision */
192 0 : if (errno == EEXIST || errno == EACCES
193 : #ifdef EIDRM
194 0 : || errno == EIDRM
195 : #endif
196 : )
197 0 : return NULL;
198 : /* Otherwise, fall through to report the original error */
199 : }
200 : else
201 : {
202 : /*
203 : * On most platforms we cannot get here because SHMMIN is
204 : * greater than zero. However, if we do succeed in creating a
205 : * zero-size segment, free it and then fall through to report
206 : * the original error.
207 : */
208 0 : if (shmctl(shmid, IPC_RMID, NULL) < 0)
209 0 : elog(LOG, "shmctl(%d, %d, 0) failed: %m",
210 : shmid, IPC_RMID);
211 : }
212 : }
213 :
214 : /*
215 : * Else complain and abort.
216 : *
217 : * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
218 : * is violated. SHMALL violation might be reported as either ENOMEM
219 : * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
220 : * it should be. SHMMNI violation is ENOSPC, per spec. Just plain
221 : * not-enough-RAM is ENOMEM.
222 : */
223 0 : errno = shmget_errno;
224 0 : ereport(FATAL,
225 : (errmsg("could not create shared memory segment: %m"),
226 : errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).",
227 : (unsigned long) memKey, size,
228 : IPC_CREAT | IPC_EXCL | IPCProtection),
229 : (shmget_errno == EINVAL) ?
230 : errhint("This error usually means that PostgreSQL's request for a shared memory "
231 : "segment exceeded your kernel's SHMMAX parameter, or possibly that "
232 : "it is less than "
233 : "your kernel's SHMMIN parameter.\n"
234 : "The PostgreSQL documentation contains more information about shared "
235 : "memory configuration.") : 0,
236 : (shmget_errno == ENOMEM) ?
237 : errhint("This error usually means that PostgreSQL's request for a shared "
238 : "memory segment exceeded your kernel's SHMALL parameter. You might need "
239 : "to reconfigure the kernel with larger SHMALL.\n"
240 : "The PostgreSQL documentation contains more information about shared "
241 : "memory configuration.") : 0,
242 : (shmget_errno == ENOSPC) ?
243 : errhint("This error does *not* mean that you have run out of disk space. "
244 : "It occurs either if all available shared memory IDs have been taken, "
245 : "in which case you need to raise the SHMMNI parameter in your kernel, "
246 : "or because the system's overall limit for shared memory has been "
247 : "reached.\n"
248 : "The PostgreSQL documentation contains more information about shared "
249 : "memory configuration.") : 0));
250 : }
251 :
252 : /* Register on-exit routine to delete the new segment */
253 2280 : on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
254 :
255 : /* OK, should be able to attach to the segment */
256 2280 : memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS);
257 :
258 2280 : if (memAddress == (void *) -1)
259 0 : elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m",
260 : shmid, requestedAddress, PG_SHMAT_FLAGS);
261 :
262 : /* Register on-exit routine to detach new segment before deleting */
263 2280 : on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
264 :
265 : /*
266 : * Store shmem key and ID in data directory lockfile. Format to try to
267 : * keep it the same length always (trailing junk in the lockfile won't
268 : * hurt, but might confuse humans).
269 : */
270 : {
271 : char line[64];
272 :
273 2280 : sprintf(line, "%9lu %9lu",
274 : (unsigned long) memKey, (unsigned long) shmid);
275 2280 : AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
276 : }
277 :
278 2280 : return memAddress;
279 : }
280 :
281 : /****************************************************************************/
282 : /* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
283 : /* from process' address space */
284 : /* (called as an on_shmem_exit callback, hence funny argument list) */
285 : /****************************************************************************/
286 : static void
287 2280 : IpcMemoryDetach(int status, Datum shmaddr)
288 : {
289 : /* Detach System V shared memory block. */
290 2280 : if (shmdt(DatumGetPointer(shmaddr)) < 0)
291 0 : elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr));
292 2280 : }
293 :
294 : /****************************************************************************/
295 : /* IpcMemoryDelete(status, shmId) deletes a shared memory segment */
296 : /* (called as an on_shmem_exit callback, hence funny argument list) */
297 : /****************************************************************************/
298 : static void
299 2280 : IpcMemoryDelete(int status, Datum shmId)
300 : {
301 2280 : if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0)
302 0 : elog(LOG, "shmctl(%d, %d, 0) failed: %m",
303 : DatumGetInt32(shmId), IPC_RMID);
304 2280 : }
305 :
306 : /*
307 : * PGSharedMemoryIsInUse
308 : *
309 : * Is a previously-existing shmem segment still existing and in use?
310 : *
311 : * The point of this exercise is to detect the case where a prior postmaster
312 : * crashed, but it left child backends that are still running. Therefore
313 : * we only care about shmem segments that are associated with the intended
314 : * DataDir. This is an important consideration since accidental matches of
315 : * shmem segment IDs are reasonably common.
316 : */
317 : bool
318 4 : PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
319 : {
320 : PGShmemHeader *memAddress;
321 : IpcMemoryState state;
322 :
323 4 : state = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &memAddress);
324 4 : if (memAddress && shmdt(memAddress) < 0)
325 0 : elog(LOG, "shmdt(%p) failed: %m", memAddress);
326 4 : switch (state)
327 : {
328 2 : case SHMSTATE_ENOENT:
329 : case SHMSTATE_FOREIGN:
330 : case SHMSTATE_UNATTACHED:
331 2 : return false;
332 2 : case SHMSTATE_ANALYSIS_FAILURE:
333 : case SHMSTATE_ATTACHED:
334 2 : return true;
335 : }
336 0 : return true;
337 : }
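
/*
 * Editor's sketch (not part of the PostgreSQL sources): a hypothetical
 * caller of PGSharedMemoryIsInUse(), showing how the key/ID line written to
 * the data directory lock file by InternalIpcMemoryCreate() ("%9lu %9lu",
 * key then ID) could be read back to decide whether an old server is still
 * running.  The guard macro PG_SYSV_SHMEM_EXAMPLE is hypothetical and never
 * defined.
 */
#ifdef PG_SYSV_SHMEM_EXAMPLE
static bool
example_old_server_still_running(const char *shmem_line)
{
    unsigned long id1,
                id2;

    /* Parse the shmem key and shmem ID back out of the lock file line. */
    if (sscanf(shmem_line, "%lu %lu", &id1, &id2) != 2)
        return false;           /* no usable shmem line */

    /* Only id2 (the shmem ID) is actually inspected, as shown above. */
    return PGSharedMemoryIsInUse(id1, id2);
}
#endif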
338 :
339 : /*
340 : * Test for a segment with id shmId; see comment at IpcMemoryState.
341 : *
342 : * If the segment exists, we'll attempt to attach to it, using attachAt
343 : * if that's not NULL (but it's best to pass NULL if possible).
344 : *
345 : * *addr is set to the segment memory address if we attached to it, else NULL.
346 : */
347 : static IpcMemoryState
348 16 : PGSharedMemoryAttach(IpcMemoryId shmId,
349 : void *attachAt,
350 : PGShmemHeader **addr)
351 : {
352 : struct shmid_ds shmStat;
353 : struct stat statbuf;
354 : PGShmemHeader *hdr;
355 :
356 16 : *addr = NULL;
357 :
358 : /*
359 : * First, try to stat the shm segment ID, to see if it exists at all.
360 : */
361 16 : if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
362 : {
363 : /*
364 : * EINVAL actually has multiple possible causes documented in the
365 : * shmctl man page, but we assume it must mean the segment no longer
366 : * exists.
367 : */
368 0 : if (errno == EINVAL)
369 0 : return SHMSTATE_ENOENT;
370 :
371 : /*
372 : * EACCES implies we have no read permission, which means it is not a
373 : * Postgres shmem segment (or at least, not one that is relevant to
374 : * our data directory).
375 : */
376 0 : if (errno == EACCES)
377 0 : return SHMSTATE_FOREIGN;
378 :
379 : /*
380 : * Some Linux kernel versions (in fact, all of them as of July 2007)
381 : * sometimes return EIDRM when EINVAL is correct. The Linux kernel
382 : * actually does not have any internal state that would justify
383 : * returning EIDRM, so we can get away with assuming that EIDRM is
384 : * equivalent to EINVAL on that platform.
385 : */
386 : #ifdef HAVE_LINUX_EIDRM_BUG
387 0 : if (errno == EIDRM)
388 0 : return SHMSTATE_ENOENT;
389 : #endif
390 :
391 : /*
392 : * Otherwise, we had better assume that the segment is in use. The
393 : * only likely case is (non-Linux, assumed spec-compliant) EIDRM,
394 : * which implies that the segment has been IPC_RMID'd but there are
395 : * still processes attached to it.
396 : */
397 0 : return SHMSTATE_ANALYSIS_FAILURE;
398 : }
399 :
400 : /*
401 : * Try to attach to the segment and see if it matches our data directory.
402 : * This avoids any risk of duplicate-shmem-key conflicts on machines that
403 : * are running several postmasters under the same userid.
404 : *
405 : * (When we're called from PGSharedMemoryCreate, this stat call is
406 : * duplicative; but since this isn't a high-traffic case it's not worth
407 : * trying to optimize.)
408 : */
409 16 : if (stat(DataDir, &statbuf) < 0)
410 0 : return SHMSTATE_ANALYSIS_FAILURE; /* can't stat; be conservative */
411 :
412 16 : hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS);
413 16 : if (hdr == (PGShmemHeader *) -1)
414 : {
415 : /*
416 : * Attachment failed. The cases we're interested in are the same as
417 : * for the shmctl() call above. In particular, note that the owning
418 : * postmaster could have terminated and removed the segment between
419 : * shmctl() and shmat().
420 : *
421 : * If attachAt isn't NULL, it's possible that EINVAL reflects a
422 : * problem with that address not a vanished segment, so it's best to
423 : * pass NULL when probing for conflicting segments.
424 : */
425 0 : if (errno == EINVAL)
426 0 : return SHMSTATE_ENOENT; /* segment disappeared */
427 0 : if (errno == EACCES)
428 0 : return SHMSTATE_FOREIGN; /* must be non-Postgres */
429 : #ifdef HAVE_LINUX_EIDRM_BUG
430 0 : if (errno == EIDRM)
431 0 : return SHMSTATE_ENOENT; /* segment disappeared */
432 : #endif
433 : /* Otherwise, be conservative. */
434 0 : return SHMSTATE_ANALYSIS_FAILURE;
435 : }
436 16 : *addr = hdr;
437 :
438 16 : if (hdr->magic != PGShmemMagic ||
439 12 : hdr->device != statbuf.st_dev ||
440 12 : hdr->inode != statbuf.st_ino)
441 : {
442 : /*
443 : * It's either not a Postgres segment, or not one for my data
444 : * directory.
445 : */
446 4 : return SHMSTATE_FOREIGN;
447 : }
448 :
449 : /*
450 : * It does match our data directory, so now test whether any processes are
451 : * still attached to it. (We are, now, but the shm_nattch result is from
452 : * before we attached to it.)
453 : */
454 12 : return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED;
455 : }
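
/*
 * Editor's sketch (not part of the PostgreSQL sources): the identity test
 * that PGSharedMemoryAttach() applies above, condensed into one predicate.
 * A segment is "ours" only if it carries the Postgres magic number and
 * records the same device/inode pair as the given data directory.  The
 * guard macro PG_SYSV_SHMEM_EXAMPLE is hypothetical and never defined.
 */
#ifdef PG_SYSV_SHMEM_EXAMPLE
static bool
example_segment_matches_datadir(PGShmemHeader *hdr, const char *datadir)
{
    struct stat statbuf;

    if (stat(datadir, &statbuf) < 0)
        return false;           /* be conservative if we cannot stat */

    return hdr->magic == PGShmemMagic &&
        hdr->device == statbuf.st_dev &&
        hdr->inode == statbuf.st_ino;
}
#endif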
456 :
457 : /*
458 : * Identify the huge page size to use, and compute the related mmap flags.
459 : *
460 : * Some Linux kernel versions have a bug causing mmap() to fail on requests
461 : * that are not a multiple of the hugepage size. Versions without that bug
462 : * instead silently round the request up to the next hugepage multiple ---
463 : * and then munmap() fails when we give it a size different from that.
464 : * So we have to round our request up to a multiple of the actual hugepage
465 : * size to avoid trouble.
466 : *
467 : * Doing the round-up ourselves also lets us make use of the extra memory,
468 : * rather than just wasting it. Currently, we just increase the available
469 : * space recorded in the shmem header, which will make the extra usable for
470 : * purposes such as additional locktable entries. Someday, for very large
471 : * hugepage sizes, we might want to think about more invasive strategies,
472 : * such as increasing shared_buffers to absorb the extra space.
473 : *
474 : * Returns the (real, assumed or config provided) page size into
475 : * *hugepagesize, and the hugepage-related mmap flags to use into
476 : * *mmap_flags if requested by the caller. If huge pages are not supported,
477 : * *hugepagesize and *mmap_flags are set to 0.
478 : */
479 : void
480 4254 : GetHugePageSize(Size *hugepagesize, int *mmap_flags)
481 : {
482 : #ifdef MAP_HUGETLB
483 :
484 4254 : Size default_hugepagesize = 0;
485 4254 : Size hugepagesize_local = 0;
486 4254 : int mmap_flags_local = 0;
487 :
488 : /*
489 : * System-dependent code to find out the default huge page size.
490 : *
491 : * On Linux, read /proc/meminfo looking for a line like "Hugepagesize:
492 : * nnnn kB". Ignore any failures, falling back to the preset default.
493 : */
494 : #ifdef __linux__
495 :
496 : {
497 4254 : FILE *fp = AllocateFile("/proc/meminfo", "r");
498 : char buf[128];
499 : unsigned int sz;
500 : char ch;
501 :
502 4254 : if (fp)
503 : {
504 212700 : while (fgets(buf, sizeof(buf), fp))
505 : {
506 212700 : if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2)
507 : {
508 4254 : if (ch == 'k')
509 : {
510 4254 : default_hugepagesize = sz * (Size) 1024;
511 4254 : break;
512 : }
513 : /* We could accept other units besides kB, if needed */
514 : }
515 : }
516 4254 : FreeFile(fp);
517 : }
518 : }
519 : #endif /* __linux__ */
520 :
521 4254 : if (huge_page_size != 0)
522 : {
523 : /* If huge page size is requested explicitly, use that. */
524 0 : hugepagesize_local = (Size) huge_page_size * 1024;
525 : }
526 4254 : else if (default_hugepagesize != 0)
527 : {
528 : /* Otherwise use the system default, if we have it. */
529 4254 : hugepagesize_local = default_hugepagesize;
530 : }
531 : else
532 : {
533 : /*
534 : * If no huge page size was requested explicitly and we also failed to
535 : * find out the system's default, assume it is 2MB. This will
536 : * work fine when the actual size is less. If it's more, we might get
537 : * mmap() or munmap() failures due to unaligned requests; but at this
538 : * writing, there are no reports of any non-Linux systems being picky
539 : * about that.
540 : */
541 0 : hugepagesize_local = 2 * 1024 * 1024;
542 : }
543 :
544 4254 : mmap_flags_local = MAP_HUGETLB;
545 :
546 : /*
547 : * On recent enough Linux, also include the explicit page size, if
548 : * necessary.
549 : */
550 : #if defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT)
551 4254 : if (hugepagesize_local != default_hugepagesize)
552 : {
553 0 : int shift = pg_ceil_log2_64(hugepagesize_local);
554 :
555 0 : mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
556 : }
557 : #endif
558 :
559 : /* assign the results found */
560 4254 : if (mmap_flags)
561 2284 : *mmap_flags = mmap_flags_local;
562 4254 : if (hugepagesize)
563 4254 : *hugepagesize = hugepagesize_local;
564 :
565 : #else
566 :
567 : if (hugepagesize)
568 : *hugepagesize = 0;
569 : if (mmap_flags)
570 : *mmap_flags = 0;
571 :
572 : #endif /* MAP_HUGETLB */
573 4254 : }
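
/*
 * Editor's sketch (not part of the PostgreSQL sources): typical use of
 * GetHugePageSize() to round a shared memory request up to a hugepage
 * boundary, mirroring what CreateAnonymousSegment() does below.  For
 * example, with 2 MB huge pages a request of 145000000 bytes rounds up to
 * 146800640 bytes (70 huge pages).  The guard macro PG_SYSV_SHMEM_EXAMPLE
 * is hypothetical and never defined.
 */
#ifdef PG_SYSV_SHMEM_EXAMPLE
static Size
example_round_up_to_hugepages(Size request, int *mmap_flags)
{
    Size        hugepagesize;

    GetHugePageSize(&hugepagesize, mmap_flags);

    /* hugepagesize is 0 when huge pages are not supported at all. */
    if (hugepagesize != 0 && request % hugepagesize != 0)
        request = add_size(request, hugepagesize - (request % hugepagesize));

    return request;
}
#endif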
574 :
575 : /*
576 : * GUC check_hook for huge_page_size
577 : */
578 : bool
579 2356 : check_huge_page_size(int *newval, void **extra, GucSource source)
580 : {
581 : #if !(defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT))
582 : /* Recent enough Linux only, for now. See GetHugePageSize(). */
583 : if (*newval != 0)
584 : {
585 : GUC_check_errdetail("\"huge_page_size\" must be 0 on this platform.");
586 : return false;
587 : }
588 : #endif
589 2356 : return true;
590 : }
591 :
592 : /*
593 : * Creates an anonymous mmap()ed shared memory segment.
594 : *
595 : * Pass the requested size in *size. This function will modify *size to the
596 : * actual size of the allocation, if it ends up allocating a segment that is
597 : * larger than requested.
598 : */
599 : static void *
600 2284 : CreateAnonymousSegment(Size *size)
601 : {
602 2284 : Size allocsize = *size;
603 2284 : void *ptr = MAP_FAILED;
604 2284 : int mmap_errno = 0;
605 2284 : int mmap_flags = MAP_SHARED | MAP_ANONYMOUS | MAP_HASSEMAPHORE;
606 :
607 : #ifndef MAP_HUGETLB
608 : /* PGSharedMemoryCreate should have dealt with this case */
609 : Assert(huge_pages != HUGE_PAGES_ON);
610 : #else
611 2284 : if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
612 : {
613 : /*
614 : * Round up the request size to a suitable large value.
615 : */
616 : Size hugepagesize;
617 : int huge_mmap_flags;
618 :
619 2284 : GetHugePageSize(&hugepagesize, &huge_mmap_flags);
620 :
621 2284 : if (allocsize % hugepagesize != 0)
622 2284 : allocsize = add_size(allocsize, hugepagesize - (allocsize % hugepagesize));
623 :
624 2284 : ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
625 : mmap_flags | huge_mmap_flags, -1, 0);
626 2284 : mmap_errno = errno;
627 2284 : if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
628 2284 : elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
629 : allocsize);
630 : }
631 : #endif
632 :
633 : /*
634 : * Report whether huge pages are in use. This needs to be tracked before
635 : * the second mmap() call if attempting to use huge pages failed
636 : * previously.
637 : */
638 2284 : SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? "off" : "on",
639 : PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
640 :
641 2284 : if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
642 : {
643 : /*
644 : * Use the original size, not the rounded-up value, when falling back
645 : * to non-huge pages.
646 : */
647 2284 : allocsize = *size;
648 2284 : ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
649 : mmap_flags, -1, 0);
650 2284 : mmap_errno = errno;
651 : }
652 :
653 2284 : if (ptr == MAP_FAILED)
654 : {
655 0 : errno = mmap_errno;
656 0 : ereport(FATAL,
657 : (errmsg("could not map anonymous shared memory: %m"),
658 : (mmap_errno == ENOMEM) ?
659 : errhint("This error usually means that PostgreSQL's request "
660 : "for a shared memory segment exceeded available memory, "
661 : "swap space, or huge pages. To reduce the request size "
662 : "(currently %zu bytes), reduce PostgreSQL's shared "
663 : "memory usage, perhaps by reducing \"shared_buffers\" or "
664 : "\"max_connections\".",
665 : allocsize) : 0));
666 : }
667 :
668 2284 : *size = allocsize;
669 2284 : return ptr;
670 : }
671 :
672 : /*
673 : * AnonymousShmemDetach --- detach from an anonymous mmap'd block
674 : * (called as an on_shmem_exit callback, hence funny argument list)
675 : */
676 : static void
677 2284 : AnonymousShmemDetach(int status, Datum arg)
678 : {
679 : /* Release anonymous shared memory block, if any. */
680 2284 : if (AnonymousShmem != NULL)
681 : {
682 2284 : if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
683 0 : elog(LOG, "munmap(%p, %zu) failed: %m",
684 : AnonymousShmem, AnonymousShmemSize);
685 2284 : AnonymousShmem = NULL;
686 : }
687 2284 : }
688 :
689 : /*
690 : * PGSharedMemoryCreate
691 : *
692 : * Create a shared memory segment of the given size and initialize its
693 : * standard header. Also, register an on_shmem_exit callback to release
694 : * the storage.
695 : *
696 : * Dead Postgres segments pertinent to this DataDir are recycled if found, but
697 : * we do not fail upon collision with foreign shmem segments. The idea here
698 : * is to detect and re-use keys that may have been assigned by a crashed
699 : * postmaster or backend.
700 : */
701 : PGShmemHeader *
702 2284 : PGSharedMemoryCreate(Size size,
703 : PGShmemHeader **shim)
704 : {
705 : IpcMemoryKey NextShmemSegID;
706 : void *memAddress;
707 : PGShmemHeader *hdr;
708 : struct stat statbuf;
709 : Size sysvsize;
710 :
711 : /*
712 : * We use the data directory's ID info (inode and device numbers) to
713 : * positively identify shmem segments associated with this data dir, and
714 : * also as seeds for searching for a free shmem key.
715 : */
716 2284 : if (stat(DataDir, &statbuf) < 0)
717 0 : ereport(FATAL,
718 : (errcode_for_file_access(),
719 : errmsg("could not stat data directory \"%s\": %m",
720 : DataDir)));
721 :
722 : /* Complain if hugepages demanded but we can't possibly support them */
723 : #if !defined(MAP_HUGETLB)
724 : if (huge_pages == HUGE_PAGES_ON)
725 : ereport(ERROR,
726 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
727 : errmsg("huge pages not supported on this platform")));
728 : #endif
729 :
730 : /* For now, we don't support huge pages in SysV memory */
731 2284 : if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP)
732 0 : ereport(ERROR,
733 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
734 : errmsg("huge pages not supported with the current \"shared_memory_type\" setting")));
735 :
736 : /* Room for a header? */
737 : Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
738 :
739 2284 : if (shared_memory_type == SHMEM_TYPE_MMAP)
740 : {
741 2284 : AnonymousShmem = CreateAnonymousSegment(&size);
742 2284 : AnonymousShmemSize = size;
743 :
744 : /* Register on-exit routine to unmap the anonymous segment */
745 2284 : on_shmem_exit(AnonymousShmemDetach, (Datum) 0);
746 :
747 : /* Now we need only allocate a minimal-sized SysV shmem block. */
748 2284 : sysvsize = sizeof(PGShmemHeader);
749 : }
750 : else
751 : {
752 0 : sysvsize = size;
753 :
754 : /* huge pages are only available with mmap */
755 0 : SetConfigOption("huge_pages_status", "off",
756 : PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
757 : }
758 :
759 : /*
760 : * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to
761 : * ensure no more than one postmaster per data directory can enter this
762 : * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure
763 : * that, but prefer fixing it over coping here.)
764 : */
765 2284 : NextShmemSegID = statbuf.st_ino;
766 :
767 : for (;;)
768 8 : {
769 : IpcMemoryId shmid;
770 : PGShmemHeader *oldhdr;
771 : IpcMemoryState state;
772 :
773 : /* Try to create new segment */
774 2292 : memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
775 2292 : if (memAddress)
776 2280 : break; /* successful create and attach */
777 :
778 : /* Check shared memory and possibly remove and recreate */
779 :
780 : /*
781 : * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN.
782 : * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can
783 : * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN.
784 : */
785 12 : shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0);
786 12 : if (shmid < 0)
787 : {
788 0 : oldhdr = NULL;
789 0 : state = SHMSTATE_FOREIGN;
790 : }
791 : else
792 12 : state = PGSharedMemoryAttach(shmid, NULL, &oldhdr);
793 :
794 12 : switch (state)
795 : {
796 4 : case SHMSTATE_ANALYSIS_FAILURE:
797 : case SHMSTATE_ATTACHED:
798 4 : ereport(FATAL,
799 : (errcode(ERRCODE_LOCK_FILE_EXISTS),
800 : errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use",
801 : (unsigned long) NextShmemSegID,
802 : (unsigned long) shmid),
803 : errhint("Terminate any old server processes associated with data directory \"%s\".",
804 : DataDir)));
805 : break;
806 0 : case SHMSTATE_ENOENT:
807 :
808 : /*
809 : * To our surprise, some other process deleted since our last
810 : * InternalIpcMemoryCreate(). Moments earlier, we would have
811 : * seen SHMSTATE_FOREIGN. Try that same ID again.
812 : */
813 0 : elog(LOG,
814 : "shared memory block (key %lu, ID %lu) deleted during startup",
815 : (unsigned long) NextShmemSegID,
816 : (unsigned long) shmid);
817 0 : break;
818 4 : case SHMSTATE_FOREIGN:
819 4 : NextShmemSegID++;
820 4 : break;
821 4 : case SHMSTATE_UNATTACHED:
822 :
823 : /*
824 : * The segment pertains to DataDir, and every process that had
825 : * used it has died or detached. Zap it, if possible, and any
826 : * associated dynamic shared memory segments, as well. This
827 : * shouldn't fail, but if it does, assume the segment belongs
828 : * to someone else after all, and try the next candidate.
829 : * Otherwise, try again to create the segment. That may fail
830 : * if some other process creates the same shmem key before we
831 : * do, in which case we'll try the next key.
832 : */
833 4 : if (oldhdr->dsm_control != 0)
834 4 : dsm_cleanup_using_control_segment(oldhdr->dsm_control);
835 4 : if (shmctl(shmid, IPC_RMID, NULL) < 0)
836 0 : NextShmemSegID++;
837 4 : break;
838 : }
839 :
840 8 : if (oldhdr && shmdt(oldhdr) < 0)
841 0 : elog(LOG, "shmdt(%p) failed: %m", oldhdr);
842 : }
843 :
844 : /* Initialize new segment. */
845 2280 : hdr = (PGShmemHeader *) memAddress;
846 2280 : hdr->creatorPID = getpid();
847 2280 : hdr->magic = PGShmemMagic;
848 2280 : hdr->dsm_control = 0;
849 :
850 : /* Fill in the data directory ID info, too */
851 2280 : hdr->device = statbuf.st_dev;
852 2280 : hdr->inode = statbuf.st_ino;
853 :
854 : /*
855 : * Initialize space allocation status for segment.
856 : */
857 2280 : hdr->totalsize = size;
858 2280 : hdr->content_offset = MAXALIGN(sizeof(PGShmemHeader));
859 2280 : *shim = hdr;
860 :
861 : /* Save info for possible future use */
862 2280 : UsedShmemSegAddr = memAddress;
863 2280 : UsedShmemSegID = (unsigned long) NextShmemSegID;
864 :
865 : /*
866 : * If AnonymousShmem is NULL here, then we're not using anonymous shared
867 : * memory, and should return a pointer to the System V shared memory
868 : * block. Otherwise, the System V shared memory block is only a shim, and
869 : * we must return a pointer to the real block.
870 : */
871 2280 : if (AnonymousShmem == NULL)
872 0 : return hdr;
873 2280 : memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
874 2280 : return (PGShmemHeader *) AnonymousShmem;
875 : }
876 :
877 : #ifdef EXEC_BACKEND
878 :
879 : /*
880 : * PGSharedMemoryReAttach
881 : *
882 : * This is called during startup of a postmaster child process to re-attach to
883 : * an already existing shared memory segment. This is needed only in the
884 : * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory
885 : * segment attachment via fork().
886 : *
887 : * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
888 : * routine. The caller must have already restored them to the postmaster's
889 : * values.
890 : */
891 : void
892 : PGSharedMemoryReAttach(void)
893 : {
894 : IpcMemoryId shmid;
895 : PGShmemHeader *hdr;
896 : IpcMemoryState state;
897 : void *origUsedShmemSegAddr = UsedShmemSegAddr;
898 :
899 : Assert(UsedShmemSegAddr != NULL);
900 : Assert(IsUnderPostmaster);
901 :
902 : #ifdef __CYGWIN__
903 : /* cygipc (currently) appears to not detach on exec. */
904 : PGSharedMemoryDetach();
905 : UsedShmemSegAddr = origUsedShmemSegAddr;
906 : #endif
907 :
908 : elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
909 : shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0);
910 : if (shmid < 0)
911 : state = SHMSTATE_FOREIGN;
912 : else
913 : state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr);
914 : if (state != SHMSTATE_ATTACHED)
915 : elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
916 : (int) UsedShmemSegID, UsedShmemSegAddr);
917 : if (hdr != origUsedShmemSegAddr)
918 : elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
919 : hdr, origUsedShmemSegAddr);
920 : dsm_set_control_handle(hdr->dsm_control);
921 :
922 : UsedShmemSegAddr = hdr; /* probably redundant */
923 : }
924 :
925 : /*
926 : * PGSharedMemoryNoReAttach
927 : *
928 : * This is called during startup of a postmaster child process when we choose
929 : * *not* to re-attach to the existing shared memory segment. We must clean up
930 : * to leave things in the appropriate state. This is not used in the
931 : * non-EXEC_BACKEND case, either.
932 : *
933 : * The child process startup logic might or might not call PGSharedMemoryDetach
934 : * after this; make sure that it will be a no-op if called.
935 : *
936 : * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
937 : * routine. The caller must have already restored them to the postmaster's
938 : * values.
939 : */
940 : void
941 : PGSharedMemoryNoReAttach(void)
942 : {
943 : Assert(UsedShmemSegAddr != NULL);
944 : Assert(IsUnderPostmaster);
945 :
946 : #ifdef __CYGWIN__
947 : /* cygipc (currently) appears to not detach on exec. */
948 : PGSharedMemoryDetach();
949 : #endif
950 :
951 : /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */
952 : UsedShmemSegAddr = NULL;
953 : /* And the same for UsedShmemSegID. */
954 : UsedShmemSegID = 0;
955 : }
956 :
957 : #endif /* EXEC_BACKEND */
958 :
959 : /*
960 : * PGSharedMemoryDetach
961 : *
962 : * Detach from the shared memory segment, if still attached. This is not
963 : * intended to be called explicitly by the process that originally created the
964 : * segment (it will have on_shmem_exit callback(s) registered to do that).
965 : * Rather, this is for subprocesses that have inherited an attachment and want
966 : * to get rid of it.
967 : *
968 : * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
969 : * routine, also AnonymousShmem and AnonymousShmemSize.
970 : */
971 : void
972 2 : PGSharedMemoryDetach(void)
973 : {
974 2 : if (UsedShmemSegAddr != NULL)
975 : {
976 2 : if ((shmdt(UsedShmemSegAddr) < 0)
977 : #if defined(EXEC_BACKEND) && defined(__CYGWIN__)
978 : /* Work-around for cygipc exec bug */
979 : && shmdt(NULL) < 0
980 : #endif
981 : )
982 0 : elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
983 2 : UsedShmemSegAddr = NULL;
984 : }
985 :
986 2 : if (AnonymousShmem != NULL)
987 : {
988 2 : if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
989 0 : elog(LOG, "munmap(%p, %zu) failed: %m",
990 : AnonymousShmem, AnonymousShmemSize);
991 2 : AnonymousShmem = NULL;
992 : }
993 2 : }
|