Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * sysv_shmem.c
4 : * Implement shared memory using SysV facilities
5 : *
6 : * These routines used to be a fairly thin layer on top of SysV shared
7 : * memory functionality. With the addition of anonymous-shmem logic,
8 : * they're a bit fatter now. We still require a SysV shmem block to
9 : * exist, though, because mmap'd shmem provides no way to find out how
10 : * many processes are attached, which we need for interlocking purposes.
11 : *
12 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
13 : * Portions Copyright (c) 1994, Regents of the University of California
14 : *
15 : * IDENTIFICATION
16 : * src/backend/port/sysv_shmem.c
17 : *
18 : *-------------------------------------------------------------------------
19 : */
20 : #include "postgres.h"
21 :
22 : #include <signal.h>
23 : #include <unistd.h>
24 : #include <sys/file.h>
25 : #include <sys/ipc.h>
26 : #include <sys/mman.h>
27 : #include <sys/shm.h>
28 : #include <sys/stat.h>
29 :
30 : #include "miscadmin.h"
31 : #include "port/pg_bitutils.h"
32 : #include "portability/mem.h"
33 : #include "storage/dsm.h"
34 : #include "storage/fd.h"
35 : #include "storage/ipc.h"
36 : #include "storage/pg_shmem.h"
37 : #include "utils/guc_hooks.h"
38 : #include "utils/pidfile.h"
39 :
40 :
41 : /*
42 : * As of PostgreSQL 9.3, we normally allocate only a very small amount of
43 : * System V shared memory, and only for the purposes of providing an
44 : * interlock to protect the data directory. The real shared memory block
45 : * is allocated using mmap(). This works around the problem that many
46 : * systems have very low limits on the amount of System V shared memory
47 : * that can be allocated. Even a limit of a few megabytes will be enough
48 : * to run many copies of PostgreSQL without needing to adjust system settings.
49 : *
50 : * We assume that no one will attempt to run PostgreSQL 9.3 or later on
51 : * systems that are ancient enough that anonymous shared memory is not
52 : * supported, such as pre-2.4 versions of Linux. If that turns out to be
53 : * false, we might need to add compile and/or run-time tests here and do this
54 : * only if the running kernel supports it.
55 : *
56 : * However, we must always disable this logic in the EXEC_BACKEND case, and
57 : * fall back to the old method of allocating the entire segment using System V
58 : * shared memory, because there's no way to attach an anonymous mmap'd segment
59 : * to a process after exec(). Since EXEC_BACKEND is intended only for
60 : * developer use, this shouldn't be a big problem. Because of this, we do
61 : * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below.
62 : *
63 : * As of PostgreSQL 12, we regained the ability to use a large System V shared
64 : * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set
65 : * to sysv (though this is not the default).
66 : */
67 :
68 :
/* Key value handed to shmget(2) to name a System V shared memory segment */
typedef key_t IpcMemoryKey;

/* Segment identifier that shmget(2) hands back for such a key */
typedef int IpcMemoryId;
71 :
72 : /*
73 : * How does a given IpcMemoryId relate to this PostgreSQL process?
74 : *
75 : * One could recycle unattached segments of different data directories if we
76 : * distinguished that case from other SHMSTATE_FOREIGN cases. Doing so would
77 : * cause us to visit less of the key space, making us less likely to detect a
78 : * SHMSTATE_ATTACHED key. It would also complicate the concurrency analysis,
79 : * in that postmasters of different data directories could simultaneously
80 : * attempt to recycle a given key. We'll waste keys longer in some cases, but
81 : * avoiding the problems of the alternative justifies that loss.
82 : */
typedef enum
{
	/* the ID could not be analyzed because of an unexpected failure */
	SHMSTATE_ANALYSIS_FAILURE,
	/* segment belongs to DataDir and still has attached PIDs */
	SHMSTATE_ATTACHED,
	/* there is no segment with that ID */
	SHMSTATE_ENOENT,
	/* segment exists but does not pertain to DataDir */
	SHMSTATE_FOREIGN,
	/* segment belongs to DataDir and has no attached PIDs */
	SHMSTATE_UNATTACHED,
} IpcMemoryState;
91 :
92 :
93 : unsigned long UsedShmemSegID = 0;
94 : void *UsedShmemSegAddr = NULL;
95 :
96 : static Size AnonymousShmemSize;
97 : static void *AnonymousShmem = NULL;
98 :
99 : static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
100 : static void IpcMemoryDetach(int status, Datum shmaddr);
101 : static void IpcMemoryDelete(int status, Datum shmId);
102 : static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId,
103 : void *attachAt,
104 : PGShmemHeader **addr);
105 :
106 :
107 : /*
108 : * InternalIpcMemoryCreate(memKey, size)
109 : *
110 : * Attempt to create a new shared memory segment with the specified key.
111 : * Will fail (return NULL) if such a segment already exists. If successful,
112 : * attach the segment to the current process and return its attached address.
113 : * On success, callbacks are registered with on_shmem_exit to detach and
114 : * delete the segment when on_shmem_exit is called.
115 : *
116 : * If we fail with a failure code other than collision-with-existing-segment,
117 : * print out an error and abort. Other types of errors are not recoverable.
118 : */
119 : static void *
120 1580 : InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
121 : {
122 : IpcMemoryId shmid;
123 1580 : void *requestedAddress = NULL;
124 : void *memAddress;
125 :
126 : /*
127 : * Normally we just pass requestedAddress = NULL to shmat(), allowing the
128 : * system to choose where the segment gets mapped. But in an EXEC_BACKEND
129 : * build, it's possible for whatever is chosen in the postmaster to not
130 : * work for backends, due to variations in address space layout. As a
131 : * rather klugy workaround, allow the user to specify the address to use
132 : * via setting the environment variable PG_SHMEM_ADDR. (If this were of
133 : * interest for anything except debugging, we'd probably create a cleaner
134 : * and better-documented way to set it, such as a GUC.)
135 : */
136 : #ifdef EXEC_BACKEND
137 : {
138 : char *pg_shmem_addr = getenv("PG_SHMEM_ADDR");
139 :
140 : if (pg_shmem_addr)
141 : requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0);
142 : else
143 : {
144 : #if defined(__darwin__) && SIZEOF_VOID_P == 8
145 : /*
146 : * Provide a default value that is believed to avoid problems with
147 : * ASLR on the current macOS release.
148 : */
149 : requestedAddress = (void *) 0x80000000000;
150 : #endif
151 : }
152 : }
153 : #endif
154 :
155 1580 : shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
156 :
157 1580 : if (shmid < 0)
158 : {
159 18 : int shmget_errno = errno;
160 :
161 : /*
162 : * Fail quietly if error indicates a collision with existing segment.
163 : * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
164 : * we could get a permission violation instead? Also, EIDRM might
165 : * occur if an old seg is slated for destruction but not gone yet.
166 : */
167 18 : if (shmget_errno == EEXIST || shmget_errno == EACCES
168 : #ifdef EIDRM
169 0 : || shmget_errno == EIDRM
170 : #endif
171 : )
172 18 : return NULL;
173 :
174 : /*
175 : * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
176 : * there is an existing segment but it's smaller than "size" (this is
177 : * a result of poorly-thought-out ordering of error tests). To
178 : * distinguish between collision and invalid size in such cases, we
179 : * make a second try with size = 0. These kernels do not test size
180 : * against SHMMIN in the preexisting-segment case, so we will not get
181 : * EINVAL a second time if there is such a segment.
182 : */
183 0 : if (shmget_errno == EINVAL)
184 : {
185 0 : shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);
186 :
187 0 : if (shmid < 0)
188 : {
189 : /* As above, fail quietly if we verify a collision */
190 0 : if (errno == EEXIST || errno == EACCES
191 : #ifdef EIDRM
192 0 : || errno == EIDRM
193 : #endif
194 : )
195 0 : return NULL;
196 : /* Otherwise, fall through to report the original error */
197 : }
198 : else
199 : {
200 : /*
201 : * On most platforms we cannot get here because SHMMIN is
202 : * greater than zero. However, if we do succeed in creating a
203 : * zero-size segment, free it and then fall through to report
204 : * the original error.
205 : */
206 0 : if (shmctl(shmid, IPC_RMID, NULL) < 0)
207 0 : elog(LOG, "shmctl(%d, %d, 0) failed: %m",
208 : (int) shmid, IPC_RMID);
209 : }
210 : }
211 :
212 : /*
213 : * Else complain and abort.
214 : *
215 : * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
216 : * is violated. SHMALL violation might be reported as either ENOMEM
217 : * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
218 : * it should be. SHMMNI violation is ENOSPC, per spec. Just plain
219 : * not-enough-RAM is ENOMEM.
220 : */
221 0 : errno = shmget_errno;
222 0 : ereport(FATAL,
223 : (errmsg("could not create shared memory segment: %m"),
224 : errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).",
225 : (unsigned long) memKey, size,
226 : IPC_CREAT | IPC_EXCL | IPCProtection),
227 : (shmget_errno == EINVAL) ?
228 : errhint("This error usually means that PostgreSQL's request for a shared memory "
229 : "segment exceeded your kernel's SHMMAX parameter, or possibly that "
230 : "it is less than "
231 : "your kernel's SHMMIN parameter.\n"
232 : "The PostgreSQL documentation contains more information about shared "
233 : "memory configuration.") : 0,
234 : (shmget_errno == ENOMEM) ?
235 : errhint("This error usually means that PostgreSQL's request for a shared "
236 : "memory segment exceeded your kernel's SHMALL parameter. You might need "
237 : "to reconfigure the kernel with larger SHMALL.\n"
238 : "The PostgreSQL documentation contains more information about shared "
239 : "memory configuration.") : 0,
240 : (shmget_errno == ENOSPC) ?
241 : errhint("This error does *not* mean that you have run out of disk space. "
242 : "It occurs either if all available shared memory IDs have been taken, "
243 : "in which case you need to raise the SHMMNI parameter in your kernel, "
244 : "or because the system's overall limit for shared memory has been "
245 : "reached.\n"
246 : "The PostgreSQL documentation contains more information about shared "
247 : "memory configuration.") : 0));
248 : }
249 :
250 : /* Register on-exit routine to delete the new segment */
251 1562 : on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
252 :
253 : /* OK, should be able to attach to the segment */
254 1562 : memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS);
255 :
256 1562 : if (memAddress == (void *) -1)
257 0 : elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m",
258 : shmid, requestedAddress, PG_SHMAT_FLAGS);
259 :
260 : /* Register on-exit routine to detach new segment before deleting */
261 1562 : on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
262 :
263 : /*
264 : * Store shmem key and ID in data directory lockfile. Format to try to
265 : * keep it the same length always (trailing junk in the lockfile won't
266 : * hurt, but might confuse humans).
267 : */
268 : {
269 : char line[64];
270 :
271 1562 : sprintf(line, "%9lu %9lu",
272 : (unsigned long) memKey, (unsigned long) shmid);
273 1562 : AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
274 : }
275 :
276 1562 : return memAddress;
277 : }
278 :
279 : /****************************************************************************/
280 : /* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
281 : /* from process' address space */
282 : /* (called as an on_shmem_exit callback, hence funny argument list) */
283 : /****************************************************************************/
284 : static void
285 1556 : IpcMemoryDetach(int status, Datum shmaddr)
286 : {
287 : /* Detach System V shared memory block. */
288 1556 : if (shmdt((void *) DatumGetPointer(shmaddr)) < 0)
289 0 : elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr));
290 1556 : }
291 :
292 : /****************************************************************************/
293 : /* IpcMemoryDelete(status, shmId) deletes a shared memory segment */
294 : /* (called as an on_shmem_exit callback, hence funny argument list) */
295 : /****************************************************************************/
296 : static void
297 1556 : IpcMemoryDelete(int status, Datum shmId)
298 : {
299 1556 : if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0)
300 0 : elog(LOG, "shmctl(%d, %d, 0) failed: %m",
301 : DatumGetInt32(shmId), IPC_RMID);
302 1556 : }
303 :
304 : /*
305 : * PGSharedMemoryIsInUse
306 : *
307 : * Is a previously-existing shmem segment still existing and in use?
308 : *
309 : * The point of this exercise is to detect the case where a prior postmaster
310 : * crashed, but it left child backends that are still running. Therefore
311 : * we only care about shmem segments that are associated with the intended
312 : * DataDir. This is an important consideration since accidental matches of
313 : * shmem segment IDs are reasonably common.
314 : */
315 : bool
316 4 : PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
317 : {
318 : PGShmemHeader *memAddress;
319 : IpcMemoryState state;
320 :
321 4 : state = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &memAddress);
322 4 : if (memAddress && shmdt((void *) memAddress) < 0)
323 0 : elog(LOG, "shmdt(%p) failed: %m", memAddress);
324 4 : switch (state)
325 : {
326 4 : case SHMSTATE_ENOENT:
327 : case SHMSTATE_FOREIGN:
328 : case SHMSTATE_UNATTACHED:
329 4 : return false;
330 0 : case SHMSTATE_ANALYSIS_FAILURE:
331 : case SHMSTATE_ATTACHED:
332 0 : return true;
333 : }
334 0 : return true;
335 : }
336 :
337 : /*
338 : * Test for a segment with id shmId; see comment at IpcMemoryState.
339 : *
340 : * If the segment exists, we'll attempt to attach to it, using attachAt
341 : * if that's not NULL (but it's best to pass NULL if possible).
342 : *
343 : * *addr is set to the segment memory address if we attached to it, else NULL.
344 : */
345 : static IpcMemoryState
346 22 : PGSharedMemoryAttach(IpcMemoryId shmId,
347 : void *attachAt,
348 : PGShmemHeader **addr)
349 : {
350 : struct shmid_ds shmStat;
351 : struct stat statbuf;
352 : PGShmemHeader *hdr;
353 :
354 22 : *addr = NULL;
355 :
356 : /*
357 : * First, try to stat the shm segment ID, to see if it exists at all.
358 : */
359 22 : if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
360 : {
361 : /*
362 : * EINVAL actually has multiple possible causes documented in the
363 : * shmctl man page, but we assume it must mean the segment no longer
364 : * exists.
365 : */
366 0 : if (errno == EINVAL)
367 0 : return SHMSTATE_ENOENT;
368 :
369 : /*
370 : * EACCES implies we have no read permission, which means it is not a
371 : * Postgres shmem segment (or at least, not one that is relevant to
372 : * our data directory).
373 : */
374 0 : if (errno == EACCES)
375 0 : return SHMSTATE_FOREIGN;
376 :
377 : /*
378 : * Some Linux kernel versions (in fact, all of them as of July 2007)
379 : * sometimes return EIDRM when EINVAL is correct. The Linux kernel
380 : * actually does not have any internal state that would justify
381 : * returning EIDRM, so we can get away with assuming that EIDRM is
382 : * equivalent to EINVAL on that platform.
383 : */
384 : #ifdef HAVE_LINUX_EIDRM_BUG
385 0 : if (errno == EIDRM)
386 0 : return SHMSTATE_ENOENT;
387 : #endif
388 :
389 : /*
390 : * Otherwise, we had better assume that the segment is in use. The
391 : * only likely case is (non-Linux, assumed spec-compliant) EIDRM,
392 : * which implies that the segment has been IPC_RMID'd but there are
393 : * still processes attached to it.
394 : */
395 0 : return SHMSTATE_ANALYSIS_FAILURE;
396 : }
397 :
398 : /*
399 : * Try to attach to the segment and see if it matches our data directory.
400 : * This avoids any risk of duplicate-shmem-key conflicts on machines that
401 : * are running several postmasters under the same userid.
402 : *
403 : * (When we're called from PGSharedMemoryCreate, this stat call is
404 : * duplicative; but since this isn't a high-traffic case it's not worth
405 : * trying to optimize.)
406 : */
407 22 : if (stat(DataDir, &statbuf) < 0)
408 0 : return SHMSTATE_ANALYSIS_FAILURE; /* can't stat; be conservative */
409 :
410 22 : hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS);
411 22 : if (hdr == (PGShmemHeader *) -1)
412 : {
413 : /*
414 : * Attachment failed. The cases we're interested in are the same as
415 : * for the shmctl() call above. In particular, note that the owning
416 : * postmaster could have terminated and removed the segment between
417 : * shmctl() and shmat().
418 : *
419 : * If attachAt isn't NULL, it's possible that EINVAL reflects a
420 : * problem with that address not a vanished segment, so it's best to
421 : * pass NULL when probing for conflicting segments.
422 : */
423 0 : if (errno == EINVAL)
424 0 : return SHMSTATE_ENOENT; /* segment disappeared */
425 0 : if (errno == EACCES)
426 0 : return SHMSTATE_FOREIGN; /* must be non-Postgres */
427 : #ifdef HAVE_LINUX_EIDRM_BUG
428 0 : if (errno == EIDRM)
429 0 : return SHMSTATE_ENOENT; /* segment disappeared */
430 : #endif
431 : /* Otherwise, be conservative. */
432 0 : return SHMSTATE_ANALYSIS_FAILURE;
433 : }
434 22 : *addr = hdr;
435 :
436 22 : if (hdr->magic != PGShmemMagic ||
437 14 : hdr->device != statbuf.st_dev ||
438 14 : hdr->inode != statbuf.st_ino)
439 : {
440 : /*
441 : * It's either not a Postgres segment, or not one for my data
442 : * directory.
443 : */
444 8 : return SHMSTATE_FOREIGN;
445 : }
446 :
447 : /*
448 : * It does match our data directory, so now test whether any processes are
449 : * still attached to it. (We are, now, but the shm_nattch result is from
450 : * before we attached to it.)
451 : */
452 14 : return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED;
453 : }
454 :
455 : /*
456 : * Identify the huge page size to use, and compute the related mmap flags.
457 : *
458 : * Some Linux kernel versions have a bug causing mmap() to fail on requests
459 : * that are not a multiple of the hugepage size. Versions without that bug
460 : * instead silently round the request up to the next hugepage multiple ---
461 : * and then munmap() fails when we give it a size different from that.
462 : * So we have to round our request up to a multiple of the actual hugepage
463 : * size to avoid trouble.
464 : *
465 : * Doing the round-up ourselves also lets us make use of the extra memory,
466 : * rather than just wasting it. Currently, we just increase the available
467 : * space recorded in the shmem header, which will make the extra usable for
468 : * purposes such as additional locktable entries. Someday, for very large
469 : * hugepage sizes, we might want to think about more invasive strategies,
470 : * such as increasing shared_buffers to absorb the extra space.
471 : *
472 : * Returns the (real, assumed or config provided) page size into
473 : * *hugepagesize, and the hugepage-related mmap flags to use into
474 : * *mmap_flags if requested by the caller. If huge pages are not supported,
475 : * *hugepagesize and *mmap_flags are set to 0.
476 : */
477 : void
478 2934 : GetHugePageSize(Size *hugepagesize, int *mmap_flags)
479 : {
480 : #ifdef MAP_HUGETLB
481 :
482 2934 : Size default_hugepagesize = 0;
483 2934 : Size hugepagesize_local = 0;
484 2934 : int mmap_flags_local = 0;
485 :
486 : /*
487 : * System-dependent code to find out the default huge page size.
488 : *
489 : * On Linux, read /proc/meminfo looking for a line like "Hugepagesize:
490 : * nnnn kB". Ignore any failures, falling back to the preset default.
491 : */
492 : #ifdef __linux__
493 :
494 : {
495 2934 : FILE *fp = AllocateFile("/proc/meminfo", "r");
496 : char buf[128];
497 : unsigned int sz;
498 : char ch;
499 :
500 2934 : if (fp)
501 : {
502 137898 : while (fgets(buf, sizeof(buf), fp))
503 : {
504 137898 : if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2)
505 : {
506 2934 : if (ch == 'k')
507 : {
508 2934 : default_hugepagesize = sz * (Size) 1024;
509 2934 : break;
510 : }
511 : /* We could accept other units besides kB, if needed */
512 : }
513 : }
514 2934 : FreeFile(fp);
515 : }
516 : }
517 : #endif /* __linux__ */
518 :
519 2934 : if (huge_page_size != 0)
520 : {
521 : /* If huge page size is requested explicitly, use that. */
522 0 : hugepagesize_local = (Size) huge_page_size * 1024;
523 : }
524 2934 : else if (default_hugepagesize != 0)
525 : {
526 : /* Otherwise use the system default, if we have it. */
527 2934 : hugepagesize_local = default_hugepagesize;
528 : }
529 : else
530 : {
531 : /*
532 : * If we fail to find out the system's default huge page size, or no
533 : * huge page size is requested explicitly, assume it is 2MB. This will
534 : * work fine when the actual size is less. If it's more, we might get
535 : * mmap() or munmap() failures due to unaligned requests; but at this
536 : * writing, there are no reports of any non-Linux systems being picky
537 : * about that.
538 : */
539 0 : hugepagesize_local = 2 * 1024 * 1024;
540 : }
541 :
542 2934 : mmap_flags_local = MAP_HUGETLB;
543 :
544 : /*
545 : * On recent enough Linux, also include the explicit page size, if
546 : * necessary.
547 : */
548 : #if defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT)
549 2934 : if (hugepagesize_local != default_hugepagesize)
550 : {
551 0 : int shift = pg_ceil_log2_64(hugepagesize_local);
552 :
553 0 : mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
554 : }
555 : #endif
556 :
557 : /* assign the results found */
558 2934 : if (mmap_flags)
559 1566 : *mmap_flags = mmap_flags_local;
560 2934 : if (hugepagesize)
561 2934 : *hugepagesize = hugepagesize_local;
562 :
563 : #else
564 :
565 : if (hugepagesize)
566 : *hugepagesize = 0;
567 : if (mmap_flags)
568 : *mmap_flags = 0;
569 :
570 : #endif /* MAP_HUGETLB */
571 2934 : }
572 :
573 : /*
574 : * GUC check_hook for huge_page_size
575 : */
576 : bool
577 1624 : check_huge_page_size(int *newval, void **extra, GucSource source)
578 : {
579 : #if !(defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT))
580 : /* Recent enough Linux only, for now. See GetHugePageSize(). */
581 : if (*newval != 0)
582 : {
583 : GUC_check_errdetail("huge_page_size must be 0 on this platform.");
584 : return false;
585 : }
586 : #endif
587 1624 : return true;
588 : }
589 :
590 : /*
591 : * Creates an anonymous mmap()ed shared memory segment.
592 : *
593 : * Pass the requested size in *size. This function will modify *size to the
594 : * actual size of the allocation, if it ends up allocating a segment that is
595 : * larger than requested.
596 : */
597 : static void *
598 1566 : CreateAnonymousSegment(Size *size)
599 : {
600 1566 : Size allocsize = *size;
601 1566 : void *ptr = MAP_FAILED;
602 1566 : int mmap_errno = 0;
603 :
604 : #ifndef MAP_HUGETLB
605 : /* PGSharedMemoryCreate should have dealt with this case */
606 : Assert(huge_pages != HUGE_PAGES_ON);
607 : #else
608 1566 : if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
609 : {
610 : /*
611 : * Round up the request size to a suitable large value.
612 : */
613 : Size hugepagesize;
614 : int mmap_flags;
615 :
616 1566 : GetHugePageSize(&hugepagesize, &mmap_flags);
617 :
618 1566 : if (allocsize % hugepagesize != 0)
619 1566 : allocsize += hugepagesize - (allocsize % hugepagesize);
620 :
621 1566 : ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
622 : PG_MMAP_FLAGS | mmap_flags, -1, 0);
623 1566 : mmap_errno = errno;
624 1566 : if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
625 1566 : elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
626 : allocsize);
627 : }
628 : #endif
629 :
630 : /*
631 : * Report whether huge pages are in use. This needs to be tracked before
632 : * the second mmap() call if attempting to use huge pages failed
633 : * previously.
634 : */
635 1566 : SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? "off" : "on",
636 : PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
637 :
638 1566 : if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
639 : {
640 : /*
641 : * Use the original size, not the rounded-up value, when falling back
642 : * to non-huge pages.
643 : */
644 1566 : allocsize = *size;
645 1566 : ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
646 : PG_MMAP_FLAGS, -1, 0);
647 1566 : mmap_errno = errno;
648 : }
649 :
650 1566 : if (ptr == MAP_FAILED)
651 : {
652 0 : errno = mmap_errno;
653 0 : ereport(FATAL,
654 : (errmsg("could not map anonymous shared memory: %m"),
655 : (mmap_errno == ENOMEM) ?
656 : errhint("This error usually means that PostgreSQL's request "
657 : "for a shared memory segment exceeded available memory, "
658 : "swap space, or huge pages. To reduce the request size "
659 : "(currently %zu bytes), reduce PostgreSQL's shared "
660 : "memory usage, perhaps by reducing shared_buffers or "
661 : "max_connections.",
662 : allocsize) : 0));
663 : }
664 :
665 1566 : *size = allocsize;
666 1566 : return ptr;
667 : }
668 :
669 : /*
670 : * AnonymousShmemDetach --- detach from an anonymous mmap'd block
671 : * (called as an on_shmem_exit callback, hence funny argument list)
672 : */
673 : static void
674 1560 : AnonymousShmemDetach(int status, Datum arg)
675 : {
676 : /* Release anonymous shared memory block, if any. */
677 1560 : if (AnonymousShmem != NULL)
678 : {
679 1560 : if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
680 0 : elog(LOG, "munmap(%p, %zu) failed: %m",
681 : AnonymousShmem, AnonymousShmemSize);
682 1560 : AnonymousShmem = NULL;
683 : }
684 1560 : }
685 :
686 : /*
687 : * PGSharedMemoryCreate
688 : *
689 : * Create a shared memory segment of the given size and initialize its
690 : * standard header. Also, register an on_shmem_exit callback to release
691 : * the storage.
692 : *
693 : * Dead Postgres segments pertinent to this DataDir are recycled if found, but
694 : * we do not fail upon collision with foreign shmem segments. The idea here
695 : * is to detect and re-use keys that may have been assigned by a crashed
696 : * postmaster or backend.
697 : */
698 : PGShmemHeader *
699 1566 : PGSharedMemoryCreate(Size size,
700 : PGShmemHeader **shim)
701 : {
702 : IpcMemoryKey NextShmemSegID;
703 : void *memAddress;
704 : PGShmemHeader *hdr;
705 : struct stat statbuf;
706 : Size sysvsize;
707 :
708 : /*
709 : * We use the data directory's ID info (inode and device numbers) to
710 : * positively identify shmem segments associated with this data dir, and
711 : * also as seeds for searching for a free shmem key.
712 : */
713 1566 : if (stat(DataDir, &statbuf) < 0)
714 0 : ereport(FATAL,
715 : (errcode_for_file_access(),
716 : errmsg("could not stat data directory \"%s\": %m",
717 : DataDir)));
718 :
719 : /* Complain if hugepages demanded but we can't possibly support them */
720 : #if !defined(MAP_HUGETLB)
721 : if (huge_pages == HUGE_PAGES_ON)
722 : ereport(ERROR,
723 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
724 : errmsg("huge pages not supported on this platform")));
725 : #endif
726 :
727 : /* For now, we don't support huge pages in SysV memory */
728 1566 : if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP)
729 0 : ereport(ERROR,
730 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
731 : errmsg("huge pages not supported with the current shared_memory_type setting")));
732 :
733 : /* Room for a header? */
734 : Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
735 :
736 1566 : if (shared_memory_type == SHMEM_TYPE_MMAP)
737 : {
738 1566 : AnonymousShmem = CreateAnonymousSegment(&size);
739 1566 : AnonymousShmemSize = size;
740 :
741 : /* Register on-exit routine to unmap the anonymous segment */
742 1566 : on_shmem_exit(AnonymousShmemDetach, (Datum) 0);
743 :
744 : /* Now we need only allocate a minimal-sized SysV shmem block. */
745 1566 : sysvsize = sizeof(PGShmemHeader);
746 : }
747 : else
748 : {
749 0 : sysvsize = size;
750 :
751 : /* huge pages are only available with mmap */
752 0 : SetConfigOption("huge_pages_status", "off",
753 : PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
754 : }
755 :
756 : /*
757 : * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to
758 : * ensure no more than one postmaster per data directory can enter this
759 : * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure
760 : * that, but prefer fixing it over coping here.)
761 : */
762 1566 : NextShmemSegID = statbuf.st_ino;
763 :
764 : for (;;)
765 14 : {
766 : IpcMemoryId shmid;
767 : PGShmemHeader *oldhdr;
768 : IpcMemoryState state;
769 :
770 : /* Try to create new segment */
771 1580 : memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
772 1580 : if (memAddress)
773 1562 : break; /* successful create and attach */
774 :
775 : /* Check shared memory and possibly remove and recreate */
776 :
777 : /*
778 : * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN.
779 : * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can
780 : * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN.
781 : */
782 18 : shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0);
783 18 : if (shmid < 0)
784 : {
785 0 : oldhdr = NULL;
786 0 : state = SHMSTATE_FOREIGN;
787 : }
788 : else
789 18 : state = PGSharedMemoryAttach(shmid, NULL, &oldhdr);
790 :
791 18 : switch (state)
792 : {
793 4 : case SHMSTATE_ANALYSIS_FAILURE:
794 : case SHMSTATE_ATTACHED:
795 4 : ereport(FATAL,
796 : (errcode(ERRCODE_LOCK_FILE_EXISTS),
797 : errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use",
798 : (unsigned long) NextShmemSegID,
799 : (unsigned long) shmid),
800 : errhint("Terminate any old server processes associated with data directory \"%s\".",
801 : DataDir)));
802 : break;
803 0 : case SHMSTATE_ENOENT:
804 :
805 : /*
806 : * To our surprise, some other process deleted since our last
807 : * InternalIpcMemoryCreate(). Moments earlier, we would have
808 : * seen SHMSTATE_FOREIGN. Try that same ID again.
809 : */
810 0 : elog(LOG,
811 : "shared memory block (key %lu, ID %lu) deleted during startup",
812 : (unsigned long) NextShmemSegID,
813 : (unsigned long) shmid);
814 0 : break;
815 8 : case SHMSTATE_FOREIGN:
816 8 : NextShmemSegID++;
817 8 : break;
818 6 : case SHMSTATE_UNATTACHED:
819 :
820 : /*
821 : * The segment pertains to DataDir, and every process that had
822 : * used it has died or detached. Zap it, if possible, and any
823 : * associated dynamic shared memory segments, as well. This
824 : * shouldn't fail, but if it does, assume the segment belongs
825 : * to someone else after all, and try the next candidate.
826 : * Otherwise, try again to create the segment. That may fail
827 : * if some other process creates the same shmem key before we
828 : * do, in which case we'll try the next key.
829 : */
830 6 : if (oldhdr->dsm_control != 0)
831 6 : dsm_cleanup_using_control_segment(oldhdr->dsm_control);
832 6 : if (shmctl(shmid, IPC_RMID, NULL) < 0)
833 0 : NextShmemSegID++;
834 6 : break;
835 : }
836 :
837 14 : if (oldhdr && shmdt((void *) oldhdr) < 0)
838 0 : elog(LOG, "shmdt(%p) failed: %m", oldhdr);
839 : }
840 :
841 : /* Initialize new segment. */
842 1562 : hdr = (PGShmemHeader *) memAddress;
843 1562 : hdr->creatorPID = getpid();
844 1562 : hdr->magic = PGShmemMagic;
845 1562 : hdr->dsm_control = 0;
846 :
847 : /* Fill in the data directory ID info, too */
848 1562 : hdr->device = statbuf.st_dev;
849 1562 : hdr->inode = statbuf.st_ino;
850 :
851 : /*
852 : * Initialize space allocation status for segment.
853 : */
854 1562 : hdr->totalsize = size;
855 1562 : hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
856 1562 : *shim = hdr;
857 :
858 : /* Save info for possible future use */
859 1562 : UsedShmemSegAddr = memAddress;
860 1562 : UsedShmemSegID = (unsigned long) NextShmemSegID;
861 :
862 : /*
863 : * If AnonymousShmem is NULL here, then we're not using anonymous shared
864 : * memory, and should return a pointer to the System V shared memory
865 : * block. Otherwise, the System V shared memory block is only a shim, and
866 : * we must return a pointer to the real block.
867 : */
868 1562 : if (AnonymousShmem == NULL)
869 0 : return hdr;
870 1562 : memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
871 1562 : return (PGShmemHeader *) AnonymousShmem;
872 : }
873 :
874 : #ifdef EXEC_BACKEND
875 :
876 : /*
877 : * PGSharedMemoryReAttach
878 : *
879 : * This is called during startup of a postmaster child process to re-attach to
880 : * an already existing shared memory segment. This is needed only in the
881 : * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory
882 : * segment attachment via fork().
883 : *
884 : * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
885 : * routine. The caller must have already restored them to the postmaster's
886 : * values.
887 : */
888 : void
889 : PGSharedMemoryReAttach(void)
890 : {
891 : IpcMemoryId shmid;
892 : PGShmemHeader *hdr;
893 : IpcMemoryState state;
894 : void *origUsedShmemSegAddr = UsedShmemSegAddr;
895 :
896 : Assert(UsedShmemSegAddr != NULL);
897 : Assert(IsUnderPostmaster);
898 :
899 : #ifdef __CYGWIN__
900 : /* cygipc (currently) appears to not detach on exec. */
901 : PGSharedMemoryDetach();
902 : UsedShmemSegAddr = origUsedShmemSegAddr;
903 : #endif
904 :
905 : elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
906 : shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0);
907 : if (shmid < 0)
908 : state = SHMSTATE_FOREIGN;
909 : else
910 : state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr);
911 : if (state != SHMSTATE_ATTACHED)
912 : elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
913 : (int) UsedShmemSegID, UsedShmemSegAddr);
914 : if (hdr != origUsedShmemSegAddr)
915 : elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
916 : hdr, origUsedShmemSegAddr);
917 : dsm_set_control_handle(hdr->dsm_control);
918 :
919 : UsedShmemSegAddr = hdr; /* probably redundant */
920 : }
921 :
922 : /*
923 : * PGSharedMemoryNoReAttach
924 : *
925 : * This is called during startup of a postmaster child process when we choose
926 : * *not* to re-attach to the existing shared memory segment. We must clean up
927 : * to leave things in the appropriate state. This is not used in the non
928 : * EXEC_BACKEND case, either.
929 : *
930 : * The child process startup logic might or might not call PGSharedMemoryDetach
931 : * after this; make sure that it will be a no-op if called.
932 : *
933 : * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
934 : * routine. The caller must have already restored them to the postmaster's
935 : * values.
936 : */
937 : void
938 : PGSharedMemoryNoReAttach(void)
939 : {
940 : Assert(UsedShmemSegAddr != NULL);
941 : Assert(IsUnderPostmaster);
942 :
943 : #ifdef __CYGWIN__
944 : /* cygipc (currently) appears to not detach on exec. */
945 : PGSharedMemoryDetach();
946 : #endif
947 :
948 : /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */
949 : UsedShmemSegAddr = NULL;
950 : /* And the same for UsedShmemSegID. */
951 : UsedShmemSegID = 0;
952 : }
953 :
954 : #endif /* EXEC_BACKEND */
955 :
956 : /*
957 : * PGSharedMemoryDetach
958 : *
959 : * Detach from the shared memory segment, if still attached. This is not
960 : * intended to be called explicitly by the process that originally created the
961 : * segment (it will have on_shmem_exit callback(s) registered to do that).
962 : * Rather, this is for subprocesses that have inherited an attachment and want
963 : * to get rid of it.
964 : *
965 : * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
966 : * routine, also AnonymousShmem and AnonymousShmemSize.
967 : */
968 : void
969 2 : PGSharedMemoryDetach(void)
970 : {
971 2 : if (UsedShmemSegAddr != NULL)
972 : {
973 2 : if ((shmdt(UsedShmemSegAddr) < 0)
974 : #if defined(EXEC_BACKEND) && defined(__CYGWIN__)
975 : /* Work-around for cygipc exec bug */
976 : && shmdt(NULL) < 0
977 : #endif
978 : )
979 0 : elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
980 2 : UsedShmemSegAddr = NULL;
981 : }
982 :
983 2 : if (AnonymousShmem != NULL)
984 : {
985 2 : if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
986 0 : elog(LOG, "munmap(%p, %zu) failed: %m",
987 : AnonymousShmem, AnonymousShmemSize);
988 2 : AnonymousShmem = NULL;
989 : }
990 2 : }
|