LCOV - code coverage report
Current view: top level - src/backend/port - pg_shmem.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 137 191 71.7 %
Date: 2024-11-21 08:14:44 Functions: 11 11 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * sysv_shmem.c
       4             :  *    Implement shared memory using SysV facilities
       5             :  *
       6             :  * These routines used to be a fairly thin layer on top of SysV shared
       7             :  * memory functionality.  With the addition of anonymous-shmem logic,
       8             :  * they're a bit fatter now.  We still require a SysV shmem block to
       9             :  * exist, though, because mmap'd shmem provides no way to find out how
      10             :  * many processes are attached, which we need for interlocking purposes.
      11             :  *
      12             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
      13             :  * Portions Copyright (c) 1994, Regents of the University of California
      14             :  *
      15             :  * IDENTIFICATION
      16             :  *    src/backend/port/sysv_shmem.c
      17             :  *
      18             :  *-------------------------------------------------------------------------
      19             :  */
      20             : #include "postgres.h"
      21             : 
      22             : #include <signal.h>
      23             : #include <unistd.h>
      24             : #include <sys/file.h>
      25             : #include <sys/ipc.h>
      26             : #include <sys/mman.h>
      27             : #include <sys/shm.h>
      28             : #include <sys/stat.h>
      29             : 
      30             : #include "miscadmin.h"
      31             : #include "port/pg_bitutils.h"
      32             : #include "portability/mem.h"
      33             : #include "storage/dsm.h"
      34             : #include "storage/fd.h"
      35             : #include "storage/ipc.h"
      36             : #include "storage/pg_shmem.h"
      37             : #include "utils/guc.h"
      38             : #include "utils/guc_hooks.h"
      39             : #include "utils/pidfile.h"
      40             : 
      41             : 
      42             : /*
      43             :  * As of PostgreSQL 9.3, we normally allocate only a very small amount of
      44             :  * System V shared memory, and only for the purposes of providing an
      45             :  * interlock to protect the data directory.  The real shared memory block
      46             :  * is allocated using mmap().  This works around the problem that many
      47             :  * systems have very low limits on the amount of System V shared memory
      48             :  * that can be allocated.  Even a limit of a few megabytes will be enough
      49             :  * to run many copies of PostgreSQL without needing to adjust system settings.
      50             :  *
      51             :  * We assume that no one will attempt to run PostgreSQL 9.3 or later on
      52             :  * systems that are ancient enough that anonymous shared memory is not
      53             :  * supported, such as pre-2.4 versions of Linux.  If that turns out to be
      54             :  * false, we might need to add compile and/or run-time tests here and do this
      55             :  * only if the running kernel supports it.
      56             :  *
      57             :  * However, we must always disable this logic in the EXEC_BACKEND case, and
      58             :  * fall back to the old method of allocating the entire segment using System V
      59             :  * shared memory, because there's no way to attach an anonymous mmap'd segment
      60             :  * to a process after exec().  Since EXEC_BACKEND is intended only for
      61             :  * developer use, this shouldn't be a big problem.  Because of this, we do
      62             :  * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below.
      63             :  *
      64             :  * As of PostgreSQL 12, we regained the ability to use a large System V shared
      65             :  * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set
      66             :  * to sysv (though this is not the default).
      67             :  */
      68             : 
      69             : 
      70             : typedef key_t IpcMemoryKey;     /* shared memory key passed to shmget(2) */
      71             : typedef int IpcMemoryId;        /* shared memory ID returned by shmget(2); used with shmat(2) and shmctl(2) */
      72             : 
      73             : /*
      74             :  * How does a given IpcMemoryId relate to this PostgreSQL process?
      75             :  *
      76             :  * One could recycle unattached segments of different data directories if we
      77             :  * distinguished that case from other SHMSTATE_FOREIGN cases.  Doing so would
      78             :  * cause us to visit less of the key space, making us less likely to detect a
      79             :  * SHMSTATE_ATTACHED key.  It would also complicate the concurrency analysis,
      80             :  * in that postmasters of different data directories could simultaneously
      81             :  * attempt to recycle a given key.  We'll waste keys longer in some cases, but
      82             :  * avoiding the problems of the alternative justifies that loss.
      83             :  */
      84             : typedef enum                    /* classification produced by PGSharedMemoryAttach() */
      85             : {
      86             :     SHMSTATE_ANALYSIS_FAILURE,  /* unexpected failure to analyze the ID; PGSharedMemoryIsInUse() reports it as in use */
      87             :     SHMSTATE_ATTACHED,          /* pertinent to DataDir, has attached PIDs */
      88             :     SHMSTATE_ENOENT,            /* no segment of that ID */
      89             :     SHMSTATE_FOREIGN,           /* exists, but not pertinent to DataDir */
      90             :     SHMSTATE_UNATTACHED,        /* pertinent to DataDir, no attached PIDs */
      91             : } IpcMemoryState;
      92             : 
      93             : 
      94             : unsigned long UsedShmemSegID = 0;   /* NOTE(review): presumably identifies the SysV segment in use; never assigned in the code visible here -- confirm */
      95             : void       *UsedShmemSegAddr = NULL;    /* NOTE(review): presumably that segment's attach address -- confirm */
      96             : 
      97             : static Size AnonymousShmemSize; /* NOTE(review): presumably the size of the AnonymousShmem region -- confirm */
      98             : static void *AnonymousShmem = NULL; /* NOTE(review): presumably the mmap'd block described in the file comment above -- confirm */
      99             : 
     100             : static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);   /* returns NULL on key collision */
     101             : static void IpcMemoryDetach(int status, Datum shmaddr); /* registered via on_shmem_exit */
     102             : static void IpcMemoryDelete(int status, Datum shmId);   /* registered via on_shmem_exit */
     103             : static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId,
     104             :                                            void *attachAt,
     105             :                                            PGShmemHeader **addr);   /* *addr: attach address, or NULL */
     106             : 
     107             : 
     108             : /*
     109             :  *  InternalIpcMemoryCreate(memKey, size)
     110             :  *
     111             :  * Attempt to create a new shared memory segment with the specified key.
     112             :  * Will fail (return NULL) if such a segment already exists.  If successful,
     113             :  * attach the segment to the current process and return its attached address.
     114             :  * On success, callbacks are registered with on_shmem_exit to detach and
     115             :  * delete the segment, and its key and ID are stored in the lockfile.
     116             :  * (memKey and size are handed to shmget() unchanged.)
     117             :  * If we fail with a failure code other than collision-with-existing-segment,
     118             :  * print out an error and abort.  Other types of errors are not recoverable.
     119             :  */
     120             : static void *
     121        1920 : InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
     122             : {
     123             :     IpcMemoryId shmid;
     124        1920 :     void       *requestedAddress = NULL;
     125             :     void       *memAddress;
     126             : 
     127             :     /*
     128             :      * Normally we just pass requestedAddress = NULL to shmat(), allowing the
     129             :      * system to choose where the segment gets mapped.  But in an EXEC_BACKEND
     130             :      * build, it's possible for whatever is chosen in the postmaster to not
     131             :      * work for backends, due to variations in address space layout.  As a
     132             :      * rather klugy workaround, allow the user to specify the address to use
     133             :      * via setting the environment variable PG_SHMEM_ADDR.  (If this were of
     134             :      * interest for anything except debugging, we'd probably create a cleaner
     135             :      * and better-documented way to set it, such as a GUC.)
     136             :      */
     137             : #ifdef EXEC_BACKEND
     138             :     {
     139             :         char       *pg_shmem_addr = getenv("PG_SHMEM_ADDR");
     140             : 
     141             :         if (pg_shmem_addr)
     142             :             requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0);    /* base 0: decimal, octal or hex accepted; input is not validated */
     143             :         else
     144             :         {
     145             : #if defined(__darwin__) && SIZEOF_VOID_P == 8
     146             :             /*
     147             :              * Provide a default value that is believed to avoid problems with
     148             :              * ASLR on the current macOS release.
     149             :              */
     150             :             requestedAddress = (void *) 0x80000000000;
     151             : #endif
     152             :         }
     153             :     }
     154             : #endif
     155             : 
     156        1920 :     shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
     157             : 
     158        1920 :     if (shmid < 0)
     159             :     {
     160          18 :         int         shmget_errno = errno;   /* saved: the retry and cleanup calls below may overwrite errno */
     161             : 
     162             :         /*
     163             :          * Fail quietly if error indicates a collision with existing segment.
     164             :          * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
     165             :          * we could get a permission violation instead?  Also, EIDRM might
     166             :          * occur if an old seg is slated for destruction but not gone yet.
     167             :          */
     168          18 :         if (shmget_errno == EEXIST || shmget_errno == EACCES
     169             : #ifdef EIDRM
     170           0 :             || shmget_errno == EIDRM
     171             : #endif
     172             :             )
     173          18 :             return NULL;
     174             : 
     175             :         /*
     176             :          * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
     177             :          * there is an existing segment but it's smaller than "size" (this is
     178             :          * a result of poorly-thought-out ordering of error tests). To
     179             :          * distinguish between collision and invalid size in such cases, we
     180             :          * make a second try with size = 0.  These kernels do not test size
     181             :          * against SHMMIN in the preexisting-segment case, so we will not get
     182             :          * EINVAL a second time if there is such a segment.
     183             :          */
     184           0 :         if (shmget_errno == EINVAL)
     185             :         {
     186           0 :             shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);
     187             : 
     188           0 :             if (shmid < 0)
     189             :             {
     190             :                 /* As above, fail quietly if we verify a collision */
     191           0 :                 if (errno == EEXIST || errno == EACCES
     192             : #ifdef EIDRM
     193           0 :                     || errno == EIDRM
     194             : #endif
     195             :                     )
     196           0 :                     return NULL;
     197             :                 /* Otherwise, fall through to report the original error */
     198             :             }
     199             :             else
     200             :             {
     201             :                 /*
     202             :                  * On most platforms we cannot get here because SHMMIN is
     203             :                  * greater than zero.  However, if we do succeed in creating a
     204             :                  * zero-size segment, free it and then fall through to report
     205             :                  * the original error.
     206             :                  */
     207           0 :                 if (shmctl(shmid, IPC_RMID, NULL) < 0)
     208           0 :                     elog(LOG, "shmctl(%d, %d, 0) failed: %m",
     209             :                          (int) shmid, IPC_RMID);
     210             :             }
     211             :         }
     212             : 
     213             :         /*
     214             :          * Else complain and abort.
     215             :          *
     216             :          * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
     217             :          * is violated.  SHMALL violation might be reported as either ENOMEM
     218             :          * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
     219             :          * it should be.  SHMMNI violation is ENOSPC, per spec.  Just plain
     220             :          * not-enough-RAM is ENOMEM.
     221             :          */
     222           0 :         errno = shmget_errno;   /* restore so that %m below describes the original shmget() failure */
     223           0 :         ereport(FATAL,
     224             :                 (errmsg("could not create shared memory segment: %m"),
     225             :                  errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).",
     226             :                            (unsigned long) memKey, size,
     227             :                            IPC_CREAT | IPC_EXCL | IPCProtection),
     228             :                  (shmget_errno == EINVAL) ?
     229             :                  errhint("This error usually means that PostgreSQL's request for a shared memory "
     230             :                          "segment exceeded your kernel's SHMMAX parameter, or possibly that "
     231             :                          "it is less than "
     232             :                          "your kernel's SHMMIN parameter.\n"
     233             :                          "The PostgreSQL documentation contains more information about shared "
     234             :                          "memory configuration.") : 0,
     235             :                  (shmget_errno == ENOMEM) ?
     236             :                  errhint("This error usually means that PostgreSQL's request for a shared "
     237             :                          "memory segment exceeded your kernel's SHMALL parameter.  You might need "
     238             :                          "to reconfigure the kernel with larger SHMALL.\n"
     239             :                          "The PostgreSQL documentation contains more information about shared "
     240             :                          "memory configuration.") : 0,
     241             :                  (shmget_errno == ENOSPC) ?
     242             :                  errhint("This error does *not* mean that you have run out of disk space.  "
     243             :                          "It occurs either if all available shared memory IDs have been taken, "
     244             :                          "in which case you need to raise the SHMMNI parameter in your kernel, "
     245             :                          "or because the system's overall limit for shared memory has been "
     246             :                          "reached.\n"
     247             :                          "The PostgreSQL documentation contains more information about shared "
     248             :                          "memory configuration.") : 0));
     249             :     }
     250             : 
     251             :     /* Register on-exit routine to delete the new segment (done before shmat() so a failed attach still cleans up) */
     252        1902 :     on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
     253             : 
     254             :     /* OK, should be able to attach to the segment */
     255        1902 :     memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS);
     256             : 
     257        1902 :     if (memAddress == (void *) -1)  /* shmat() signals failure with (void *) -1, not NULL */
     258           0 :         elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m",
     259             :              shmid, requestedAddress, PG_SHMAT_FLAGS);
     260             : 
     261             :     /* Register on-exit routine to detach new segment before deleting (NOTE(review): relies on callbacks running in reverse registration order -- confirm in ipc.c) */
     262        1902 :     on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
     263             : 
     264             :     /*
     265             :      * Store shmem key and ID in data directory lockfile.  Format to try to
     266             :      * keep it the same length always (trailing junk in the lockfile won't
     267             :      * hurt, but might confuse humans).
     268             :      */
     269             :     {
     270             :         char        line[64];   /* roomy enough for two unsigned long values plus the separator */
     271             : 
     272        1902 :         sprintf(line, "%9lu %9lu",
     273             :                 (unsigned long) memKey, (unsigned long) shmid);
     274        1902 :         AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
     275             :     }
     276             : 
     277        1902 :     return memAddress;
     278             : }
     279             : 
     280             : /****************************************************************************/
     281             : /*  IpcMemoryDetach(status, shmaddr)    detaches the shared memory segment  */
     282             : /*                                      mapped at shmaddr, using shmdt()    */
     283             : /*  (on_shmem_exit callback: status is unused; a failure is logged at LOG)  */
     284             : /****************************************************************************/
     285             : static void
     286        1896 : IpcMemoryDetach(int status, Datum shmaddr)
     287             : {
     288             :     /* Detach System V shared memory block; shmaddr carries the attach address as a pointer Datum. */
     289        1896 :     if (shmdt((void *) DatumGetPointer(shmaddr)) < 0)
     290           0 :         elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr));
     291        1896 : }
     292             : 
     293             : /****************************************************************************/
     294             : /*  IpcMemoryDelete(status, shmId)      removes a shared memory segment     */
     295             : /*  via shmctl(IPC_RMID); on_shmem_exit callback, so status is unused       */
     296             : /****************************************************************************/
     297             : static void
     298        1896 : IpcMemoryDelete(int status, Datum shmId)
     299             : {
     300        1896 :     if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0)   /* shmId carries the segment ID as an int32 Datum */
     301           0 :         elog(LOG, "shmctl(%d, %d, 0) failed: %m",
     302             :              DatumGetInt32(shmId), IPC_RMID);
     303        1896 : }
     304             : 
     305             : /*
     306             :  * PGSharedMemoryIsInUse
     307             :  *
     308             :  * Is a previously-existing shmem segment still existing and in use?
     309             :  * (id2 is the segment ID that gets probed; id1 is not used in this function.)
     310             :  * The point of this exercise is to detect the case where a prior postmaster
     311             :  * crashed, but it left child backends that are still running.  Therefore
     312             :  * we only care about shmem segments that are associated with the intended
     313             :  * DataDir.  This is an important consideration since accidental matches of
     314             :  * shmem segment IDs are reasonably common.
     315             :  */
     316             : bool
     317           4 : PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
     318             : {
     319             :     PGShmemHeader *memAddress;
     320             :     IpcMemoryState state;
     321             : 
     322           4 :     state = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &memAddress);
     323           4 :     if (memAddress && shmdt((void *) memAddress) < 0)   /* the probe left us attached; detach again, only logging a failure */
     324           0 :         elog(LOG, "shmdt(%p) failed: %m", memAddress);
     325           4 :     switch (state)
     326             :     {
     327           4 :         case SHMSTATE_ENOENT:
     328             :         case SHMSTATE_FOREIGN:
     329             :         case SHMSTATE_UNATTACHED:
     330           4 :             return false;   /* gone, not pertinent to DataDir, or no attached PIDs */
     331           0 :         case SHMSTATE_ANALYSIS_FAILURE:
     332             :         case SHMSTATE_ATTACHED:
     333           0 :             return true;    /* an analysis failure is conservatively reported as in use */
     334             :     }
     335           0 :     return true;    /* every enum value is handled above; keep the conservative answer otherwise */
     336             : }
     337             : 
     338             : /*
     339             :  * Test for a segment with id shmId and classify it; see comment at IpcMemoryState.
     340             :  *
     341             :  * If the segment exists, we'll attempt to attach to it, using attachAt
     342             :  * if that's not NULL (but it's best to pass NULL if possible).
     343             :  *
     344             :  * *addr is set to the segment memory address if we attached to it, else NULL.
     345             :  */
     346             : static IpcMemoryState
     347          22 : PGSharedMemoryAttach(IpcMemoryId shmId,
     348             :                      void *attachAt,
     349             :                      PGShmemHeader **addr)
     350             : {
     351             :     struct shmid_ds shmStat;
     352             :     struct stat statbuf;
     353             :     PGShmemHeader *hdr;
     354             : 
     355          22 :     *addr = NULL;   /* stays NULL on every return taken before shmat() succeeds */
     356             : 
     357             :     /*
     358             :      * First, try to stat the shm segment ID, to see if it exists at all.
     359             :      */
     360          22 :     if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
     361             :     {
     362             :         /*
     363             :          * EINVAL actually has multiple possible causes documented in the
     364             :          * shmctl man page, but we assume it must mean the segment no longer
     365             :          * exists.
     366             :          */
     367           0 :         if (errno == EINVAL)
     368           0 :             return SHMSTATE_ENOENT;
     369             : 
     370             :         /*
     371             :          * EACCES implies we have no read permission, which means it is not a
     372             :          * Postgres shmem segment (or at least, not one that is relevant to
     373             :          * our data directory).
     374             :          */
     375           0 :         if (errno == EACCES)
     376           0 :             return SHMSTATE_FOREIGN;
     377             : 
     378             :         /*
     379             :          * Some Linux kernel versions (in fact, all of them as of July 2007)
     380             :          * sometimes return EIDRM when EINVAL is correct.  The Linux kernel
     381             :          * actually does not have any internal state that would justify
     382             :          * returning EIDRM, so we can get away with assuming that EIDRM is
     383             :          * equivalent to EINVAL on that platform.
     384             :          */
     385             : #ifdef HAVE_LINUX_EIDRM_BUG
     386           0 :         if (errno == EIDRM)
     387           0 :             return SHMSTATE_ENOENT;
     388             : #endif
     389             : 
     390             :         /*
     391             :          * Otherwise, we had better assume that the segment is in use.  The
     392             :          * only likely case is (non-Linux, assumed spec-compliant) EIDRM,
     393             :          * which implies that the segment has been IPC_RMID'd but there are
     394             :          * still processes attached to it.
     395             :          */
     396           0 :         return SHMSTATE_ANALYSIS_FAILURE;
     397             :     }
     398             : 
     399             :     /*
     400             :      * Try to attach to the segment and see if it matches our data directory.
     401             :      * This avoids any risk of duplicate-shmem-key conflicts on machines that
     402             :      * are running several postmasters under the same userid.
     403             :      *
     404             :      * (When we're called from PGSharedMemoryCreate, this stat call is
     405             :      * duplicative; but since this isn't a high-traffic case it's not worth
     406             :      * trying to optimize.)
     407             :      */
     408          22 :     if (stat(DataDir, &statbuf) < 0)
     409           0 :         return SHMSTATE_ANALYSIS_FAILURE;   /* can't stat; be conservative */
     410             : 
     411          22 :     hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS);
     412          22 :     if (hdr == (PGShmemHeader *) -1)    /* shmat() signals failure with -1 cast to a pointer */
     413             :     {
     414             :         /*
     415             :          * Attachment failed.  The cases we're interested in are the same as
     416             :          * for the shmctl() call above.  In particular, note that the owning
     417             :          * postmaster could have terminated and removed the segment between
     418             :          * shmctl() and shmat().
     419             :          *
     420             :          * If attachAt isn't NULL, it's possible that EINVAL reflects a
     421             :          * problem with that address not a vanished segment, so it's best to
     422             :          * pass NULL when probing for conflicting segments.
     423             :          */
     424           0 :         if (errno == EINVAL)
     425           0 :             return SHMSTATE_ENOENT; /* segment disappeared */
     426           0 :         if (errno == EACCES)
     427           0 :             return SHMSTATE_FOREIGN;    /* must be non-Postgres */
     428             : #ifdef HAVE_LINUX_EIDRM_BUG
     429           0 :         if (errno == EIDRM)
     430           0 :             return SHMSTATE_ENOENT; /* segment disappeared */
     431             : #endif
     432             :         /* Otherwise, be conservative. */
     433           0 :         return SHMSTATE_ANALYSIS_FAILURE;
     434             :     }
     435          22 :     *addr = hdr;    /* set before the checks below: we never detach here, so the caller must shmdt() even for SHMSTATE_FOREIGN */
     436             : 
     437          22 :     if (hdr->magic != PGShmemMagic ||
     438          14 :         hdr->device != statbuf.st_dev ||
     439          14 :         hdr->inode != statbuf.st_ino)
     440             :     {
     441             :         /*
     442             :          * It's either not a Postgres segment, or not one for my data
     443             :          * directory.
     444             :          */
     445           8 :         return SHMSTATE_FOREIGN;
     446             :     }
     447             : 
     448             :     /*
     449             :      * It does match our data directory, so now test whether any processes are
     450             :      * still attached to it.  (We are, now, but the shm_nattch result is from
     451             :      * before we attached to it.)
     452             :      */
     453          14 :     return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED;
     454             : }
     455             : 
     456             : /*
     457             :  * Identify the huge page size to use, and compute the related mmap flags.
     458             :  *
     459             :  * Some Linux kernel versions have a bug causing mmap() to fail on requests
     460             :  * that are not a multiple of the hugepage size.  Versions without that bug
     461             :  * instead silently round the request up to the next hugepage multiple ---
     462             :  * and then munmap() fails when we give it a size different from that.
     463             :  * So we have to round our request up to a multiple of the actual hugepage
     464             :  * size to avoid trouble.
     465             :  *
     466             :  * Doing the round-up ourselves also lets us make use of the extra memory,
     467             :  * rather than just wasting it.  Currently, we just increase the available
     468             :  * space recorded in the shmem header, which will make the extra usable for
     469             :  * purposes such as additional locktable entries.  Someday, for very large
     470             :  * hugepage sizes, we might want to think about more invasive strategies,
     471             :  * such as increasing shared_buffers to absorb the extra space.
     472             :  *
     473             :  * Returns the (real, assumed or config provided) page size into
     474             :  * *hugepagesize, and the hugepage-related mmap flags to use into
     475             :  * *mmap_flags if requested by the caller.  If huge pages are not supported,
     476             :  * *hugepagesize and *mmap_flags are set to 0.
     477             :  */
     478             : void
     479        3534 : GetHugePageSize(Size *hugepagesize, int *mmap_flags)
     480             : {
     481             : #ifdef MAP_HUGETLB
     482             : 
     483        3534 :     Size        default_hugepagesize = 0;
     484        3534 :     Size        hugepagesize_local = 0;
     485        3534 :     int         mmap_flags_local = 0;
     486             : 
     487             :     /*
     488             :      * System-dependent code to find out the default huge page size.
     489             :      *
     490             :      * On Linux, read /proc/meminfo looking for a line like "Hugepagesize:
     491             :      * nnnn kB".  Ignore any failures, falling back to the preset default.
     492             :      */
     493             : #ifdef __linux__
     494             : 
     495             :     {
     496        3534 :         FILE       *fp = AllocateFile("/proc/meminfo", "r");
     497             :         char        buf[128];
     498             :         unsigned int sz;
     499             :         char        ch;
     500             : 
     501        3534 :         if (fp)
     502             :         {
     503      166098 :             while (fgets(buf, sizeof(buf), fp))
     504             :             {
     505      166098 :                 if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2)
     506             :                 {
     507        3534 :                     if (ch == 'k')
     508             :                     {
     509        3534 :                         default_hugepagesize = sz * (Size) 1024;
     510        3534 :                         break;
     511             :                     }
     512             :                     /* We could accept other units besides kB, if needed */
     513             :                 }
     514             :             }
     515        3534 :             FreeFile(fp);
     516             :         }
     517             :     }
     518             : #endif                          /* __linux__ */
     519             : 
     520        3534 :     if (huge_page_size != 0)
     521             :     {
     522             :         /* If huge page size is requested explicitly, use that. */
     523           0 :         hugepagesize_local = (Size) huge_page_size * 1024;
     524             :     }
     525        3534 :     else if (default_hugepagesize != 0)
     526             :     {
     527             :         /* Otherwise use the system default, if we have it. */
     528        3534 :         hugepagesize_local = default_hugepagesize;
     529             :     }
     530             :     else
     531             :     {
     532             :         /*
     533             :          * If we fail to find out the system's default huge page size, or no
     534             :          * huge page size is requested explicitly, assume it is 2MB. This will
     535             :          * work fine when the actual size is less.  If it's more, we might get
     536             :          * mmap() or munmap() failures due to unaligned requests; but at this
     537             :          * writing, there are no reports of any non-Linux systems being picky
     538             :          * about that.
     539             :          */
     540           0 :         hugepagesize_local = 2 * 1024 * 1024;
     541             :     }
     542             : 
     543        3534 :     mmap_flags_local = MAP_HUGETLB;
     544             : 
     545             :     /*
     546             :      * On recent enough Linux, also include the explicit page size, if
     547             :      * necessary.
     548             :      */
     549             : #if defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT)
     550        3534 :     if (hugepagesize_local != default_hugepagesize)
     551             :     {
     552           0 :         int         shift = pg_ceil_log2_64(hugepagesize_local);
     553             : 
     554           0 :         mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
     555             :     }
     556             : #endif
     557             : 
     558             :     /* assign the results found */
     559        3534 :     if (mmap_flags)
     560        1906 :         *mmap_flags = mmap_flags_local;
     561        3534 :     if (hugepagesize)
     562        3534 :         *hugepagesize = hugepagesize_local;
     563             : 
     564             : #else
     565             : 
     566             :     if (hugepagesize)
     567             :         *hugepagesize = 0;
     568             :     if (mmap_flags)
     569             :         *mmap_flags = 0;
     570             : 
     571             : #endif                          /* MAP_HUGETLB */
     572        3534 : }
     573             : 
     574             : /*
     575             :  * GUC check_hook for huge_page_size
     576             :  */
     577             : bool
     578        1966 : check_huge_page_size(int *newval, void **extra, GucSource source)
     579             : {
     580             : #if !(defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT))
     581             :     /* Recent enough Linux only, for now.  See GetHugePageSize(). */
     582             :     if (*newval != 0)
     583             :     {
     584             :         GUC_check_errdetail("\"huge_page_size\" must be 0 on this platform.");
     585             :         return false;
     586             :     }
     587             : #endif
     588        1966 :     return true;
     589             : }
     590             : 
     591             : /*
     592             :  * Creates an anonymous mmap()ed shared memory segment.
     593             :  *
     594             :  * Pass the requested size in *size.  This function will modify *size to the
     595             :  * actual size of the allocation, if it ends up allocating a segment that is
     596             :  * larger than requested.
     597             :  */
     598             : static void *
     599        1906 : CreateAnonymousSegment(Size *size)
     600             : {
     601        1906 :     Size        allocsize = *size;
     602        1906 :     void       *ptr = MAP_FAILED;
     603        1906 :     int         mmap_errno = 0;
     604             : 
     605             : #ifndef MAP_HUGETLB
     606             :     /* PGSharedMemoryCreate should have dealt with this case */
     607             :     Assert(huge_pages != HUGE_PAGES_ON);
     608             : #else
     609        1906 :     if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
     610             :     {
     611             :         /*
     612             :          * Round up the request size to a suitable large value.
     613             :          */
     614             :         Size        hugepagesize;
     615             :         int         mmap_flags;
     616             : 
     617        1906 :         GetHugePageSize(&hugepagesize, &mmap_flags);
     618             : 
     619        1906 :         if (allocsize % hugepagesize != 0)
     620        1906 :             allocsize += hugepagesize - (allocsize % hugepagesize);
     621             : 
     622        1906 :         ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
     623             :                    PG_MMAP_FLAGS | mmap_flags, -1, 0);
     624        1906 :         mmap_errno = errno;
     625        1906 :         if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
     626        1906 :             elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
     627             :                  allocsize);
     628             :     }
     629             : #endif
     630             : 
     631             :     /*
     632             :      * Report whether huge pages are in use.  This needs to be tracked before
     633             :      * the second mmap() call if attempting to use huge pages failed
     634             :      * previously.
     635             :      */
     636        1906 :     SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? "off" : "on",
     637             :                     PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
     638             : 
     639        1906 :     if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
     640             :     {
     641             :         /*
     642             :          * Use the original size, not the rounded-up value, when falling back
     643             :          * to non-huge pages.
     644             :          */
     645        1906 :         allocsize = *size;
     646        1906 :         ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
     647             :                    PG_MMAP_FLAGS, -1, 0);
     648        1906 :         mmap_errno = errno;
     649             :     }
     650             : 
     651        1906 :     if (ptr == MAP_FAILED)
     652             :     {
     653           0 :         errno = mmap_errno;
     654           0 :         ereport(FATAL,
     655             :                 (errmsg("could not map anonymous shared memory: %m"),
     656             :                  (mmap_errno == ENOMEM) ?
     657             :                  errhint("This error usually means that PostgreSQL's request "
     658             :                          "for a shared memory segment exceeded available memory, "
     659             :                          "swap space, or huge pages. To reduce the request size "
     660             :                          "(currently %zu bytes), reduce PostgreSQL's shared "
     661             :                          "memory usage, perhaps by reducing \"shared_buffers\" or "
     662             :                          "\"max_connections\".",
     663             :                          allocsize) : 0));
     664             :     }
     665             : 
     666        1906 :     *size = allocsize;
     667        1906 :     return ptr;
     668             : }
     669             : 
     670             : /*
     671             :  * AnonymousShmemDetach --- detach from an anonymous mmap'd block
     672             :  * (called as an on_shmem_exit callback, hence funny argument list)
     673             :  */
     674             : static void
     675        1900 : AnonymousShmemDetach(int status, Datum arg)
     676             : {
     677             :     /* Release anonymous shared memory block, if any. */
     678        1900 :     if (AnonymousShmem != NULL)
     679             :     {
     680        1900 :         if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
     681           0 :             elog(LOG, "munmap(%p, %zu) failed: %m",
     682             :                  AnonymousShmem, AnonymousShmemSize);
     683        1900 :         AnonymousShmem = NULL;
     684             :     }
     685        1900 : }
     686             : 
     687             : /*
     688             :  * PGSharedMemoryCreate
     689             :  *
     690             :  * Create a shared memory segment of the given size and initialize its
     691             :  * standard header.  Also, register an on_shmem_exit callback to release
     692             :  * the storage.
     693             :  *
     694             :  * Dead Postgres segments pertinent to this DataDir are recycled if found, but
     695             :  * we do not fail upon collision with foreign shmem segments.  The idea here
     696             :  * is to detect and re-use keys that may have been assigned by a crashed
     697             :  * postmaster or backend.
     698             :  */
     699             : PGShmemHeader *
     700        1906 : PGSharedMemoryCreate(Size size,
     701             :                      PGShmemHeader **shim)
     702             : {
     703             :     IpcMemoryKey NextShmemSegID;
     704             :     void       *memAddress;
     705             :     PGShmemHeader *hdr;
     706             :     struct stat statbuf;
     707             :     Size        sysvsize;
     708             : 
     709             :     /*
     710             :      * We use the data directory's ID info (inode and device numbers) to
     711             :      * positively identify shmem segments associated with this data dir, and
     712             :      * also as seeds for searching for a free shmem key.
     713             :      */
     714        1906 :     if (stat(DataDir, &statbuf) < 0)
     715           0 :         ereport(FATAL,
     716             :                 (errcode_for_file_access(),
     717             :                  errmsg("could not stat data directory \"%s\": %m",
     718             :                         DataDir)));
     719             : 
     720             :     /* Complain if hugepages demanded but we can't possibly support them */
     721             : #if !defined(MAP_HUGETLB)
     722             :     if (huge_pages == HUGE_PAGES_ON)
     723             :         ereport(ERROR,
     724             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     725             :                  errmsg("huge pages not supported on this platform")));
     726             : #endif
     727             : 
     728             :     /* For now, we don't support huge pages in SysV memory */
     729        1906 :     if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP)
     730           0 :         ereport(ERROR,
     731             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     732             :                  errmsg("huge pages not supported with the current \"shared_memory_type\" setting")));
     733             : 
     734             :     /* Room for a header? */
     735             :     Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
     736             : 
     737        1906 :     if (shared_memory_type == SHMEM_TYPE_MMAP)
     738             :     {
     739        1906 :         AnonymousShmem = CreateAnonymousSegment(&size);
     740        1906 :         AnonymousShmemSize = size;
     741             : 
     742             :         /* Register on-exit routine to unmap the anonymous segment */
     743        1906 :         on_shmem_exit(AnonymousShmemDetach, (Datum) 0);
     744             : 
     745             :         /* Now we need only allocate a minimal-sized SysV shmem block. */
     746        1906 :         sysvsize = sizeof(PGShmemHeader);
     747             :     }
     748             :     else
     749             :     {
     750           0 :         sysvsize = size;
     751             : 
     752             :         /* huge pages are only available with mmap */
     753           0 :         SetConfigOption("huge_pages_status", "off",
     754             :                         PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
     755             :     }
     756             : 
     757             :     /*
     758             :      * Loop till we find a free IPC key.  Trust CreateDataDirLockFile() to
     759             :      * ensure no more than one postmaster per data directory can enter this
     760             :      * loop simultaneously.  (CreateDataDirLockFile() does not entirely ensure
     761             :      * that, but prefer fixing it over coping here.)
     762             :      */
     763        1906 :     NextShmemSegID = statbuf.st_ino;
     764             : 
     765             :     for (;;)
     766          14 :     {
     767             :         IpcMemoryId shmid;
     768             :         PGShmemHeader *oldhdr;
     769             :         IpcMemoryState state;
     770             : 
     771             :         /* Try to create new segment */
     772        1920 :         memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
     773        1920 :         if (memAddress)
     774        1902 :             break;              /* successful create and attach */
     775             : 
     776             :         /* Check shared memory and possibly remove and recreate */
     777             : 
     778             :         /*
     779             :          * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN.
     780             :          * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can
     781             :          * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN.
     782             :          */
     783          18 :         shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0);
     784          18 :         if (shmid < 0)
     785             :         {
     786           0 :             oldhdr = NULL;
     787           0 :             state = SHMSTATE_FOREIGN;
     788             :         }
     789             :         else
     790          18 :             state = PGSharedMemoryAttach(shmid, NULL, &oldhdr);
     791             : 
     792          18 :         switch (state)
     793             :         {
     794           4 :             case SHMSTATE_ANALYSIS_FAILURE:
     795             :             case SHMSTATE_ATTACHED:
     796           4 :                 ereport(FATAL,
     797             :                         (errcode(ERRCODE_LOCK_FILE_EXISTS),
     798             :                          errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use",
     799             :                                 (unsigned long) NextShmemSegID,
     800             :                                 (unsigned long) shmid),
     801             :                          errhint("Terminate any old server processes associated with data directory \"%s\".",
     802             :                                  DataDir)));
     803             :                 break;
     804           0 :             case SHMSTATE_ENOENT:
     805             : 
     806             :                 /*
     807             :                  * To our surprise, some other process deleted since our last
     808             :                  * InternalIpcMemoryCreate().  Moments earlier, we would have
     809             :                  * seen SHMSTATE_FOREIGN.  Try that same ID again.
     810             :                  */
     811           0 :                 elog(LOG,
     812             :                      "shared memory block (key %lu, ID %lu) deleted during startup",
     813             :                      (unsigned long) NextShmemSegID,
     814             :                      (unsigned long) shmid);
     815           0 :                 break;
     816           8 :             case SHMSTATE_FOREIGN:
     817           8 :                 NextShmemSegID++;
     818           8 :                 break;
     819           6 :             case SHMSTATE_UNATTACHED:
     820             : 
     821             :                 /*
     822             :                  * The segment pertains to DataDir, and every process that had
     823             :                  * used it has died or detached.  Zap it, if possible, and any
     824             :                  * associated dynamic shared memory segments, as well.  This
     825             :                  * shouldn't fail, but if it does, assume the segment belongs
     826             :                  * to someone else after all, and try the next candidate.
     827             :                  * Otherwise, try again to create the segment.  That may fail
     828             :                  * if some other process creates the same shmem key before we
     829             :                  * do, in which case we'll try the next key.
     830             :                  */
     831           6 :                 if (oldhdr->dsm_control != 0)
     832           6 :                     dsm_cleanup_using_control_segment(oldhdr->dsm_control);
     833           6 :                 if (shmctl(shmid, IPC_RMID, NULL) < 0)
     834           0 :                     NextShmemSegID++;
     835           6 :                 break;
     836             :         }
     837             : 
     838          14 :         if (oldhdr && shmdt((void *) oldhdr) < 0)
     839           0 :             elog(LOG, "shmdt(%p) failed: %m", oldhdr);
     840             :     }
     841             : 
     842             :     /* Initialize new segment. */
     843        1902 :     hdr = (PGShmemHeader *) memAddress;
     844        1902 :     hdr->creatorPID = getpid();
     845        1902 :     hdr->magic = PGShmemMagic;
     846        1902 :     hdr->dsm_control = 0;
     847             : 
     848             :     /* Fill in the data directory ID info, too */
     849        1902 :     hdr->device = statbuf.st_dev;
     850        1902 :     hdr->inode = statbuf.st_ino;
     851             : 
     852             :     /*
     853             :      * Initialize space allocation status for segment.
     854             :      */
     855        1902 :     hdr->totalsize = size;
     856        1902 :     hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
     857        1902 :     *shim = hdr;
     858             : 
     859             :     /* Save info for possible future use */
     860        1902 :     UsedShmemSegAddr = memAddress;
     861        1902 :     UsedShmemSegID = (unsigned long) NextShmemSegID;
     862             : 
     863             :     /*
     864             :      * If AnonymousShmem is NULL here, then we're not using anonymous shared
     865             :      * memory, and should return a pointer to the System V shared memory
     866             :      * block. Otherwise, the System V shared memory block is only a shim, and
     867             :      * we must return a pointer to the real block.
     868             :      */
     869        1902 :     if (AnonymousShmem == NULL)
     870           0 :         return hdr;
     871        1902 :     memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
     872        1902 :     return (PGShmemHeader *) AnonymousShmem;
     873             : }
     874             : 
     875             : #ifdef EXEC_BACKEND
     876             : 
     877             : /*
     878             :  * PGSharedMemoryReAttach
     879             :  *
     880             :  * This is called during startup of a postmaster child process to re-attach to
     881             :  * an already existing shared memory segment.  This is needed only in the
     882             :  * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory
     883             :  * segment attachment via fork().
     884             :  *
     885             :  * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
     886             :  * routine.  The caller must have already restored them to the postmaster's
     887             :  * values.
     888             :  */
     889             : void
     890             : PGSharedMemoryReAttach(void)
     891             : {
     892             :     IpcMemoryId shmid;
     893             :     PGShmemHeader *hdr;
     894             :     IpcMemoryState state;
     895             :     void       *origUsedShmemSegAddr = UsedShmemSegAddr;
     896             : 
     897             :     Assert(UsedShmemSegAddr != NULL);
     898             :     Assert(IsUnderPostmaster);
     899             : 
     900             : #ifdef __CYGWIN__
     901             :     /* cygipc (currently) appears to not detach on exec. */
     902             :     PGSharedMemoryDetach();
     903             :     UsedShmemSegAddr = origUsedShmemSegAddr;
     904             : #endif
     905             : 
     906             :     elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
     907             :     shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0);
     908             :     if (shmid < 0)
     909             :         state = SHMSTATE_FOREIGN;
     910             :     else
     911             :         state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr);
     912             :     if (state != SHMSTATE_ATTACHED)
     913             :         elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
     914             :              (int) UsedShmemSegID, UsedShmemSegAddr);
     915             :     if (hdr != origUsedShmemSegAddr)
     916             :         elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
     917             :              hdr, origUsedShmemSegAddr);
     918             :     dsm_set_control_handle(hdr->dsm_control);
     919             : 
     920             :     UsedShmemSegAddr = hdr;     /* probably redundant */
     921             : }
     922             : 
     923             : /*
     924             :  * PGSharedMemoryNoReAttach
     925             :  *
     926             :  * This is called during startup of a postmaster child process when we choose
     927             :  * *not* to re-attach to the existing shared memory segment.  We must clean up
     928             :  * to leave things in the appropriate state.  This is not used in the non
     929             :  * EXEC_BACKEND case, either.
     930             :  *
     931             :  * The child process startup logic might or might not call PGSharedMemoryDetach
     932             :  * after this; make sure that it will be a no-op if called.
     933             :  *
     934             :  * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
     935             :  * routine.  The caller must have already restored them to the postmaster's
     936             :  * values.
     937             :  */
     938             : void
     939             : PGSharedMemoryNoReAttach(void)
     940             : {
     941             :     Assert(UsedShmemSegAddr != NULL);
     942             :     Assert(IsUnderPostmaster);
     943             : 
     944             : #ifdef __CYGWIN__
     945             :     /* cygipc (currently) appears to not detach on exec. */
     946             :     PGSharedMemoryDetach();
     947             : #endif
     948             : 
     949             :     /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */
     950             :     UsedShmemSegAddr = NULL;
     951             :     /* And the same for UsedShmemSegID. */
     952             :     UsedShmemSegID = 0;
     953             : }
     954             : 
     955             : #endif                          /* EXEC_BACKEND */
     956             : 
     957             : /*
     958             :  * PGSharedMemoryDetach
     959             :  *
     960             :  * Detach from the shared memory segment, if still attached.  This is not
     961             :  * intended to be called explicitly by the process that originally created the
     962             :  * segment (it will have on_shmem_exit callback(s) registered to do that).
     963             :  * Rather, this is for subprocesses that have inherited an attachment and want
     964             :  * to get rid of it.
     965             :  *
     966             :  * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
     967             :  * routine, also AnonymousShmem and AnonymousShmemSize.
     968             :  */
     969             : void
     970           2 : PGSharedMemoryDetach(void)
     971             : {
     972           2 :     if (UsedShmemSegAddr != NULL)
     973             :     {
     974           2 :         if ((shmdt(UsedShmemSegAddr) < 0)
     975             : #if defined(EXEC_BACKEND) && defined(__CYGWIN__)
     976             :         /* Work-around for cygipc exec bug */
     977             :             && shmdt(NULL) < 0
     978             : #endif
     979             :             )
     980           0 :             elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
     981           2 :         UsedShmemSegAddr = NULL;
     982             :     }
     983             : 
     984           2 :     if (AnonymousShmem != NULL)
     985             :     {
     986           2 :         if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
     987           0 :             elog(LOG, "munmap(%p, %zu) failed: %m",
     988             :                  AnonymousShmem, AnonymousShmemSize);
     989           2 :         AnonymousShmem = NULL;
     990             :     }
     991           2 : }

Generated by: LCOV version 1.14