Line data Source code
1 : /*------------------------------------------------------------------------- 2 : * 3 : * shm_toc.c 4 : * shared memory segment table of contents 5 : * 6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group 7 : * Portions Copyright (c) 1994, Regents of the University of California 8 : * 9 : * src/backend/storage/ipc/shm_toc.c 10 : * 11 : *------------------------------------------------------------------------- 12 : */ 13 : 14 : #include "postgres.h" 15 : 16 : #include "port/atomics.h" 17 : #include "storage/shm_toc.h" 18 : #include "storage/spin.h" 19 : 20 : typedef struct shm_toc_entry 21 : { 22 : uint64 key; /* Arbitrary identifier */ 23 : Size offset; /* Offset, in bytes, from TOC start */ 24 : } shm_toc_entry; 25 : 26 : struct shm_toc 27 : { 28 : uint64 toc_magic; /* Magic number identifying this TOC */ 29 : slock_t toc_mutex; /* Spinlock for mutual exclusion */ 30 : Size toc_total_bytes; /* Bytes managed by this TOC */ 31 : Size toc_allocated_bytes; /* Bytes allocated of those managed */ 32 : uint32 toc_nentry; /* Number of entries in TOC */ 33 : shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER]; 34 : }; 35 : 36 : /* 37 : * Initialize a region of shared memory with a table of contents. 38 : */ 39 : shm_toc * 40 1060 : shm_toc_create(uint64 magic, void *address, Size nbytes) 41 : { 42 1060 : shm_toc *toc = (shm_toc *) address; 43 : 44 : Assert(nbytes > offsetof(shm_toc, toc_entry)); 45 1060 : toc->toc_magic = magic; 46 1060 : SpinLockInit(&toc->toc_mutex); 47 : 48 : /* 49 : * The alignment code in shm_toc_allocate() assumes that the starting 50 : * value is buffer-aligned. 51 : */ 52 1060 : toc->toc_total_bytes = BUFFERALIGN_DOWN(nbytes); 53 1060 : toc->toc_allocated_bytes = 0; 54 1060 : toc->toc_nentry = 0; 55 : 56 1060 : return toc; 57 : } 58 : 59 : /* 60 : * Attach to an existing table of contents. If the magic number found at 61 : * the target address doesn't match our expectations, return NULL. 62 : */ 63 : shm_toc * 64 5462 : shm_toc_attach(uint64 magic, void *address) 65 : { 66 5462 : shm_toc *toc = (shm_toc *) address; 67 : 68 5462 : if (toc->toc_magic != magic) 69 0 : return NULL; 70 : 71 : Assert(toc->toc_total_bytes >= toc->toc_allocated_bytes); 72 : Assert(toc->toc_total_bytes > offsetof(shm_toc, toc_entry)); 73 : 74 5462 : return toc; 75 : } 76 : 77 : /* 78 : * Allocate shared memory from a segment managed by a table of contents. 79 : * 80 : * This is not a full-blown allocator; there's no way to free memory. It's 81 : * just a way of dividing a single physical shared memory segment into logical 82 : * chunks that may be used for different purposes. 83 : * 84 : * We allocate backwards from the end of the segment, so that the TOC entries 85 : * can grow forward from the start of the segment. 86 : */ 87 : void * 88 21182 : shm_toc_allocate(shm_toc *toc, Size nbytes) 89 : { 90 21182 : volatile shm_toc *vtoc = toc; 91 : Size total_bytes; 92 : Size allocated_bytes; 93 : Size nentry; 94 : Size toc_bytes; 95 : 96 : /* 97 : * Make sure request is well-aligned. XXX: MAXALIGN is not enough, 98 : * because atomic ops might need a wider alignment. We don't have a 99 : * proper definition for the minimum to make atomic ops safe, but 100 : * BUFFERALIGN ought to be enough. 101 : */ 102 21182 : nbytes = BUFFERALIGN(nbytes); 103 : 104 21182 : SpinLockAcquire(&toc->toc_mutex); 105 : 106 21182 : total_bytes = vtoc->toc_total_bytes; 107 21182 : allocated_bytes = vtoc->toc_allocated_bytes; 108 21182 : nentry = vtoc->toc_nentry; 109 21182 : toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry) 110 21182 : + allocated_bytes; 111 : 112 : /* Check for memory exhaustion and overflow. */ 113 21182 : if (toc_bytes + nbytes > total_bytes || toc_bytes + nbytes < toc_bytes) 114 : { 115 0 : SpinLockRelease(&toc->toc_mutex); 116 0 : ereport(ERROR, 117 : (errcode(ERRCODE_OUT_OF_MEMORY), 118 : errmsg("out of shared memory"))); 119 : } 120 21182 : vtoc->toc_allocated_bytes += nbytes; 121 : 122 21182 : SpinLockRelease(&toc->toc_mutex); 123 : 124 21182 : return ((char *) toc) + (total_bytes - allocated_bytes - nbytes); 125 : } 126 : 127 : /* 128 : * Return the number of bytes that can still be allocated. 129 : */ 130 : Size 131 0 : shm_toc_freespace(shm_toc *toc) 132 : { 133 0 : volatile shm_toc *vtoc = toc; 134 : Size total_bytes; 135 : Size allocated_bytes; 136 : Size nentry; 137 : Size toc_bytes; 138 : 139 0 : SpinLockAcquire(&toc->toc_mutex); 140 0 : total_bytes = vtoc->toc_total_bytes; 141 0 : allocated_bytes = vtoc->toc_allocated_bytes; 142 0 : nentry = vtoc->toc_nentry; 143 0 : SpinLockRelease(&toc->toc_mutex); 144 : 145 0 : toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry); 146 : Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes); 147 0 : return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes)); 148 : } 149 : 150 : /* 151 : * Insert a TOC entry. 152 : * 153 : * The idea here is that the process setting up the shared memory segment will 154 : * register the addresses of data structures within the segment using this 155 : * function. Each data structure will be identified using a 64-bit key, which 156 : * is assumed to be a well-known or discoverable integer. Other processes 157 : * accessing the shared memory segment can pass the same key to 158 : * shm_toc_lookup() to discover the addresses of those data structures. 159 : * 160 : * Since the shared memory segment may be mapped at different addresses within 161 : * different backends, we store relative rather than absolute pointers. 162 : * 163 : * This won't scale well to a large number of keys. Hopefully, that isn't 164 : * necessary; if it proves to be, we might need to provide a more sophisticated 165 : * data structure here. But the real idea here is just to give someone mapping 166 : * a dynamic shared memory the ability to find the bare minimum number of 167 : * pointers that they need to bootstrap. If you're storing a lot of stuff in 168 : * the TOC, you're doing it wrong. 169 : */ 170 : void 171 21182 : shm_toc_insert(shm_toc *toc, uint64 key, void *address) 172 : { 173 21182 : volatile shm_toc *vtoc = toc; 174 : Size total_bytes; 175 : Size allocated_bytes; 176 : Size nentry; 177 : Size toc_bytes; 178 : Size offset; 179 : 180 : /* Relativize pointer. */ 181 : Assert(address > (void *) toc); 182 21182 : offset = ((char *) address) - (char *) toc; 183 : 184 21182 : SpinLockAcquire(&toc->toc_mutex); 185 : 186 21182 : total_bytes = vtoc->toc_total_bytes; 187 21182 : allocated_bytes = vtoc->toc_allocated_bytes; 188 21182 : nentry = vtoc->toc_nentry; 189 21182 : toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry) 190 21182 : + allocated_bytes; 191 : 192 : /* Check for memory exhaustion and overflow. */ 193 21182 : if (toc_bytes + sizeof(shm_toc_entry) > total_bytes || 194 21182 : toc_bytes + sizeof(shm_toc_entry) < toc_bytes || 195 : nentry >= PG_UINT32_MAX) 196 : { 197 0 : SpinLockRelease(&toc->toc_mutex); 198 0 : ereport(ERROR, 199 : (errcode(ERRCODE_OUT_OF_MEMORY), 200 : errmsg("out of shared memory"))); 201 : } 202 : 203 : Assert(offset < total_bytes); 204 21182 : vtoc->toc_entry[nentry].key = key; 205 21182 : vtoc->toc_entry[nentry].offset = offset; 206 : 207 : /* 208 : * By placing a write barrier after filling in the entry and before 209 : * updating the number of entries, we make it safe to read the TOC 210 : * unlocked. 211 : */ 212 21182 : pg_write_barrier(); 213 : 214 21182 : vtoc->toc_nentry++; 215 : 216 21182 : SpinLockRelease(&toc->toc_mutex); 217 21182 : } 218 : 219 : /* 220 : * Look up a TOC entry. 221 : * 222 : * If the key is not found, returns NULL if noError is true, otherwise 223 : * throws elog(ERROR). 224 : * 225 : * Unlike the other functions in this file, this operation acquires no lock; 226 : * it uses only barriers. It probably wouldn't hurt concurrency very much even 227 : * if it did get a lock, but since it's reasonably likely that a group of 228 : * worker processes could each read a series of entries from the same TOC 229 : * right around the same time, there seems to be some value in avoiding it. 230 : */ 231 : void * 232 81094 : shm_toc_lookup(shm_toc *toc, uint64 key, bool noError) 233 : { 234 : uint32 nentry; 235 : uint32 i; 236 : 237 : /* 238 : * Read the number of entries before we examine any entry. We assume that 239 : * reading a uint32 is atomic. 240 : */ 241 81094 : nentry = toc->toc_nentry; 242 81094 : pg_read_barrier(); 243 : 244 : /* Now search for a matching entry. */ 245 1041002 : for (i = 0; i < nentry; ++i) 246 : { 247 1032410 : if (toc->toc_entry[i].key == key) 248 72502 : return ((char *) toc) + toc->toc_entry[i].offset; 249 : } 250 : 251 : /* No matching entry was found. */ 252 8592 : if (!noError) 253 0 : elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p", 254 : key, toc); 255 8592 : return NULL; 256 : } 257 : 258 : /* 259 : * Estimate how much shared memory will be required to store a TOC and its 260 : * dependent data structures. 261 : */ 262 : Size 263 1090 : shm_toc_estimate(shm_toc_estimator *e) 264 : { 265 : Size sz; 266 : 267 1090 : sz = offsetof(shm_toc, toc_entry); 268 1090 : sz = add_size(sz, mul_size(e->number_of_keys, sizeof(shm_toc_entry))); 269 1090 : sz = add_size(sz, e->space_for_chunks); 270 : 271 1090 : return BUFFERALIGN(sz); 272 : }