Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * shm_toc.c
4 : * shared memory segment table of contents
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * src/backend/storage/ipc/shm_toc.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 :
14 : #include "postgres.h"
15 :
16 : #include "port/atomics.h"
17 : #include "storage/shm_toc.h"
18 : #include "storage/spin.h"
19 :
/*
 * A single TOC entry: maps a caller-chosen 64-bit key to the location of a
 * chunk inside the shared memory segment managed by the TOC.  Offsets are
 * stored (rather than pointers) because the segment may be mapped at
 * different addresses in different backends.
 */
typedef struct shm_toc_entry
{
	uint64		key;			/* Arbitrary identifier */
	Size		offset;			/* Offset, in bytes, from TOC start */
} shm_toc_entry;
25 :
/*
 * Header for a shared memory table of contents.  The entry array grows
 * forward from the end of this header, while chunk allocations grow backward
 * from the end of the segment (see shm_toc_allocate), so the two meet in the
 * middle when the segment is exhausted.
 */
struct shm_toc
{
	uint64		toc_magic;		/* Magic number identifying this TOC */
	slock_t		toc_mutex;		/* Spinlock for mutual exclusion */
	Size		toc_total_bytes;	/* Bytes managed by this TOC */
	Size		toc_allocated_bytes;	/* Bytes allocated of those managed */
	uint32		toc_nentry;		/* Number of entries in TOC */
	shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER];
};
35 :
36 : /*
37 : * Initialize a region of shared memory with a table of contents.
38 : */
39 : shm_toc *
40 816 : shm_toc_create(uint64 magic, void *address, Size nbytes)
41 : {
42 816 : shm_toc *toc = (shm_toc *) address;
43 :
44 : Assert(nbytes > offsetof(shm_toc, toc_entry));
45 816 : toc->toc_magic = magic;
46 816 : SpinLockInit(&toc->toc_mutex);
47 :
48 : /*
49 : * The alignment code in shm_toc_allocate() assumes that the starting
50 : * value is buffer-aligned.
51 : */
52 816 : toc->toc_total_bytes = BUFFERALIGN_DOWN(nbytes);
53 816 : toc->toc_allocated_bytes = 0;
54 816 : toc->toc_nentry = 0;
55 :
56 816 : return toc;
57 : }
58 :
59 : /*
60 : * Attach to an existing table of contents. If the magic number found at
61 : * the target address doesn't match our expectations, return NULL.
62 : */
63 : shm_toc *
64 4037 : shm_toc_attach(uint64 magic, void *address)
65 : {
66 4037 : shm_toc *toc = (shm_toc *) address;
67 :
68 4037 : if (toc->toc_magic != magic)
69 0 : return NULL;
70 :
71 : Assert(toc->toc_total_bytes >= toc->toc_allocated_bytes);
72 : Assert(toc->toc_total_bytes > offsetof(shm_toc, toc_entry));
73 :
74 4037 : return toc;
75 : }
76 :
77 : /*
78 : * Allocate shared memory from a segment managed by a table of contents.
79 : *
80 : * This is not a full-blown allocator; there's no way to free memory. It's
81 : * just a way of dividing a single physical shared memory segment into logical
82 : * chunks that may be used for different purposes.
83 : *
84 : * We allocate backwards from the end of the segment, so that the TOC entries
85 : * can grow forward from the start of the segment.
86 : */
87 : void *
88 16722 : shm_toc_allocate(shm_toc *toc, Size nbytes)
89 : {
90 16722 : volatile shm_toc *vtoc = toc;
91 : Size total_bytes;
92 : Size allocated_bytes;
93 : Size nentry;
94 : Size toc_bytes;
95 :
96 : /*
97 : * Make sure request is well-aligned. XXX: MAXALIGN is not enough,
98 : * because atomic ops might need a wider alignment. We don't have a
99 : * proper definition for the minimum to make atomic ops safe, but
100 : * BUFFERALIGN ought to be enough.
101 : */
102 16722 : nbytes = BUFFERALIGN(nbytes);
103 :
104 16722 : SpinLockAcquire(&toc->toc_mutex);
105 :
106 16722 : total_bytes = vtoc->toc_total_bytes;
107 16722 : allocated_bytes = vtoc->toc_allocated_bytes;
108 16722 : nentry = vtoc->toc_nentry;
109 16722 : toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
110 16722 : + allocated_bytes;
111 :
112 : /* Check for memory exhaustion and overflow. */
113 16722 : if (toc_bytes + nbytes > total_bytes || toc_bytes + nbytes < toc_bytes)
114 : {
115 0 : SpinLockRelease(&toc->toc_mutex);
116 0 : ereport(ERROR,
117 : (errcode(ERRCODE_OUT_OF_MEMORY),
118 : errmsg("out of shared memory")));
119 : }
120 16722 : vtoc->toc_allocated_bytes += nbytes;
121 :
122 16722 : SpinLockRelease(&toc->toc_mutex);
123 :
124 16722 : return ((char *) toc) + (total_bytes - allocated_bytes - nbytes);
125 : }
126 :
127 : /*
128 : * Return the number of bytes that can still be allocated.
129 : */
130 : Size
131 0 : shm_toc_freespace(shm_toc *toc)
132 : {
133 0 : volatile shm_toc *vtoc = toc;
134 : Size total_bytes;
135 : Size allocated_bytes;
136 : Size nentry;
137 : Size toc_bytes;
138 :
139 0 : SpinLockAcquire(&toc->toc_mutex);
140 0 : total_bytes = vtoc->toc_total_bytes;
141 0 : allocated_bytes = vtoc->toc_allocated_bytes;
142 0 : nentry = vtoc->toc_nentry;
143 0 : SpinLockRelease(&toc->toc_mutex);
144 :
145 0 : toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry);
146 : Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes);
147 0 : return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes));
148 : }
149 :
150 : /*
151 : * Insert a TOC entry.
152 : *
153 : * The idea here is that the process setting up the shared memory segment will
154 : * register the addresses of data structures within the segment using this
155 : * function. Each data structure will be identified using a 64-bit key, which
156 : * is assumed to be a well-known or discoverable integer. Other processes
157 : * accessing the shared memory segment can pass the same key to
158 : * shm_toc_lookup() to discover the addresses of those data structures.
159 : *
160 : * Since the shared memory segment may be mapped at different addresses within
161 : * different backends, we store relative rather than absolute pointers.
162 : *
163 : * This won't scale well to a large number of keys. Hopefully, that isn't
164 : * necessary; if it proves to be, we might need to provide a more sophisticated
165 : * data structure here. But the real idea here is just to give someone mapping
166 : * a dynamic shared memory the ability to find the bare minimum number of
167 : * pointers that they need to bootstrap. If you're storing a lot of stuff in
168 : * the TOC, you're doing it wrong.
169 : */
170 : void
171 16722 : shm_toc_insert(shm_toc *toc, uint64 key, void *address)
172 : {
173 16722 : volatile shm_toc *vtoc = toc;
174 : Size total_bytes;
175 : Size allocated_bytes;
176 : Size nentry;
177 : Size toc_bytes;
178 : Size offset;
179 :
180 : /* Relativize pointer. */
181 : Assert(address > (void *) toc);
182 16722 : offset = ((char *) address) - (char *) toc;
183 :
184 16722 : SpinLockAcquire(&toc->toc_mutex);
185 :
186 16722 : total_bytes = vtoc->toc_total_bytes;
187 16722 : allocated_bytes = vtoc->toc_allocated_bytes;
188 16722 : nentry = vtoc->toc_nentry;
189 :
190 : #ifdef USE_ASSERT_CHECKING
191 : /* Verify no duplicate keys */
192 : for (Size i = 0; i < nentry; i++)
193 : Assert(vtoc->toc_entry[i].key != key);
194 : #endif
195 :
196 16722 : toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
197 16722 : + allocated_bytes;
198 :
199 : /* Check for memory exhaustion and overflow. */
200 16722 : if (toc_bytes + sizeof(shm_toc_entry) > total_bytes ||
201 16722 : toc_bytes + sizeof(shm_toc_entry) < toc_bytes ||
202 : nentry >= PG_UINT32_MAX)
203 : {
204 0 : SpinLockRelease(&toc->toc_mutex);
205 0 : ereport(ERROR,
206 : (errcode(ERRCODE_OUT_OF_MEMORY),
207 : errmsg("out of shared memory")));
208 : }
209 :
210 : Assert(offset < total_bytes);
211 16722 : vtoc->toc_entry[nentry].key = key;
212 16722 : vtoc->toc_entry[nentry].offset = offset;
213 :
214 : /*
215 : * By placing a write barrier after filling in the entry and before
216 : * updating the number of entries, we make it safe to read the TOC
217 : * unlocked.
218 : */
219 16722 : pg_write_barrier();
220 :
221 16722 : vtoc->toc_nentry++;
222 :
223 16722 : SpinLockRelease(&toc->toc_mutex);
224 16722 : }
225 :
226 : /*
227 : * Look up a TOC entry.
228 : *
229 : * If the key is not found, returns NULL if noError is true, otherwise
230 : * throws elog(ERROR).
231 : *
232 : * Unlike the other functions in this file, this operation acquires no lock;
233 : * it uses only barriers. It probably wouldn't hurt concurrency very much even
234 : * if it did get a lock, but since it's reasonably likely that a group of
235 : * worker processes could each read a series of entries from the same TOC
236 : * right around the same time, there seems to be some value in avoiding it.
237 : */
238 : void *
239 60950 : shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
240 : {
241 : uint32 nentry;
242 : uint32 i;
243 :
244 : /*
245 : * Read the number of entries before we examine any entry. We assume that
246 : * reading a uint32 is atomic.
247 : */
248 60950 : nentry = toc->toc_nentry;
249 60950 : pg_read_barrier();
250 :
251 : /* Now search for a matching entry. */
252 875096 : for (i = 0; i < nentry; ++i)
253 : {
254 868515 : if (toc->toc_entry[i].key == key)
255 54369 : return ((char *) toc) + toc->toc_entry[i].offset;
256 : }
257 :
258 : /* No matching entry was found. */
259 6581 : if (!noError)
260 0 : elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p",
261 : key, toc);
262 6581 : return NULL;
263 : }
264 :
265 : /*
266 : * Estimate how much shared memory will be required to store a TOC and its
267 : * dependent data structures.
268 : */
269 : Size
270 835 : shm_toc_estimate(shm_toc_estimator *e)
271 : {
272 : Size sz;
273 :
274 835 : sz = offsetof(shm_toc, toc_entry);
275 835 : sz = add_size(sz, mul_size(e->number_of_keys, sizeof(shm_toc_entry)));
276 835 : sz = add_size(sz, e->space_for_chunks);
277 :
278 835 : return BUFFERALIGN(sz);
279 : }
|