Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * shmem.c
4 : * create shared memory and initialize shared memory data structures.
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/storage/ipc/shmem.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : /*
16 : * POSTGRES processes share one or more regions of shared memory.
17 : * The shared memory is created by a postmaster and is inherited
18 : * by each backend via fork() (or, in some ports, via other OS-specific
19 : * methods). The routines in this file are used for allocating and
20 : * binding to shared memory data structures.
21 : *
22 : * NOTES:
23 : * (a) There are three kinds of shared memory data structures
24 : * available to POSTGRES: fixed-size structures, queues and hash
25 : * tables. Fixed-size structures contain things like global variables
26 : * for a module and should never be allocated after the shared memory
27 : * initialization phase. Hash tables have a fixed maximum size, but
28 : * their actual size can vary dynamically. When entries are added
29 : * to the table, more space is allocated. Queues link data structures
30 : * that have been allocated either within fixed-size structures or as hash
31 : * buckets. Each shared data structure has a string name to identify
32 : * it (assigned in the module that declares it).
33 : *
34 : * (b) During initialization, each module looks for its
35 : * shared data structures in a hash table called the "Shmem Index".
36 : * If the data structure is not present, the caller can allocate
37 : * a new one and initialize it. If the data structure is present,
38 : * the caller "attaches" to the structure by initializing a pointer
39 : * in the local address space.
40 : * The shmem index has two purposes: first, it gives us
41 : * a simple model of how the world looks when a backend process
42 : * initializes. If something is present in the shmem index,
43 : * it is initialized. If it is not, it is uninitialized. Second,
44 : * the shmem index allows us to allocate shared memory on demand
45 : * instead of trying to preallocate structures and hard-wire the
46 : * sizes and locations in header files. If you are using a lot
47 : * of shared memory in a lot of different places (and changing
48 : * things during development), this is important.
49 : *
50 : * (c) In standard Unix-ish environments, individual backends do not
51 : * need to re-establish their local pointers into shared memory, because
52 : * they inherit correct values of those variables via fork() from the
53 : * postmaster. However, this does not work in the EXEC_BACKEND case.
54 : * In ports using EXEC_BACKEND, new backends have to set up their local
55 : * pointers using the method described in (b) above.
56 : *
57 : * (d) memory allocation model: shared memory can never be
58 : * freed, once allocated. Each hash table has its own free list,
59 : * so hash buckets can be reused when an item is deleted. However,
60 : * if one hash table grows very large and then shrinks, its space
61 : * cannot be redistributed to other tables. We could build a simple
62 : * hash bucket garbage collector if need be. Right now, it seems
63 : * unnecessary.
64 : */
65 :
66 : #include "postgres.h"
67 :
68 : #include <unistd.h>
69 :
70 : #include "common/int.h"
71 : #include "fmgr.h"
72 : #include "funcapi.h"
73 : #include "miscadmin.h"
74 : #include "port/pg_numa.h"
75 : #include "storage/lwlock.h"
76 : #include "storage/pg_shmem.h"
77 : #include "storage/shmem.h"
78 : #include "storage/spin.h"
79 : #include "utils/builtins.h"
80 : #include "utils/tuplestore.h"
81 :
82 : /*
83 : * This is the first data structure stored in the shared memory segment, at
84 : * the offset that PGShmemHeader->content_offset points to. Allocations by
85 : * ShmemAlloc() are carved out of the space after this.
86 : *
87 : * For the base pointer and the total size of the shmem segment, we rely on
88 : * the PGShmemHeader.
89 : */
90 : typedef struct ShmemAllocatorData
91 : {
92 : Size free_offset; /* offset to first free space from ShmemBase */
93 : HASHHDR *index; /* location of ShmemIndex */
94 :
95 : /* protects shared memory and LWLock allocation */
96 : slock_t shmem_lock;
97 : } ShmemAllocatorData;
98 :
99 : static void *ShmemAllocRaw(Size size, Size *allocated_size);
100 :
101 : /* shared memory global variables */
102 :
103 : static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */
104 : static void *ShmemBase; /* start address of shared memory */
105 : static void *ShmemEnd; /* end+1 address of shared memory */
106 :
107 : static ShmemAllocatorData *ShmemAllocator;
108 : slock_t *ShmemLock; /* points to ShmemAllocator->shmem_lock */
109 : static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
110 :
111 : /* To get reliable results for NUMA inquiry we need to "touch pages" once */
112 : static bool firstNumaTouch = true;
113 :
114 : Datum pg_numa_available(PG_FUNCTION_ARGS);
115 :
116 : /*
117 : * InitShmemAllocator() --- set up basic pointers to shared memory.
118 : *
119 : * Called at postmaster or stand-alone backend startup, to initialize the
120 : * allocator's data structure in the shared memory segment. In EXEC_BACKEND,
121 : * this is also called at backend startup, to set up pointers to the shared
122 : * memory areas.
123 : */
124 : void
125 1165 : InitShmemAllocator(PGShmemHeader *seghdr)
126 : {
127 : Assert(seghdr != NULL);
128 :
129 : /*
130 : * We assume the pointer and offset are MAXALIGN. Not a hard requirement,
131 : * but it's true today and keeps the math below simpler.
132 : */
133 : Assert(seghdr == (void *) MAXALIGN(seghdr));
134 : Assert(seghdr->content_offset == MAXALIGN(seghdr->content_offset));
135 :
136 1165 : ShmemSegHdr = seghdr;
137 1165 : ShmemBase = seghdr;
138 1165 : ShmemEnd = (char *) ShmemBase + seghdr->totalsize;
139 :
140 : #ifndef EXEC_BACKEND
141 : Assert(!IsUnderPostmaster);
142 : #endif
143 1165 : if (IsUnderPostmaster)
144 : {
145 0 : PGShmemHeader *shmhdr = ShmemSegHdr;
146 :
147 0 : ShmemAllocator = (ShmemAllocatorData *) ((char *) shmhdr + shmhdr->content_offset);
148 0 : ShmemLock = &ShmemAllocator->shmem_lock;
149 : }
150 : else
151 : {
152 : Size offset;
153 :
154 : /*
155 : * Allocations after this point should go through ShmemAlloc, which
156 : * expects to allocate everything on cache line boundaries. Make sure
157 : * the first allocation begins on a cache line boundary.
158 : */
159 1165 : offset = CACHELINEALIGN(seghdr->content_offset + sizeof(ShmemAllocatorData));
160 1165 : if (offset > seghdr->totalsize)
161 0 : ereport(ERROR,
162 : (errcode(ERRCODE_OUT_OF_MEMORY),
163 : errmsg("out of shared memory (%zu bytes requested)",
164 : offset)));
165 :
166 1165 : ShmemAllocator = (ShmemAllocatorData *) ((char *) seghdr + seghdr->content_offset);
167 :
168 1165 : SpinLockInit(&ShmemAllocator->shmem_lock);
169 1165 : ShmemLock = &ShmemAllocator->shmem_lock;
170 1165 : ShmemAllocator->free_offset = offset;
171 : /* ShmemIndex can't be set up yet (need LWLocks first) */
172 1165 : ShmemAllocator->index = NULL;
173 1165 : ShmemIndex = (HTAB *) NULL;
174 : }
175 1165 : }
176 :
177 : /*
178 : * ShmemAlloc -- allocate max-aligned chunk from shared memory
179 : *
180 : * Throws error if request cannot be satisfied.
181 : *
182 : * Assumes ShmemLock and ShmemSegHdr are initialized.
183 : */
184 : void *
185 3498 : ShmemAlloc(Size size)
186 : {
187 : void *newSpace;
188 : Size allocated_size;
189 :
190 3498 : newSpace = ShmemAllocRaw(size, &allocated_size);
191 3498 : if (!newSpace)
192 0 : ereport(ERROR,
193 : (errcode(ERRCODE_OUT_OF_MEMORY),
194 : errmsg("out of shared memory (%zu bytes requested)",
195 : size)));
196 3498 : return newSpace;
197 : }
198 :
199 : /*
200 : * ShmemAllocNoError -- allocate max-aligned chunk from shared memory
201 : *
202 : * As ShmemAlloc, but returns NULL if out of space, rather than erroring.
203 : */
204 : void *
205 471241 : ShmemAllocNoError(Size size)
206 : {
207 : Size allocated_size;
208 :
209 471241 : return ShmemAllocRaw(size, &allocated_size);
210 : }
211 :
212 : /*
213 : * ShmemAllocRaw -- allocate align chunk and return allocated size
214 : *
215 : * Also sets *allocated_size to the number of bytes allocated, which will
216 : * be equal to the number requested plus any padding we choose to add.
217 : */
218 : static void *
219 560948 : ShmemAllocRaw(Size size, Size *allocated_size)
220 : {
221 : Size newStart;
222 : Size newFree;
223 : void *newSpace;
224 :
225 : /*
226 : * Ensure all space is adequately aligned. We used to only MAXALIGN this
227 : * space but experience has proved that on modern systems that is not good
228 : * enough. Many parts of the system are very sensitive to critical data
229 : * structures getting split across cache line boundaries. To avoid that,
230 : * attempt to align the beginning of the allocation to a cache line
231 : * boundary. The calling code will still need to be careful about how it
232 : * uses the allocated space - e.g. by padding each element in an array of
233 : * structures out to a power-of-two size - but without this, even that
234 : * won't be sufficient.
235 : */
236 560948 : size = CACHELINEALIGN(size);
237 560948 : *allocated_size = size;
238 :
239 : Assert(ShmemSegHdr != NULL);
240 :
241 560948 : SpinLockAcquire(ShmemLock);
242 :
243 560948 : newStart = ShmemAllocator->free_offset;
244 :
245 560948 : newFree = newStart + size;
246 560948 : if (newFree <= ShmemSegHdr->totalsize)
247 : {
248 560948 : newSpace = (char *) ShmemBase + newStart;
249 560948 : ShmemAllocator->free_offset = newFree;
250 : }
251 : else
252 0 : newSpace = NULL;
253 :
254 560948 : SpinLockRelease(ShmemLock);
255 :
256 : /* note this assert is okay with newSpace == NULL */
257 : Assert(newSpace == (void *) CACHELINEALIGN(newSpace));
258 :
259 560948 : return newSpace;
260 : }
261 :
262 : /*
263 : * ShmemAddrIsValid -- test if an address refers to shared memory
264 : *
265 : * Returns true if the pointer points within the shared memory segment.
266 : */
267 : bool
268 0 : ShmemAddrIsValid(const void *addr)
269 : {
270 0 : return (addr >= ShmemBase) && (addr < ShmemEnd);
271 : }
272 :
273 : /*
274 : * InitShmemIndex() --- set up or attach to shmem index table.
275 : */
276 : void
277 1165 : InitShmemIndex(void)
278 : {
279 : HASHCTL info;
280 :
281 : /*
282 : * Create the shared memory shmem index.
283 : *
284 : * Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex
285 : * hashtable to exist already, we have a bit of a circularity problem in
286 : * initializing the ShmemIndex itself. The special "ShmemIndex" hash
287 : * table name will tell ShmemInitStruct to fake it.
288 : */
289 1165 : info.keysize = SHMEM_INDEX_KEYSIZE;
290 1165 : info.entrysize = sizeof(ShmemIndexEnt);
291 :
292 1165 : ShmemIndex = ShmemInitHash("ShmemIndex",
293 : SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
294 : &info,
295 : HASH_ELEM | HASH_STRINGS);
296 1165 : }
297 :
298 : /*
299 : * ShmemInitHash -- Create and initialize, or attach to, a
300 : * shared memory hash table.
301 : *
302 : * We assume caller is doing some kind of synchronization
303 : * so that two processes don't try to create/initialize the same
304 : * table at once. (In practice, all creations are done in the postmaster
305 : * process; child processes should always be attaching to existing tables.)
306 : *
307 : * max_size is the estimated maximum number of hashtable entries. This is
308 : * not a hard limit, but the access efficiency will degrade if it is
309 : * exceeded substantially (since it's used to compute directory size and
310 : * the hash table buckets will get overfull).
311 : *
312 : * init_size is the number of hashtable entries to preallocate. For a table
313 : * whose maximum size is certain, this should be equal to max_size; that
314 : * ensures that no run-time out-of-shared-memory failures can occur.
315 : *
316 : * *infoP and hash_flags must specify at least the entry sizes and key
317 : * comparison semantics (see hash_create()). Flag bits and values specific
318 : * to shared-memory hash tables are added here, except that callers may
319 : * choose to specify HASH_PARTITION and/or HASH_FIXED_SIZE.
320 : *
321 : * Note: before Postgres 9.0, this function returned NULL for some failure
322 : * cases. Now, it always throws error instead, so callers need not check
323 : * for NULL.
324 : */
325 : HTAB *
326 10492 : ShmemInitHash(const char *name, /* table string name for shmem index */
327 : int64 init_size, /* initial table size */
328 : int64 max_size, /* max size of the table */
329 : HASHCTL *infoP, /* info about key and bucket size */
330 : int hash_flags) /* info about infoP */
331 : {
332 : bool found;
333 : void *location;
334 :
335 : /*
336 : * Hash tables allocated in shared memory have a fixed directory; it can't
337 : * grow or other backends wouldn't be able to find it. So, make sure we
338 : * make it big enough to start with.
339 : *
340 : * The shared memory allocator must be specified too.
341 : */
342 10492 : infoP->dsize = infoP->max_dsize = hash_select_dirsize(max_size);
343 10492 : infoP->alloc = ShmemAllocNoError;
344 10492 : hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE;
345 :
346 : /* look it up in the shmem index */
347 10492 : location = ShmemInitStruct(name,
348 : hash_get_shared_size(infoP, hash_flags),
349 : &found);
350 :
351 : /*
352 : * if it already exists, attach to it rather than allocate and initialize
353 : * new space
354 : */
355 10492 : if (found)
356 0 : hash_flags |= HASH_ATTACH;
357 :
358 : /* Pass location of hashtable header to hash_create */
359 10492 : infoP->hctl = (HASHHDR *) location;
360 :
361 10492 : return hash_create(name, init_size, infoP, hash_flags);
362 : }
363 :
364 : /*
365 : * ShmemInitStruct -- Create/attach to a structure in shared memory.
366 : *
367 : * This is called during initialization to find or allocate
368 : * a data structure in shared memory. If no other process
369 : * has created the structure, this routine allocates space
370 : * for it. If it exists already, a pointer to the existing
371 : * structure is returned.
372 : *
373 : * Returns: pointer to the object. *foundPtr is set true if the object was
374 : * already in the shmem index (hence, already initialized).
375 : *
376 : * Note: before Postgres 9.0, this function returned NULL for some failure
377 : * cases. Now, it always throws error instead, so callers need not check
378 : * for NULL.
379 : */
380 : void *
381 87374 : ShmemInitStruct(const char *name, Size size, bool *foundPtr)
382 : {
383 : ShmemIndexEnt *result;
384 : void *structPtr;
385 :
386 87374 : LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);
387 :
388 87374 : if (!ShmemIndex)
389 : {
390 : /* Must be trying to create/attach to ShmemIndex itself */
391 : Assert(strcmp(name, "ShmemIndex") == 0);
392 :
393 1165 : if (IsUnderPostmaster)
394 : {
395 : /* Must be initializing a (non-standalone) backend */
396 : Assert(ShmemAllocator->index != NULL);
397 0 : structPtr = ShmemAllocator->index;
398 0 : *foundPtr = true;
399 : }
400 : else
401 : {
402 : /*
403 : * If the shmem index doesn't exist, we are bootstrapping: we must
404 : * be trying to init the shmem index itself.
405 : *
406 : * Notice that the ShmemIndexLock is released before the shmem
407 : * index has been initialized. This should be OK because no other
408 : * process can be accessing shared memory yet.
409 : */
410 : Assert(ShmemAllocator->index == NULL);
411 1165 : structPtr = ShmemAlloc(size);
412 1165 : ShmemAllocator->index = structPtr;
413 1165 : *foundPtr = false;
414 : }
415 1165 : LWLockRelease(ShmemIndexLock);
416 1165 : return structPtr;
417 : }
418 :
419 : /* look it up in the shmem index */
420 : result = (ShmemIndexEnt *)
421 86209 : hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr);
422 :
423 86209 : if (!result)
424 : {
425 0 : LWLockRelease(ShmemIndexLock);
426 0 : ereport(ERROR,
427 : (errcode(ERRCODE_OUT_OF_MEMORY),
428 : errmsg("could not create ShmemIndex entry for data structure \"%s\"",
429 : name)));
430 : }
431 :
432 86209 : if (*foundPtr)
433 : {
434 : /*
435 : * Structure is in the shmem index so someone else has allocated it
436 : * already. The size better be the same as the size we are trying to
437 : * initialize to, or there is a name conflict (or worse).
438 : */
439 0 : if (result->size != size)
440 : {
441 0 : LWLockRelease(ShmemIndexLock);
442 0 : ereport(ERROR,
443 : (errmsg("ShmemIndex entry size is wrong for data structure"
444 : " \"%s\": expected %zu, actual %zu",
445 : name, size, result->size)));
446 : }
447 0 : structPtr = result->location;
448 : }
449 : else
450 : {
451 : Size allocated_size;
452 :
453 : /* It isn't in the table yet. allocate and initialize it */
454 86209 : structPtr = ShmemAllocRaw(size, &allocated_size);
455 86209 : if (structPtr == NULL)
456 : {
457 : /* out of memory; remove the failed ShmemIndex entry */
458 0 : hash_search(ShmemIndex, name, HASH_REMOVE, NULL);
459 0 : LWLockRelease(ShmemIndexLock);
460 0 : ereport(ERROR,
461 : (errcode(ERRCODE_OUT_OF_MEMORY),
462 : errmsg("not enough shared memory for data structure"
463 : " \"%s\" (%zu bytes requested)",
464 : name, size)));
465 : }
466 86209 : result->size = size;
467 86209 : result->allocated_size = allocated_size;
468 86209 : result->location = structPtr;
469 : }
470 :
471 86209 : LWLockRelease(ShmemIndexLock);
472 :
473 : Assert(ShmemAddrIsValid(structPtr));
474 :
475 : Assert(structPtr == (void *) CACHELINEALIGN(structPtr));
476 :
477 86209 : return structPtr;
478 : }
479 :
480 :
481 : /*
482 : * Add two Size values, checking for overflow
483 : */
484 : Size
485 587417 : add_size(Size s1, Size s2)
486 : {
487 : Size result;
488 :
489 587417 : if (pg_add_size_overflow(s1, s2, &result))
490 0 : ereport(ERROR,
491 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
492 : errmsg("requested shared memory size overflows size_t")));
493 587417 : return result;
494 : }
495 :
496 : /*
497 : * Multiply two Size values, checking for overflow
498 : */
499 : Size
500 277784 : mul_size(Size s1, Size s2)
501 : {
502 : Size result;
503 :
504 277784 : if (pg_mul_size_overflow(s1, s2, &result))
505 0 : ereport(ERROR,
506 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
507 : errmsg("requested shared memory size overflows size_t")));
508 277784 : return result;
509 : }
510 :
/*
 * SQL SRF showing allocated shared memory
 *
 * Emits one row per named ShmemIndex entry (name, offset from segment
 * start, requested size, padded/allocated size), plus two synthetic rows:
 * "<anonymous>" for shmem allocated without an index entry, and a
 * NULL-named row for the still-unused remainder of the segment.
 */
Datum
pg_get_shmem_allocations(PG_FUNCTION_ARGS)
{
#define PG_GET_SHMEM_SIZES_COLS 4
	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
	HASH_SEQ_STATUS hstat;
	ShmemIndexEnt *ent;
	Size		named_allocated = 0;
	Datum		values[PG_GET_SHMEM_SIZES_COLS];
	bool		nulls[PG_GET_SHMEM_SIZES_COLS];

	InitMaterializedSRF(fcinfo, 0);

	/* shared lock suffices: we only read the index */
	LWLockAcquire(ShmemIndexLock, LW_SHARED);

	hash_seq_init(&hstat, ShmemIndex);

	/* output all allocated entries */
	memset(nulls, 0, sizeof(nulls));
	while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
	{
		values[0] = CStringGetTextDatum(ent->key);
		/* offset is measured from the segment header, not ShmemBase+content */
		values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr);
		values[2] = Int64GetDatum(ent->size);
		values[3] = Int64GetDatum(ent->allocated_size);
		named_allocated += ent->allocated_size;

		tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
							 values, nulls);
	}

	/* output shared memory allocated but not counted via the shmem index */
	values[0] = CStringGetTextDatum("<anonymous>");
	/* anonymous allocations have no single offset */
	nulls[1] = true;
	values[2] = Int64GetDatum(ShmemAllocator->free_offset - named_allocated);
	values[3] = values[2];
	tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);

	/* output as-of-yet unused shared memory */
	nulls[0] = true;
	values[1] = Int64GetDatum(ShmemAllocator->free_offset);
	nulls[1] = false;			/* un-NULL column 1 set above */
	values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemAllocator->free_offset);
	values[3] = values[2];
	tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);

	LWLockRelease(ShmemIndexLock);

	return (Datum) 0;
}
562 :
563 : /*
564 : * SQL SRF showing NUMA memory nodes for allocated shared memory
565 : *
566 : * Compared to pg_get_shmem_allocations(), this function does not return
567 : * information about shared anonymous allocations and unused shared memory.
568 : */
569 : Datum
570 3 : pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
571 : {
572 : #define PG_GET_SHMEM_NUMA_SIZES_COLS 3
573 3 : ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
574 : HASH_SEQ_STATUS hstat;
575 : ShmemIndexEnt *ent;
576 : Datum values[PG_GET_SHMEM_NUMA_SIZES_COLS];
577 : bool nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
578 : Size os_page_size;
579 : void **page_ptrs;
580 : int *pages_status;
581 : uint64 shm_total_page_count,
582 : shm_ent_page_count,
583 : max_nodes;
584 : Size *nodes;
585 :
586 3 : if (pg_numa_init() == -1)
587 3 : elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
588 :
589 0 : InitMaterializedSRF(fcinfo, 0);
590 :
591 0 : max_nodes = pg_numa_get_max_node();
592 0 : nodes = palloc_array(Size, max_nodes + 2);
593 :
594 : /*
595 : * Shared memory allocations can vary in size and may not align with OS
596 : * memory page boundaries, while NUMA queries work on pages.
597 : *
598 : * To correctly map each allocation to NUMA nodes, we need to: 1.
599 : * Determine the OS memory page size. 2. Align each allocation's start/end
600 : * addresses to page boundaries. 3. Query NUMA node information for all
601 : * pages spanning the allocation.
602 : */
603 0 : os_page_size = pg_get_shmem_pagesize();
604 :
605 : /*
606 : * Allocate memory for page pointers and status based on total shared
607 : * memory size. This simplified approach allocates enough space for all
608 : * pages in shared memory rather than calculating the exact requirements
609 : * for each segment.
610 : *
611 : * Add 1, because we don't know how exactly the segments align to OS
612 : * pages, so the allocation might use one more memory page. In practice
613 : * this is not very likely, and moreover we have more entries, each of
614 : * them using only fraction of the total pages.
615 : */
616 0 : shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
617 0 : page_ptrs = palloc0_array(void *, shm_total_page_count);
618 0 : pages_status = palloc_array(int, shm_total_page_count);
619 :
620 0 : if (firstNumaTouch)
621 0 : elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
622 :
623 0 : LWLockAcquire(ShmemIndexLock, LW_SHARED);
624 :
625 0 : hash_seq_init(&hstat, ShmemIndex);
626 :
627 : /* output all allocated entries */
628 0 : while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
629 : {
630 : int i;
631 : char *startptr,
632 : *endptr;
633 : Size total_len;
634 :
635 : /*
636 : * Calculate the range of OS pages used by this segment. The segment
637 : * may start / end half-way through a page, we want to count these
638 : * pages too. So we align the start/end pointers down/up, and then
639 : * calculate the number of pages from that.
640 : */
641 0 : startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
642 0 : endptr = (char *) TYPEALIGN(os_page_size,
643 : (char *) ent->location + ent->allocated_size);
644 0 : total_len = (endptr - startptr);
645 :
646 0 : shm_ent_page_count = total_len / os_page_size;
647 :
648 : /*
649 : * If we ever get 0xff (-1) back from kernel inquiry, then we probably
650 : * have a bug in mapping buffers to OS pages.
651 : */
652 0 : memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
653 :
654 : /*
655 : * Setup page_ptrs[] with pointers to all OS pages for this segment,
656 : * and get the NUMA status using pg_numa_query_pages.
657 : *
658 : * In order to get reliable results we also need to touch memory
659 : * pages, so that inquiry about NUMA memory node doesn't return -2
660 : * (ENOENT, which indicates unmapped/unallocated pages).
661 : */
662 0 : for (i = 0; i < shm_ent_page_count; i++)
663 : {
664 0 : page_ptrs[i] = startptr + (i * os_page_size);
665 :
666 0 : if (firstNumaTouch)
667 : pg_numa_touch_mem_if_required(page_ptrs[i]);
668 :
669 0 : CHECK_FOR_INTERRUPTS();
670 : }
671 :
672 0 : if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
673 0 : elog(ERROR, "failed NUMA pages inquiry status: %m");
674 :
675 : /* Count number of NUMA nodes used for this shared memory entry */
676 0 : memset(nodes, 0, sizeof(Size) * (max_nodes + 2));
677 :
678 0 : for (i = 0; i < shm_ent_page_count; i++)
679 : {
680 0 : int s = pages_status[i];
681 :
682 : /* Ensure we are adding only valid index to the array */
683 0 : if (s >= 0 && s <= max_nodes)
684 : {
685 : /* valid NUMA node */
686 0 : nodes[s]++;
687 0 : continue;
688 : }
689 0 : else if (s == -2)
690 : {
691 : /* -2 means ENOENT (e.g. page was moved to swap) */
692 0 : nodes[max_nodes + 1]++;
693 0 : continue;
694 : }
695 :
696 0 : elog(ERROR, "invalid NUMA node id outside of allowed range "
697 : "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
698 : }
699 :
700 : /* no NULLs for regular nodes */
701 0 : memset(nulls, 0, sizeof(nulls));
702 :
703 : /*
704 : * Add one entry for each NUMA node, including those without allocated
705 : * memory for this segment.
706 : */
707 0 : for (i = 0; i <= max_nodes; i++)
708 : {
709 0 : values[0] = CStringGetTextDatum(ent->key);
710 0 : values[1] = Int32GetDatum(i);
711 0 : values[2] = Int64GetDatum(nodes[i] * os_page_size);
712 :
713 0 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
714 : values, nulls);
715 : }
716 :
717 : /* The last entry is used for pages without a NUMA node. */
718 0 : nulls[1] = true;
719 0 : values[0] = CStringGetTextDatum(ent->key);
720 0 : values[2] = Int64GetDatum(nodes[max_nodes + 1] * os_page_size);
721 :
722 0 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
723 : values, nulls);
724 : }
725 :
726 0 : LWLockRelease(ShmemIndexLock);
727 0 : firstNumaTouch = false;
728 :
729 0 : return (Datum) 0;
730 : }
731 :
732 : /*
733 : * Determine the memory page size used for the shared memory segment.
734 : *
735 : * If the shared segment was allocated using huge pages, returns the size of
736 : * a huge page. Otherwise returns the size of regular memory page.
737 : *
738 : * This should be used only after the server is started.
739 : */
740 : Size
741 2 : pg_get_shmem_pagesize(void)
742 : {
743 : Size os_page_size;
744 : #ifdef WIN32
745 : SYSTEM_INFO sysinfo;
746 :
747 : GetSystemInfo(&sysinfo);
748 : os_page_size = sysinfo.dwPageSize;
749 : #else
750 2 : os_page_size = sysconf(_SC_PAGESIZE);
751 : #endif
752 :
753 : Assert(IsUnderPostmaster);
754 : Assert(huge_pages_status != HUGE_PAGES_UNKNOWN);
755 :
756 2 : if (huge_pages_status == HUGE_PAGES_ON)
757 0 : GetHugePageSize(&os_page_size, NULL);
758 :
759 2 : return os_page_size;
760 : }
761 :
762 : Datum
763 4 : pg_numa_available(PG_FUNCTION_ARGS)
764 : {
765 4 : PG_RETURN_BOOL(pg_numa_init() != -1);
766 : }
|