Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * shmem.c
4 : * create shared memory and initialize shared memory data structures.
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/storage/ipc/shmem.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : /*
16 : * POSTGRES processes share one or more regions of shared memory.
17 : * The shared memory is created by a postmaster and is inherited
18 : * by each backend via fork() (or, in some ports, via other OS-specific
19 : * methods). The routines in this file are used for allocating and
20 : * binding to shared memory data structures.
21 : *
22 : * NOTES:
23 : * (a) There are three kinds of shared memory data structures
24 : * available to POSTGRES: fixed-size structures, queues and hash
25 : * tables. Fixed-size structures contain things like global variables
26 : * for a module and should never be allocated after the shared memory
27 : * initialization phase. Hash tables have a fixed maximum size, but
28 : * their actual size can vary dynamically. When entries are added
29 : * to the table, more space is allocated. Queues link data structures
30 : * that have been allocated either within fixed-size structures or as hash
31 : * buckets. Each shared data structure has a string name to identify
32 : * it (assigned in the module that declares it).
33 : *
34 : * (b) During initialization, each module looks for its
35 : * shared data structures in a hash table called the "Shmem Index".
36 : * If the data structure is not present, the caller can allocate
37 : * a new one and initialize it. If the data structure is present,
38 : * the caller "attaches" to the structure by initializing a pointer
39 : * in the local address space.
40 : * The shmem index has two purposes: first, it gives us
41 : * a simple model of how the world looks when a backend process
42 : * initializes. If something is present in the shmem index,
43 : * it is initialized. If it is not, it is uninitialized. Second,
44 : * the shmem index allows us to allocate shared memory on demand
45 : * instead of trying to preallocate structures and hard-wire the
46 : * sizes and locations in header files. If you are using a lot
47 : * of shared memory in a lot of different places (and changing
48 : * things during development), this is important.
49 : *
50 : * (c) In standard Unix-ish environments, individual backends do not
51 : * need to re-establish their local pointers into shared memory, because
52 : * they inherit correct values of those variables via fork() from the
53 : * postmaster. However, this does not work in the EXEC_BACKEND case.
54 : * In ports using EXEC_BACKEND, new backends have to set up their local
55 : * pointers using the method described in (b) above.
56 : *
57 : * (d) memory allocation model: shared memory can never be
58 : * freed, once allocated. Each hash table has its own free list,
59 : * so hash buckets can be reused when an item is deleted. However,
60 : * if one hash table grows very large and then shrinks, its space
61 : * cannot be redistributed to other tables. We could build a simple
62 : * hash bucket garbage collector if need be. Right now, it seems
63 : * unnecessary.
64 : */
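/*
 * Illustrative sketch (not part of this module): the create-or-attach
 * pattern described in note (b) above, written with hypothetical names.
 * A module asks ShmemInitStruct() for its structure by name; the first
 * caller allocates and initializes it, while later callers merely attach
 * by saving the returned pointer.  Per note (d), the space is never freed.
 */
typedef struct MyModuleSharedState
{
	slock_t		mutex;			/* protects counter below */
	uint64		counter;		/* some module-wide shared state */
} MyModuleSharedState;

static MyModuleSharedState *MyModuleState = NULL;

static void
MyModuleShmemInit(void)
{
	bool		found;

	MyModuleState = (MyModuleSharedState *)
		ShmemInitStruct("My Module State", sizeof(MyModuleSharedState), &found);

	if (!found)
	{
		/* first process through: initialize the contents */
		SpinLockInit(&MyModuleState->mutex);
		MyModuleState->counter = 0;
	}
	/* else we have simply attached to an already-initialized structure */
}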
65 :
66 : #include "postgres.h"
67 :
68 : #include "common/int.h"
69 : #include "fmgr.h"
70 : #include "funcapi.h"
71 : #include "miscadmin.h"
72 : #include "port/pg_numa.h"
73 : #include "storage/lwlock.h"
74 : #include "storage/pg_shmem.h"
75 : #include "storage/shmem.h"
76 : #include "storage/spin.h"
77 : #include "utils/builtins.h"
78 :
79 : static void *ShmemAllocRaw(Size size, Size *allocated_size);
80 : static void *ShmemAllocUnlocked(Size size);
81 :
82 : /* shared memory global variables */
83 :
84 : static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */
85 :
86 : static void *ShmemBase; /* start address of shared memory */
87 :
88 : static void *ShmemEnd; /* end+1 address of shared memory */
89 :
90 : slock_t *ShmemLock; /* spinlock for shared memory and LWLock
91 : * allocation */
92 :
93 : static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
94 :
95 : /* To get reliable results for NUMA inquiry we need to "touch pages" once */
96 : static bool firstNumaTouch = true;
97 :
98 : Datum pg_numa_available(PG_FUNCTION_ARGS);
99 :
100 : /*
101 : * InitShmemAccess() --- set up basic pointers to shared memory.
102 : */
103 : void
104 2280 : InitShmemAccess(PGShmemHeader *seghdr)
105 : {
106 2280 : ShmemSegHdr = seghdr;
107 2280 : ShmemBase = seghdr;
108 2280 : ShmemEnd = (char *) ShmemBase + seghdr->totalsize;
109 2280 : }
110 :
111 : /*
112 : * InitShmemAllocation() --- set up shared-memory space allocation.
113 : *
114 : * This should be called only in the postmaster or a standalone backend.
115 : */
116 : void
117 2280 : InitShmemAllocation(void)
118 : {
119 2280 : PGShmemHeader *shmhdr = ShmemSegHdr;
120 : char *aligned;
121 :
122 : Assert(shmhdr != NULL);
123 :
124 : /*
125 : * Initialize the spinlock used by ShmemAlloc. We must use
126 : * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet.
127 : */
128 2280 : ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t));
129 :
130 2280 : SpinLockInit(ShmemLock);
131 :
132 : /*
133 : * Allocations after this point should go through ShmemAlloc, which
134 : * expects to allocate everything on cache line boundaries. Make sure the
135 : * first allocation begins on a cache line boundary.
136 : */
137 2280 : aligned = (char *)
138 2280 : (CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset)));
139 2280 : shmhdr->freeoffset = aligned - (char *) shmhdr;
140 :
141 : /* ShmemIndex can't be set up yet (need LWLocks first) */
142 2280 : shmhdr->index = NULL;
143 2280 : ShmemIndex = (HTAB *) NULL;
144 2280 : }
145 :
146 : /*
147 : * ShmemAlloc -- allocate max-aligned chunk from shared memory
148 : *
149 : * Throws error if request cannot be satisfied.
150 : *
151 : * Assumes ShmemLock and ShmemSegHdr are initialized.
152 : */
153 : void *
154 6846 : ShmemAlloc(Size size)
155 : {
156 : void *newSpace;
157 : Size allocated_size;
158 :
159 6846 : newSpace = ShmemAllocRaw(size, &allocated_size);
160 6846 : if (!newSpace)
161 0 : ereport(ERROR,
162 : (errcode(ERRCODE_OUT_OF_MEMORY),
163 : errmsg("out of shared memory (%zu bytes requested)",
164 : size)));
165 6846 : return newSpace;
166 : }
167 :
168 : /*
169 : * ShmemAllocNoError -- allocate max-aligned chunk from shared memory
170 : *
171 : * As ShmemAlloc, but returns NULL if out of space, rather than erroring.
172 : */
173 : void *
174 920326 : ShmemAllocNoError(Size size)
175 : {
176 : Size allocated_size;
177 :
178 920326 : return ShmemAllocRaw(size, &allocated_size);
179 : }
180 :
181 : /*
182 : * ShmemAllocRaw -- allocate aligned chunk and return allocated size
183 : *
184 : * Also sets *allocated_size to the number of bytes allocated, which will
185 : * be equal to the number requested plus any padding we choose to add.
186 : */
187 : static void *
188 1098170 : ShmemAllocRaw(Size size, Size *allocated_size)
189 : {
190 : Size newStart;
191 : Size newFree;
192 : void *newSpace;
193 :
194 : /*
195 : * Ensure all space is adequately aligned. We used to only MAXALIGN this
196 : * space but experience has proved that on modern systems that is not good
197 : * enough. Many parts of the system are very sensitive to critical data
198 : * structures getting split across cache line boundaries. To avoid that,
199 : * attempt to align the beginning of the allocation to a cache line
200 : * boundary. The calling code will still need to be careful about how it
201 : * uses the allocated space - e.g. by padding each element in an array of
202 : * structures out to a power-of-two size - but without this, even that
203 : * won't be sufficient.
204 : */
205 1098170 : size = CACHELINEALIGN(size);
206 1098170 : *allocated_size = size;
207 :
208 : Assert(ShmemSegHdr != NULL);
209 :
210 1098170 : SpinLockAcquire(ShmemLock);
211 :
212 1098170 : newStart = ShmemSegHdr->freeoffset;
213 :
214 1098170 : newFree = newStart + size;
215 1098170 : if (newFree <= ShmemSegHdr->totalsize)
216 : {
217 1098170 : newSpace = (char *) ShmemBase + newStart;
218 1098170 : ShmemSegHdr->freeoffset = newFree;
219 : }
220 : else
221 0 : newSpace = NULL;
222 :
223 1098170 : SpinLockRelease(ShmemLock);
224 :
225 : /* note this assert is okay with newSpace == NULL */
226 : Assert(newSpace == (void *) CACHELINEALIGN(newSpace));
227 :
228 1098170 : return newSpace;
229 : }
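/*
 * Illustrative sketch (not part of this module): the caller-side padding
 * that the comment in ShmemAllocRaw() refers to.  Even though the start of
 * an allocation is cache-line aligned, individual elements of an array can
 * still straddle cache lines unless each element is padded out; a common
 * idiom is a union padded to PG_CACHE_LINE_SIZE.  The types below are
 * hypothetical.
 */
typedef struct MyCounter
{
	slock_t		mutex;
	uint64		value;
} MyCounter;

typedef union MyCounterPadded
{
	MyCounter	counter;
	char		pad[PG_CACHE_LINE_SIZE];
} MyCounterPadded;

/*
 * An array of the padded type, e.g.
 *
 *		MyCounterPadded *counters =
 *			ShmemAlloc(mul_size(ncounters, sizeof(MyCounterPadded)));
 *
 * keeps each element on its own cache line.
 */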
230 :
231 : /*
232 : * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory
233 : *
234 : * Allocate space without locking ShmemLock. This should be used for,
235 : * and only for, allocations that must happen before ShmemLock is ready.
236 : *
237 : * We consider maxalign, rather than cachealign, sufficient here.
238 : */
239 : static void *
240 2280 : ShmemAllocUnlocked(Size size)
241 : {
242 : Size newStart;
243 : Size newFree;
244 : void *newSpace;
245 :
246 : /*
247 : * Ensure allocated space is adequately aligned.
248 : */
249 2280 : size = MAXALIGN(size);
250 :
251 : Assert(ShmemSegHdr != NULL);
252 :
253 2280 : newStart = ShmemSegHdr->freeoffset;
254 :
255 2280 : newFree = newStart + size;
256 2280 : if (newFree > ShmemSegHdr->totalsize)
257 0 : ereport(ERROR,
258 : (errcode(ERRCODE_OUT_OF_MEMORY),
259 : errmsg("out of shared memory (%zu bytes requested)",
260 : size)));
261 2280 : ShmemSegHdr->freeoffset = newFree;
262 :
263 2280 : newSpace = (char *) ShmemBase + newStart;
264 :
265 : Assert(newSpace == (void *) MAXALIGN(newSpace));
266 :
267 2280 : return newSpace;
268 : }
269 :
270 : /*
271 : * ShmemAddrIsValid -- test if an address refers to shared memory
272 : *
273 : * Returns true if the pointer points within the shared memory segment.
274 : */
275 : bool
276 0 : ShmemAddrIsValid(const void *addr)
277 : {
278 0 : return (addr >= ShmemBase) && (addr < ShmemEnd);
279 : }
280 :
281 : /*
282 : * InitShmemIndex() --- set up or attach to shmem index table.
283 : */
284 : void
285 2280 : InitShmemIndex(void)
286 : {
287 : HASHCTL info;
288 :
289 : /*
290 : * Create the shared memory shmem index.
291 : *
292 : * Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex
293 : * hashtable to exist already, we have a bit of a circularity problem in
294 : * initializing the ShmemIndex itself. The special "ShmemIndex" hash
295 : * table name will tell ShmemInitStruct to fake it.
296 : */
297 2280 : info.keysize = SHMEM_INDEX_KEYSIZE;
298 2280 : info.entrysize = sizeof(ShmemIndexEnt);
299 :
300 2280 : ShmemIndex = ShmemInitHash("ShmemIndex",
301 : SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
302 : &info,
303 : HASH_ELEM | HASH_STRINGS);
304 2280 : }
305 :
306 : /*
307 : * ShmemInitHash -- Create and initialize, or attach to, a
308 : * shared memory hash table.
309 : *
310 : * We assume caller is doing some kind of synchronization
311 : * so that two processes don't try to create/initialize the same
312 : * table at once. (In practice, all creations are done in the postmaster
313 : * process; child processes should always be attaching to existing tables.)
314 : *
315 : * max_size is the estimated maximum number of hashtable entries. This is
316 : * not a hard limit, but the access efficiency will degrade if it is
317 : * exceeded substantially (since it's used to compute directory size and
318 : * the hash table buckets will get overfull).
319 : *
320 : * init_size is the number of hashtable entries to preallocate. For a table
321 : * whose maximum size is certain, this should be equal to max_size; that
322 : * ensures that no run-time out-of-shared-memory failures can occur.
323 : *
324 : * *infoP and hash_flags must specify at least the entry sizes and key
325 : * comparison semantics (see hash_create()). Flag bits and values specific
326 : * to shared-memory hash tables are added here, except that callers may
327 : * choose to specify HASH_PARTITION and/or HASH_FIXED_SIZE.
328 : *
329 : * Note: before Postgres 9.0, this function returned NULL for some failure
330 : * cases. Now, it always throws error instead, so callers need not check
331 : * for NULL.
332 : */
333 : HTAB *
334 20534 : ShmemInitHash(const char *name, /* table string name for shmem index */
335 : int64 init_size, /* initial table size */
336 : int64 max_size, /* max size of the table */
337 : HASHCTL *infoP, /* info about key and bucket size */
338 : int hash_flags) /* info about infoP */
339 : {
340 : bool found;
341 : void *location;
342 :
343 : /*
344 : * Hash tables allocated in shared memory have a fixed directory; it can't
345 : * grow or other backends wouldn't be able to find it. So, make sure we
346 : * make it big enough to start with.
347 : *
348 : * The shared memory allocator must be specified too.
349 : */
350 20534 : infoP->dsize = infoP->max_dsize = hash_select_dirsize(max_size);
351 20534 : infoP->alloc = ShmemAllocNoError;
352 20534 : hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE;
353 :
354 : /* look it up in the shmem index */
355 20534 : location = ShmemInitStruct(name,
356 : hash_get_shared_size(infoP, hash_flags),
357 : &found);
358 :
359 : /*
360 : * if it already exists, attach to it rather than allocate and initialize
361 : * new space
362 : */
363 20534 : if (found)
364 0 : hash_flags |= HASH_ATTACH;
365 :
366 : /* Pass location of hashtable header to hash_create */
367 20534 : infoP->hctl = (HASHHDR *) location;
368 :
369 20534 : return hash_create(name, init_size, infoP, hash_flags);
370 : }
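/*
 * Illustrative sketch (not part of this module): creating, or attaching to,
 * a shared hash table via ShmemInitHash(), with hypothetical names.  The
 * caller supplies the key and entry sizes; directory sizing and the shared
 * memory allocator are filled in by ShmemInitHash() itself, as above.
 * Passing init_size == max_size preallocates all entries, so no run-time
 * out-of-shared-memory failures can occur for this table.
 */
typedef struct MyCacheEntry
{
	Oid			key;			/* hash key; must be first field */
	int			usage_count;	/* ... per-entry payload ... */
} MyCacheEntry;

static HTAB *MyCacheHash = NULL;

static void
MyCacheShmemInit(int nentries)
{
	HASHCTL		info;

	info.keysize = sizeof(Oid);
	info.entrysize = sizeof(MyCacheEntry);

	MyCacheHash = ShmemInitHash("My Cache Hash",
								nentries,	/* init_size */
								nentries,	/* max_size */
								&info,
								HASH_ELEM | HASH_BLOBS);
}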
371 :
372 : /*
373 : * ShmemInitStruct -- Create/attach to a structure in shared memory.
374 : *
375 : * This is called during initialization to find or allocate
376 : * a data structure in shared memory. If no other process
377 : * has created the structure, this routine allocates space
378 : * for it. If it exists already, a pointer to the existing
379 : * structure is returned.
380 : *
381 : * Returns: pointer to the object. *foundPtr is set true if the object was
382 : * already in the shmem index (hence, already initialized).
383 : *
384 : * Note: before Postgres 9.0, this function returned NULL for some failure
385 : * cases. Now, it always throws error instead, so callers need not check
386 : * for NULL.
387 : */
388 : void *
389 173278 : ShmemInitStruct(const char *name, Size size, bool *foundPtr)
390 : {
391 : ShmemIndexEnt *result;
392 : void *structPtr;
393 :
394 173278 : LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);
395 :
396 173278 : if (!ShmemIndex)
397 : {
398 2280 : PGShmemHeader *shmemseghdr = ShmemSegHdr;
399 :
400 : /* Must be trying to create/attach to ShmemIndex itself */
401 : Assert(strcmp(name, "ShmemIndex") == 0);
402 :
403 2280 : if (IsUnderPostmaster)
404 : {
405 : /* Must be initializing a (non-standalone) backend */
406 : Assert(shmemseghdr->index != NULL);
407 0 : structPtr = shmemseghdr->index;
408 0 : *foundPtr = true;
409 : }
410 : else
411 : {
412 : /*
413 : * If the shmem index doesn't exist, we are bootstrapping: we must
414 : * be trying to init the shmem index itself.
415 : *
416 : * Notice that the ShmemIndexLock is released before the shmem
417 : * index has been initialized. This should be OK because no other
418 : * process can be accessing shared memory yet.
419 : */
420 : Assert(shmemseghdr->index == NULL);
421 2280 : structPtr = ShmemAlloc(size);
422 2280 : shmemseghdr->index = structPtr;
423 2280 : *foundPtr = false;
424 : }
425 2280 : LWLockRelease(ShmemIndexLock);
426 2280 : return structPtr;
427 : }
428 :
429 : /* look it up in the shmem index */
430 : result = (ShmemIndexEnt *)
431 170998 : hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr);
432 :
433 170998 : if (!result)
434 : {
435 0 : LWLockRelease(ShmemIndexLock);
436 0 : ereport(ERROR,
437 : (errcode(ERRCODE_OUT_OF_MEMORY),
438 : errmsg("could not create ShmemIndex entry for data structure \"%s\"",
439 : name)));
440 : }
441 :
442 170998 : if (*foundPtr)
443 : {
444 : /*
445 : * Structure is in the shmem index so someone else has allocated it
446 : * already. The size better be the same as the size we are trying to
447 : * initialize to, or there is a name conflict (or worse).
448 : */
449 0 : if (result->size != size)
450 : {
451 0 : LWLockRelease(ShmemIndexLock);
452 0 : ereport(ERROR,
453 : (errmsg("ShmemIndex entry size is wrong for data structure"
454 : " \"%s\": expected %zu, actual %zu",
455 : name, size, result->size)));
456 : }
457 0 : structPtr = result->location;
458 : }
459 : else
460 : {
461 : Size allocated_size;
462 :
463 : /* It isn't in the table yet; allocate and initialize it */
464 170998 : structPtr = ShmemAllocRaw(size, &allocated_size);
465 170998 : if (structPtr == NULL)
466 : {
467 : /* out of memory; remove the failed ShmemIndex entry */
468 0 : hash_search(ShmemIndex, name, HASH_REMOVE, NULL);
469 0 : LWLockRelease(ShmemIndexLock);
470 0 : ereport(ERROR,
471 : (errcode(ERRCODE_OUT_OF_MEMORY),
472 : errmsg("not enough shared memory for data structure"
473 : " \"%s\" (%zu bytes requested)",
474 : name, size)));
475 : }
476 170998 : result->size = size;
477 170998 : result->allocated_size = allocated_size;
478 170998 : result->location = structPtr;
479 : }
480 :
481 170998 : LWLockRelease(ShmemIndexLock);
482 :
483 : Assert(ShmemAddrIsValid(structPtr));
484 :
485 : Assert(structPtr == (void *) CACHELINEALIGN(structPtr));
486 :
487 170998 : return structPtr;
488 : }
489 :
490 :
491 : /*
492 : * Add two Size values, checking for overflow
493 : */
494 : Size
495 1139824 : add_size(Size s1, Size s2)
496 : {
497 : Size result;
498 :
499 1139824 : if (pg_add_size_overflow(s1, s2, &result))
500 0 : ereport(ERROR,
501 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
502 : errmsg("requested shared memory size overflows size_t")));
503 1139824 : return result;
504 : }
505 :
506 : /*
507 : * Multiply two Size values, checking for overflow
508 : */
509 : Size
510 534232 : mul_size(Size s1, Size s2)
511 : {
512 : Size result;
513 :
514 534232 : if (pg_mul_size_overflow(s1, s2, &result))
515 0 : ereport(ERROR,
516 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
517 : errmsg("requested shared memory size overflows size_t")));
518 534232 : return result;
519 : }
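/*
 * Illustrative sketch (not part of this module): add_size() and mul_size()
 * are typically used when a module estimates its shared memory needs, so
 * that an absurd configuration results in an error rather than a silent
 * size_t wraparound.  This reuses the hypothetical types from the sketches
 * above; hash_estimate_size() sizes the hypothetical shared hash table.
 */
static Size
MyModuleShmemSize(int nentries)
{
	Size		size = 0;

	size = add_size(size, sizeof(MyModuleSharedState));
	size = add_size(size, mul_size(nentries, sizeof(MyCounterPadded)));
	size = add_size(size, hash_estimate_size(nentries, sizeof(MyCacheEntry)));

	return size;
}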
520 :
521 : /* SQL SRF showing allocated shared memory */
522 : Datum
523 6 : pg_get_shmem_allocations(PG_FUNCTION_ARGS)
524 : {
525 : #define PG_GET_SHMEM_SIZES_COLS 4
526 6 : ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
527 : HASH_SEQ_STATUS hstat;
528 : ShmemIndexEnt *ent;
529 6 : Size named_allocated = 0;
530 : Datum values[PG_GET_SHMEM_SIZES_COLS];
531 : bool nulls[PG_GET_SHMEM_SIZES_COLS];
532 :
533 6 : InitMaterializedSRF(fcinfo, 0);
534 :
535 6 : LWLockAcquire(ShmemIndexLock, LW_SHARED);
536 :
537 6 : hash_seq_init(&hstat, ShmemIndex);
538 :
539 : /* output all allocated entries */
540 6 : memset(nulls, 0, sizeof(nulls));
541 460 : while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
542 : {
543 454 : values[0] = CStringGetTextDatum(ent->key);
544 454 : values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr);
545 454 : values[2] = Int64GetDatum(ent->size);
546 454 : values[3] = Int64GetDatum(ent->allocated_size);
547 454 : named_allocated += ent->allocated_size;
548 :
549 454 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
550 : values, nulls);
551 : }
552 :
553 : /* output shared memory allocated but not counted via the shmem index */
554 6 : values[0] = CStringGetTextDatum("<anonymous>");
555 6 : nulls[1] = true;
556 6 : values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated);
557 6 : values[3] = values[2];
558 6 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
559 :
560 : /* output as-of-yet unused shared memory */
561 6 : nulls[0] = true;
562 6 : values[1] = Int64GetDatum(ShmemSegHdr->freeoffset);
563 6 : nulls[1] = false;
564 6 : values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset);
565 6 : values[3] = values[2];
566 6 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
567 :
568 6 : LWLockRelease(ShmemIndexLock);
569 :
570 6 : return (Datum) 0;
571 : }
572 :
573 : /*
574 : * SQL SRF showing NUMA memory nodes for allocated shared memory
575 : *
576 : * Compared to pg_get_shmem_allocations(), this function does not return
577 : * information about shared anonymous allocations and unused shared memory.
578 : */
579 : Datum
580 6 : pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
581 : {
582 : #define PG_GET_SHMEM_NUMA_SIZES_COLS 3
583 6 : ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
584 : HASH_SEQ_STATUS hstat;
585 : ShmemIndexEnt *ent;
586 : Datum values[PG_GET_SHMEM_NUMA_SIZES_COLS];
587 : bool nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
588 : Size os_page_size;
589 : void **page_ptrs;
590 : int *pages_status;
591 : uint64 shm_total_page_count,
592 : shm_ent_page_count,
593 : max_nodes;
594 : Size *nodes;
595 :
596 6 : if (pg_numa_init() == -1)
597 6 : elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
598 :
599 0 : InitMaterializedSRF(fcinfo, 0);
600 :
601 0 : max_nodes = pg_numa_get_max_node();
602 0 : nodes = palloc_array(Size, max_nodes + 2);
603 :
604 : /*
605 : * Shared memory allocations can vary in size and may not align with OS
606 : * memory page boundaries, while NUMA queries work on pages.
607 : *
608 : * To correctly map each allocation to NUMA nodes, we need to:
609 : * 1. Determine the OS memory page size.
610 : * 2. Align each allocation's start/end addresses to page boundaries.
611 : * 3. Query NUMA node information for all pages spanning the allocation.
612 : */
613 0 : os_page_size = pg_get_shmem_pagesize();
614 :
615 : /*
616 : * Allocate memory for page pointers and status based on total shared
617 : * memory size. This simplified approach allocates enough space for all
618 : * pages in shared memory rather than calculating the exact requirements
619 : * for each segment.
620 : *
621 : * Add 1, because we don't know exactly how the segments align to OS
622 : * pages, so the allocation might use one more memory page. In practice
623 : * this is not very likely, and moreover we have multiple entries, each of
624 : * them using only a fraction of the total pages.
625 : */
626 0 : shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
627 0 : page_ptrs = palloc0_array(void *, shm_total_page_count);
628 0 : pages_status = palloc_array(int, shm_total_page_count);
629 :
630 0 : if (firstNumaTouch)
631 0 : elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
632 :
633 0 : LWLockAcquire(ShmemIndexLock, LW_SHARED);
634 :
635 0 : hash_seq_init(&hstat, ShmemIndex);
636 :
637 : /* output all allocated entries */
638 0 : while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
639 : {
640 : int i;
641 : char *startptr,
642 : *endptr;
643 : Size total_len;
644 :
645 : /*
646 : * Calculate the range of OS pages used by this segment. The segment
647 : * may start / end half-way through a page, we want to count these
648 : * pages too. So we align the start/end pointers down/up, and then
649 : * calculate the number of pages from that.
650 : */
651 0 : startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
652 0 : endptr = (char *) TYPEALIGN(os_page_size,
653 : (char *) ent->location + ent->allocated_size);
654 0 : total_len = (endptr - startptr);
655 :
656 0 : shm_ent_page_count = total_len / os_page_size;
657 :
658 : /*
659 : * If we ever get 0xff (-1) back from kernel inquiry, then we probably
660 : * have a bug in mapping buffers to OS pages.
661 : */
662 0 : memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
663 :
664 : /*
665 : * Setup page_ptrs[] with pointers to all OS pages for this segment,
666 : * and get the NUMA status using pg_numa_query_pages.
667 : *
668 : * In order to get reliable results we also need to touch memory
669 : * pages, so that inquiry about NUMA memory node doesn't return -2
670 : * (ENOENT, which indicates unmapped/unallocated pages).
671 : */
672 0 : for (i = 0; i < shm_ent_page_count; i++)
673 : {
674 0 : page_ptrs[i] = startptr + (i * os_page_size);
675 :
676 0 : if (firstNumaTouch)
677 : pg_numa_touch_mem_if_required(page_ptrs[i]);
678 :
679 0 : CHECK_FOR_INTERRUPTS();
680 : }
681 :
682 0 : if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
683 0 : elog(ERROR, "failed NUMA pages inquiry status: %m");
684 :
685 : /* Count number of NUMA nodes used for this shared memory entry */
686 0 : memset(nodes, 0, sizeof(Size) * (max_nodes + 2));
687 :
688 0 : for (i = 0; i < shm_ent_page_count; i++)
689 : {
690 0 : int s = pages_status[i];
691 :
692 : /* Ensure we are adding only valid index to the array */
693 0 : if (s >= 0 && s <= max_nodes)
694 : {
695 : /* valid NUMA node */
696 0 : nodes[s]++;
697 0 : continue;
698 : }
699 0 : else if (s == -2)
700 : {
701 : /* -2 means ENOENT (e.g. page was moved to swap) */
702 0 : nodes[max_nodes + 1]++;
703 0 : continue;
704 : }
705 :
706 0 : elog(ERROR, "invalid NUMA node id outside of allowed range "
707 : "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
708 : }
709 :
710 : /* no NULLs for regular nodes */
711 0 : memset(nulls, 0, sizeof(nulls));
712 :
713 : /*
714 : * Add one entry for each NUMA node, including those without allocated
715 : * memory for this segment.
716 : */
717 0 : for (i = 0; i <= max_nodes; i++)
718 : {
719 0 : values[0] = CStringGetTextDatum(ent->key);
720 0 : values[1] = Int32GetDatum(i);
721 0 : values[2] = Int64GetDatum(nodes[i] * os_page_size);
722 :
723 0 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
724 : values, nulls);
725 : }
726 :
727 : /* The last entry is used for pages without a NUMA node. */
728 0 : nulls[1] = true;
729 0 : values[0] = CStringGetTextDatum(ent->key);
730 0 : values[2] = Int64GetDatum(nodes[max_nodes + 1] * os_page_size);
731 :
732 0 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
733 : values, nulls);
734 : }
735 :
736 0 : LWLockRelease(ShmemIndexLock);
737 0 : firstNumaTouch = false;
738 :
739 0 : return (Datum) 0;
740 : }
741 :
742 : /*
743 : * Determine the memory page size used for the shared memory segment.
744 : *
745 : * If the shared segment was allocated using huge pages, returns the size of
746 : * a huge page. Otherwise returns the size of a regular memory page.
747 : *
748 : * This should be used only after the server is started.
749 : */
750 : Size
751 4 : pg_get_shmem_pagesize(void)
752 : {
753 : Size os_page_size;
754 : #ifdef WIN32
755 : SYSTEM_INFO sysinfo;
756 :
757 : GetSystemInfo(&sysinfo);
758 : os_page_size = sysinfo.dwPageSize;
759 : #else
760 4 : os_page_size = sysconf(_SC_PAGESIZE);
761 : #endif
762 :
763 : Assert(IsUnderPostmaster);
764 : Assert(huge_pages_status != HUGE_PAGES_UNKNOWN);
765 :
766 4 : if (huge_pages_status == HUGE_PAGES_ON)
767 0 : GetHugePageSize(&os_page_size, NULL);
768 :
769 4 : return os_page_size;
770 : }
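/*
 * Illustrative sketch (not part of this module): counting the OS pages
 * spanned by a shared memory region with pg_get_shmem_pagesize(), mirroring
 * the alignment logic in pg_get_shmem_allocations_numa() above.  Like that
 * function, this is only meaningful after the server has started; the names
 * are hypothetical.
 */
static uint64
MyRegionPageCount(void *region, Size region_size)
{
	Size		os_page_size = pg_get_shmem_pagesize();
	char	   *startptr;
	char	   *endptr;

	/* round the start down and the end up to OS page boundaries */
	startptr = (char *) TYPEALIGN_DOWN(os_page_size, region);
	endptr = (char *) TYPEALIGN(os_page_size, (char *) region + region_size);

	return (uint64) ((endptr - startptr) / os_page_size);
}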
771 :
772 : Datum
773 8 : pg_numa_available(PG_FUNCTION_ARGS)
774 : {
775 8 : PG_RETURN_BOOL(pg_numa_init() != -1);
776 : }