Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * shmem.c
4 : * create shared memory and initialize shared memory data structures.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/storage/ipc/shmem.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : /*
16 : * POSTGRES processes share one or more regions of shared memory.
17 : * The shared memory is created by a postmaster and is inherited
18 : * by each backend via fork() (or, in some ports, via other OS-specific
19 : * methods). The routines in this file are used for allocating and
20 : * binding to shared memory data structures.
21 : *
22 : * NOTES:
23 : * (a) There are three kinds of shared memory data structures
24 : * available to POSTGRES: fixed-size structures, queues and hash
25 : * tables. Fixed-size structures contain things like global variables
26 : * for a module and should never be allocated after the shared memory
27 : * initialization phase. Hash tables have a fixed maximum size, but
28 : * their actual size can vary dynamically. When entries are added
29 : * to the table, more space is allocated. Queues link data structures
30 : * that have been allocated either within fixed-size structures or as hash
31 : * buckets. Each shared data structure has a string name to identify
32 : * it (assigned in the module that declares it).
33 : *
34 : * (b) During initialization, each module looks for its
35 : * shared data structures in a hash table called the "Shmem Index".
36 : * If the data structure is not present, the caller can allocate
37 : * a new one and initialize it. If the data structure is present,
38 : * the caller "attaches" to the structure by initializing a pointer
39 : * in the local address space.
40 : * The shmem index has two purposes: first, it gives us
41 : * a simple model of how the world looks when a backend process
42 : * initializes. If something is present in the shmem index,
43 : * it is initialized. If it is not, it is uninitialized. Second,
44 : * the shmem index allows us to allocate shared memory on demand
45 : * instead of trying to preallocate structures and hard-wire the
46 : * sizes and locations in header files. If you are using a lot
47 : * of shared memory in a lot of different places (and changing
48 : * things during development), this is important.
49 : *
50 : * (c) In standard Unix-ish environments, individual backends do not
51 : * need to re-establish their local pointers into shared memory, because
52 : * they inherit correct values of those variables via fork() from the
53 : * postmaster. However, this does not work in the EXEC_BACKEND case.
54 : * In ports using EXEC_BACKEND, new backends have to set up their local
55 : * pointers using the method described in (b) above.
56 : *
57 : * (d) memory allocation model: shared memory can never be
58 : * freed, once allocated. Each hash table has its own free list,
59 : * so hash buckets can be reused when an item is deleted. However,
60 : * if one hash table grows very large and then shrinks, its space
61 : * cannot be redistributed to other tables. We could build a simple
62 : * hash bucket garbage collector if need be. Right now, it seems
63 : * unnecessary.
64 : */
65 :
66 : #include "postgres.h"
67 :
68 : #include "common/int.h"
69 : #include "fmgr.h"
70 : #include "funcapi.h"
71 : #include "miscadmin.h"
72 : #include "port/pg_numa.h"
73 : #include "storage/lwlock.h"
74 : #include "storage/pg_shmem.h"
75 : #include "storage/shmem.h"
76 : #include "storage/spin.h"
77 : #include "utils/builtins.h"
78 :
/* forward declarations for file-local allocation helpers */
static void *ShmemAllocRaw(Size size, Size *allocated_size);
static void *ShmemAllocUnlocked(Size size);

/* shared memory global variables */

static PGShmemHeader *ShmemSegHdr;	/* shared mem segment header */

static void *ShmemBase;			/* start address of shared memory */

static void *ShmemEnd;			/* end+1 address of shared memory */

slock_t    *ShmemLock;			/* spinlock for shared memory and LWLock
								 * allocation */

static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */

/* To get reliable results for NUMA inquiry we need to "touch pages" once */
static bool firstNumaTouch = true;

/* SQL-callable; see pg_numa_available() below */
Datum		pg_numa_available(PG_FUNCTION_ARGS);
99 :
100 : /*
101 : * InitShmemAccess() --- set up basic pointers to shared memory.
102 : */
103 : void
104 2204 : InitShmemAccess(PGShmemHeader *seghdr)
105 : {
106 2204 : ShmemSegHdr = seghdr;
107 2204 : ShmemBase = seghdr;
108 2204 : ShmemEnd = (char *) ShmemBase + seghdr->totalsize;
109 2204 : }
110 :
/*
 * InitShmemAllocation() --- set up shared-memory space allocation.
 *
 * This should be called only in the postmaster or a standalone backend,
 * after InitShmemAccess() has set up the segment pointers.
 */
void
InitShmemAllocation(void)
{
	PGShmemHeader *shmhdr = ShmemSegHdr;
	char	   *aligned;

	Assert(shmhdr != NULL);

	/*
	 * Initialize the spinlock used by ShmemAlloc.  We must use
	 * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet.
	 */
	ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t));

	SpinLockInit(ShmemLock);

	/*
	 * Allocations after this point should go through ShmemAlloc, which
	 * expects to allocate everything on cache line boundaries.  Make sure the
	 * first allocation begins on a cache line boundary.
	 *
	 * Note that freeoffset is kept as an offset relative to the segment
	 * start, so the value is meaningful in every process that attaches.
	 */
	aligned = (char *)
		(CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset)));
	shmhdr->freeoffset = aligned - (char *) shmhdr;

	/* ShmemIndex can't be set up yet (need LWLocks first) */
	shmhdr->index = NULL;
	ShmemIndex = (HTAB *) NULL;
}
145 :
146 : /*
147 : * ShmemAlloc -- allocate max-aligned chunk from shared memory
148 : *
149 : * Throws error if request cannot be satisfied.
150 : *
151 : * Assumes ShmemLock and ShmemSegHdr are initialized.
152 : */
153 : void *
154 6622 : ShmemAlloc(Size size)
155 : {
156 : void *newSpace;
157 : Size allocated_size;
158 :
159 6622 : newSpace = ShmemAllocRaw(size, &allocated_size);
160 6622 : if (!newSpace)
161 0 : ereport(ERROR,
162 : (errcode(ERRCODE_OUT_OF_MEMORY),
163 : errmsg("out of shared memory (%zu bytes requested)",
164 : size)));
165 6622 : return newSpace;
166 : }
167 :
168 : /*
169 : * ShmemAllocNoError -- allocate max-aligned chunk from shared memory
170 : *
171 : * As ShmemAlloc, but returns NULL if out of space, rather than erroring.
172 : */
173 : void *
174 893112 : ShmemAllocNoError(Size size)
175 : {
176 : Size allocated_size;
177 :
178 893112 : return ShmemAllocRaw(size, &allocated_size);
179 : }
180 :
181 : /*
182 : * ShmemAllocRaw -- allocate align chunk and return allocated size
183 : *
184 : * Also sets *allocated_size to the number of bytes allocated, which will
185 : * be equal to the number requested plus any padding we choose to add.
186 : */
187 : static void *
188 1062828 : ShmemAllocRaw(Size size, Size *allocated_size)
189 : {
190 : Size newStart;
191 : Size newFree;
192 : void *newSpace;
193 :
194 : /*
195 : * Ensure all space is adequately aligned. We used to only MAXALIGN this
196 : * space but experience has proved that on modern systems that is not good
197 : * enough. Many parts of the system are very sensitive to critical data
198 : * structures getting split across cache line boundaries. To avoid that,
199 : * attempt to align the beginning of the allocation to a cache line
200 : * boundary. The calling code will still need to be careful about how it
201 : * uses the allocated space - e.g. by padding each element in an array of
202 : * structures out to a power-of-two size - but without this, even that
203 : * won't be sufficient.
204 : */
205 1062828 : size = CACHELINEALIGN(size);
206 1062828 : *allocated_size = size;
207 :
208 : Assert(ShmemSegHdr != NULL);
209 :
210 1062828 : SpinLockAcquire(ShmemLock);
211 :
212 1062828 : newStart = ShmemSegHdr->freeoffset;
213 :
214 1062828 : newFree = newStart + size;
215 1062828 : if (newFree <= ShmemSegHdr->totalsize)
216 : {
217 1062828 : newSpace = (char *) ShmemBase + newStart;
218 1062828 : ShmemSegHdr->freeoffset = newFree;
219 : }
220 : else
221 0 : newSpace = NULL;
222 :
223 1062828 : SpinLockRelease(ShmemLock);
224 :
225 : /* note this assert is okay with newSpace == NULL */
226 : Assert(newSpace == (void *) CACHELINEALIGN(newSpace));
227 :
228 1062828 : return newSpace;
229 : }
230 :
231 : /*
232 : * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory
233 : *
234 : * Allocate space without locking ShmemLock. This should be used for,
235 : * and only for, allocations that must happen before ShmemLock is ready.
236 : *
237 : * We consider maxalign, rather than cachealign, sufficient here.
238 : */
239 : static void *
240 2204 : ShmemAllocUnlocked(Size size)
241 : {
242 : Size newStart;
243 : Size newFree;
244 : void *newSpace;
245 :
246 : /*
247 : * Ensure allocated space is adequately aligned.
248 : */
249 2204 : size = MAXALIGN(size);
250 :
251 : Assert(ShmemSegHdr != NULL);
252 :
253 2204 : newStart = ShmemSegHdr->freeoffset;
254 :
255 2204 : newFree = newStart + size;
256 2204 : if (newFree > ShmemSegHdr->totalsize)
257 0 : ereport(ERROR,
258 : (errcode(ERRCODE_OUT_OF_MEMORY),
259 : errmsg("out of shared memory (%zu bytes requested)",
260 : size)));
261 2204 : ShmemSegHdr->freeoffset = newFree;
262 :
263 2204 : newSpace = (char *) ShmemBase + newStart;
264 :
265 : Assert(newSpace == (void *) MAXALIGN(newSpace));
266 :
267 2204 : return newSpace;
268 : }
269 :
270 : /*
271 : * ShmemAddrIsValid -- test if an address refers to shared memory
272 : *
273 : * Returns true if the pointer points within the shared memory segment.
274 : */
275 : bool
276 0 : ShmemAddrIsValid(const void *addr)
277 : {
278 0 : return (addr >= ShmemBase) && (addr < ShmemEnd);
279 : }
280 :
281 : /*
282 : * InitShmemIndex() --- set up or attach to shmem index table.
283 : */
284 : void
285 2204 : InitShmemIndex(void)
286 : {
287 : HASHCTL info;
288 :
289 : /*
290 : * Create the shared memory shmem index.
291 : *
292 : * Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex
293 : * hashtable to exist already, we have a bit of a circularity problem in
294 : * initializing the ShmemIndex itself. The special "ShmemIndex" hash
295 : * table name will tell ShmemInitStruct to fake it.
296 : */
297 2204 : info.keysize = SHMEM_INDEX_KEYSIZE;
298 2204 : info.entrysize = sizeof(ShmemIndexEnt);
299 :
300 2204 : ShmemIndex = ShmemInitHash("ShmemIndex",
301 : SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
302 : &info,
303 : HASH_ELEM | HASH_STRINGS);
304 2204 : }
305 :
306 : /*
307 : * ShmemInitHash -- Create and initialize, or attach to, a
308 : * shared memory hash table.
309 : *
310 : * We assume caller is doing some kind of synchronization
311 : * so that two processes don't try to create/initialize the same
312 : * table at once. (In practice, all creations are done in the postmaster
313 : * process; child processes should always be attaching to existing tables.)
314 : *
315 : * max_size is the estimated maximum number of hashtable entries. This is
316 : * not a hard limit, but the access efficiency will degrade if it is
317 : * exceeded substantially (since it's used to compute directory size and
318 : * the hash table buckets will get overfull).
319 : *
320 : * init_size is the number of hashtable entries to preallocate. For a table
321 : * whose maximum size is certain, this should be equal to max_size; that
322 : * ensures that no run-time out-of-shared-memory failures can occur.
323 : *
324 : * *infoP and hash_flags must specify at least the entry sizes and key
325 : * comparison semantics (see hash_create()). Flag bits and values specific
326 : * to shared-memory hash tables are added here, except that callers may
327 : * choose to specify HASH_PARTITION and/or HASH_FIXED_SIZE.
328 : *
329 : * Note: before Postgres 9.0, this function returned NULL for some failure
330 : * cases. Now, it always throws error instead, so callers need not check
331 : * for NULL.
332 : */
333 : HTAB *
334 19850 : ShmemInitHash(const char *name, /* table string name for shmem index */
335 : int64 init_size, /* initial table size */
336 : int64 max_size, /* max size of the table */
337 : HASHCTL *infoP, /* info about key and bucket size */
338 : int hash_flags) /* info about infoP */
339 : {
340 : bool found;
341 : void *location;
342 :
343 : /*
344 : * Hash tables allocated in shared memory have a fixed directory; it can't
345 : * grow or other backends wouldn't be able to find it. So, make sure we
346 : * make it big enough to start with.
347 : *
348 : * The shared memory allocator must be specified too.
349 : */
350 19850 : infoP->dsize = infoP->max_dsize = hash_select_dirsize(max_size);
351 19850 : infoP->alloc = ShmemAllocNoError;
352 19850 : hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE;
353 :
354 : /* look it up in the shmem index */
355 19850 : location = ShmemInitStruct(name,
356 : hash_get_shared_size(infoP, hash_flags),
357 : &found);
358 :
359 : /*
360 : * if it already exists, attach to it rather than allocate and initialize
361 : * new space
362 : */
363 19850 : if (found)
364 0 : hash_flags |= HASH_ATTACH;
365 :
366 : /* Pass location of hashtable header to hash_create */
367 19850 : infoP->hctl = (HASHHDR *) location;
368 :
369 19850 : return hash_create(name, init_size, infoP, hash_flags);
370 : }
371 :
/*
 * ShmemInitStruct -- Create/attach to a structure in shared memory.
 *
 *		This is called during initialization to find or allocate
 *		a data structure in shared memory.  If no other process
 *		has created the structure, this routine allocates space
 *		for it.  If it exists already, a pointer to the existing
 *		structure is returned.
 *
 *	Returns: pointer to the object.  *foundPtr is set true if the object was
 *		already in the shmem index (hence, already initialized).
 *
 *	Note: before Postgres 9.0, this function returned NULL for some failure
 *	cases.  Now, it always throws error instead, so callers need not check
 *	for NULL.
 */
void *
ShmemInitStruct(const char *name, Size size, bool *foundPtr)
{
	ShmemIndexEnt *result;
	void	   *structPtr;

	LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);

	if (!ShmemIndex)
	{
		PGShmemHeader *shmemseghdr = ShmemSegHdr;

		/* Must be trying to create/attach to ShmemIndex itself */
		Assert(strcmp(name, "ShmemIndex") == 0);

		if (IsUnderPostmaster)
		{
			/* Must be initializing a (non-standalone) backend */
			Assert(shmemseghdr->index != NULL);
			structPtr = shmemseghdr->index;
			*foundPtr = true;
		}
		else
		{
			/*
			 * If the shmem index doesn't exist, we are bootstrapping: we must
			 * be trying to init the shmem index itself.
			 *
			 * Notice that the ShmemIndexLock is released before the shmem
			 * index has been initialized.  This should be OK because no other
			 * process can be accessing shared memory yet.
			 */
			Assert(shmemseghdr->index == NULL);
			structPtr = ShmemAlloc(size);
			shmemseghdr->index = structPtr;
			*foundPtr = false;
		}
		LWLockRelease(ShmemIndexLock);
		return structPtr;
	}

	/* look it up in the shmem index; HASH_ENTER_NULL returns NULL on OOM */
	result = (ShmemIndexEnt *)
		hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr);

	if (!result)
	{
		LWLockRelease(ShmemIndexLock);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("could not create ShmemIndex entry for data structure \"%s\"",
						name)));
	}

	if (*foundPtr)
	{
		/*
		 * Structure is in the shmem index so someone else has allocated it
		 * already.  The size better be the same as the size we are trying to
		 * initialize to, or there is a name conflict (or worse).
		 */
		if (result->size != size)
		{
			LWLockRelease(ShmemIndexLock);
			ereport(ERROR,
					(errmsg("ShmemIndex entry size is wrong for data structure"
							" \"%s\": expected %zu, actual %zu",
							name, size, result->size)));
		}
		structPtr = result->location;
	}
	else
	{
		Size		allocated_size;

		/* It isn't in the table yet. allocate and initialize it */
		structPtr = ShmemAllocRaw(size, &allocated_size);
		if (structPtr == NULL)
		{
			/* out of memory; remove the failed ShmemIndex entry */
			hash_search(ShmemIndex, name, HASH_REMOVE, NULL);
			LWLockRelease(ShmemIndexLock);
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("not enough shared memory for data structure"
							" \"%s\" (%zu bytes requested)",
							name, size)));
		}
		/* record both the requested and the padded (allocated) size */
		result->size = size;
		result->allocated_size = allocated_size;
		result->location = structPtr;
	}

	LWLockRelease(ShmemIndexLock);

	Assert(ShmemAddrIsValid(structPtr));

	/* ShmemAllocRaw guarantees cache-line alignment */
	Assert(structPtr == (void *) CACHELINEALIGN(structPtr));

	return structPtr;
}
489 :
490 :
491 : /*
492 : * Add two Size values, checking for overflow
493 : */
494 : Size
495 1093218 : add_size(Size s1, Size s2)
496 : {
497 : Size result;
498 :
499 1093218 : if (pg_add_size_overflow(s1, s2, &result))
500 0 : ereport(ERROR,
501 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
502 : errmsg("requested shared memory size overflows size_t")));
503 1093218 : return result;
504 : }
505 :
506 : /*
507 : * Multiply two Size values, checking for overflow
508 : */
509 : Size
510 516016 : mul_size(Size s1, Size s2)
511 : {
512 : Size result;
513 :
514 516016 : if (pg_mul_size_overflow(s1, s2, &result))
515 0 : ereport(ERROR,
516 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
517 : errmsg("requested shared memory size overflows size_t")));
518 516016 : return result;
519 : }
520 :
521 : /* SQL SRF showing allocated shared memory */
522 : Datum
523 6 : pg_get_shmem_allocations(PG_FUNCTION_ARGS)
524 : {
525 : #define PG_GET_SHMEM_SIZES_COLS 4
526 6 : ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
527 : HASH_SEQ_STATUS hstat;
528 : ShmemIndexEnt *ent;
529 6 : Size named_allocated = 0;
530 : Datum values[PG_GET_SHMEM_SIZES_COLS];
531 : bool nulls[PG_GET_SHMEM_SIZES_COLS];
532 :
533 6 : InitMaterializedSRF(fcinfo, 0);
534 :
535 6 : LWLockAcquire(ShmemIndexLock, LW_SHARED);
536 :
537 6 : hash_seq_init(&hstat, ShmemIndex);
538 :
539 : /* output all allocated entries */
540 6 : memset(nulls, 0, sizeof(nulls));
541 454 : while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
542 : {
543 448 : values[0] = CStringGetTextDatum(ent->key);
544 448 : values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr);
545 448 : values[2] = Int64GetDatum(ent->size);
546 448 : values[3] = Int64GetDatum(ent->allocated_size);
547 448 : named_allocated += ent->allocated_size;
548 :
549 448 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
550 : values, nulls);
551 : }
552 :
553 : /* output shared memory allocated but not counted via the shmem index */
554 6 : values[0] = CStringGetTextDatum("<anonymous>");
555 6 : nulls[1] = true;
556 6 : values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated);
557 6 : values[3] = values[2];
558 6 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
559 :
560 : /* output as-of-yet unused shared memory */
561 6 : nulls[0] = true;
562 6 : values[1] = Int64GetDatum(ShmemSegHdr->freeoffset);
563 6 : nulls[1] = false;
564 6 : values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset);
565 6 : values[3] = values[2];
566 6 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
567 :
568 6 : LWLockRelease(ShmemIndexLock);
569 :
570 6 : return (Datum) 0;
571 : }
572 :
/*
 * SQL SRF showing NUMA memory nodes for allocated shared memory
 *
 * Compared to pg_get_shmem_allocations(), this function does not return
 * information about shared anonymous allocations and unused shared memory.
 *
 * Emits one row per (shmem index entry, NUMA node) pair, reporting how many
 * bytes of the entry's OS pages currently reside on that node.
 */
Datum
pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
{
#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
	HASH_SEQ_STATUS hstat;
	ShmemIndexEnt *ent;
	Datum		values[PG_GET_SHMEM_NUMA_SIZES_COLS];
	bool		nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
	Size		os_page_size;
	void	  **page_ptrs;
	int		   *pages_status;
	uint64		shm_total_page_count,
				shm_ent_page_count,
				max_nodes;
	Size	   *nodes;

	if (pg_numa_init() == -1)
		elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");

	InitMaterializedSRF(fcinfo, 0);

	/* per-node page counters, indexed 0..max_nodes inclusive */
	max_nodes = pg_numa_get_max_node();
	nodes = palloc(sizeof(Size) * (max_nodes + 1));

	/*
	 * Shared memory allocations can vary in size and may not align with OS
	 * memory page boundaries, while NUMA queries work on pages.
	 *
	 * To correctly map each allocation to NUMA nodes, we need to: 1.
	 * Determine the OS memory page size. 2. Align each allocation's start/end
	 * addresses to page boundaries. 3. Query NUMA node information for all
	 * pages spanning the allocation.
	 */
	os_page_size = pg_get_shmem_pagesize();

	/*
	 * Allocate memory for page pointers and status based on total shared
	 * memory size. This simplified approach allocates enough space for all
	 * pages in shared memory rather than calculating the exact requirements
	 * for each segment.
	 *
	 * Add 1, because we don't know how exactly the segments align to OS
	 * pages, so the allocation might use one more memory page. In practice
	 * this is not very likely, and moreover we have more entries, each of
	 * them using only fraction of the total pages.
	 */
	shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
	page_ptrs = palloc0(sizeof(void *) * shm_total_page_count);
	pages_status = palloc(sizeof(int) * shm_total_page_count);

	if (firstNumaTouch)
		elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");

	LWLockAcquire(ShmemIndexLock, LW_SHARED);

	hash_seq_init(&hstat, ShmemIndex);

	/* output all allocated entries */
	memset(nulls, 0, sizeof(nulls));
	while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
	{
		int			i;
		char	   *startptr,
				   *endptr;
		Size		total_len;

		/*
		 * Calculate the range of OS pages used by this segment. The segment
		 * may start / end half-way through a page, we want to count these
		 * pages too. So we align the start/end pointers down/up, and then
		 * calculate the number of pages from that.
		 */
		startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
		endptr = (char *) TYPEALIGN(os_page_size,
									(char *) ent->location + ent->allocated_size);
		total_len = (endptr - startptr);

		shm_ent_page_count = total_len / os_page_size;

		/*
		 * If we ever get 0xff (-1) back from kernel inquiry, then we probably
		 * have a bug in mapping buffers to OS pages.
		 */
		memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);

		/*
		 * Setup page_ptrs[] with pointers to all OS pages for this segment,
		 * and get the NUMA status using pg_numa_query_pages.
		 *
		 * In order to get reliable results we also need to touch memory
		 * pages, so that inquiry about NUMA memory node doesn't return -2
		 * (ENOENT, which indicates unmapped/unallocated pages).
		 */
		for (i = 0; i < shm_ent_page_count; i++)
		{
			page_ptrs[i] = startptr + (i * os_page_size);

			/* page-fault only on the first call in this backend */
			if (firstNumaTouch)
				pg_numa_touch_mem_if_required(page_ptrs[i]);

			CHECK_FOR_INTERRUPTS();
		}

		if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
			elog(ERROR, "failed NUMA pages inquiry status: %m");

		/* Count number of NUMA nodes used for this shared memory entry */
		memset(nodes, 0, sizeof(Size) * (max_nodes + 1));

		for (i = 0; i < shm_ent_page_count; i++)
		{
			int			s = pages_status[i];

			/* Ensure we are adding only valid index to the array */
			if (s < 0 || s > max_nodes)
			{
				elog(ERROR, "invalid NUMA node id outside of allowed range "
					 "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
			}

			nodes[s]++;
		}

		/*
		 * Add one entry for each NUMA node, including those without allocated
		 * memory for this segment.
		 */
		for (i = 0; i <= max_nodes; i++)
		{
			values[0] = CStringGetTextDatum(ent->key);
			values[1] = Int32GetDatum(i);
			values[2] = Int64GetDatum(nodes[i] * os_page_size);

			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
								 values, nulls);
		}
	}

	LWLockRelease(ShmemIndexLock);
	/* remember that pages have been touched; later calls can skip it */
	firstNumaTouch = false;

	return (Datum) 0;
}
723 :
/*
 * Determine the memory page size used for the shared memory segment.
 *
 * If the shared segment was allocated using huge pages, returns the size of
 * a huge page.  Otherwise returns the size of regular memory page.
 *
 * This should be used only after the server is started (huge_pages_status
 * must already be resolved to ON or OFF).
 */
Size
pg_get_shmem_pagesize(void)
{
	Size		os_page_size;
#ifdef WIN32
	SYSTEM_INFO sysinfo;

	GetSystemInfo(&sysinfo);
	os_page_size = sysinfo.dwPageSize;
#else
	os_page_size = sysconf(_SC_PAGESIZE);
#endif

	Assert(IsUnderPostmaster);
	Assert(huge_pages_status != HUGE_PAGES_UNKNOWN);

	/* huge pages override the regular page size reported by the OS */
	if (huge_pages_status == HUGE_PAGES_ON)
		GetHugePageSize(&os_page_size, NULL);

	return os_page_size;
}
753 :
/*
 * SQL function reporting whether NUMA inquiry is usable here, i.e. whether
 * pg_numa_init() succeeds (it returns -1 on failure or unsupported
 * platforms).
 */
Datum
pg_numa_available(PG_FUNCTION_ARGS)
{
	PG_RETURN_BOOL(pg_numa_init() != -1);
}
|