Line data Source code
1 : /*------------------------------------------------------------------------- 2 : * 3 : * pg_numa.c 4 : * Basic NUMA portability routines 5 : * 6 : * 7 : * Copyright (c) 2025, PostgreSQL Global Development Group 8 : * 9 : * 10 : * IDENTIFICATION 11 : * src/port/pg_numa.c 12 : * 13 : *------------------------------------------------------------------------- 14 : */ 15 : 16 : #include "c.h" 17 : #include <unistd.h> 18 : 19 : #include "miscadmin.h" 20 : #include "port/pg_numa.h" 21 : 22 : /* 23 : * At this point we provide support only for Linux thanks to libnuma, but in 24 : * future support for other platforms e.g. Win32 or FreeBSD might be possible 25 : * too. For Win32 NUMA APIs see 26 : * https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support 27 : */ 28 : #ifdef USE_LIBNUMA 29 : 30 : #include <numa.h> 31 : #include <numaif.h> 32 : 33 : /* 34 : * numa_move_pages() chunk size, has to be <= 16 to work around a kernel bug 35 : * in do_pages_stat() (chunked by DO_PAGES_STAT_CHUNK_NR). By using the same 36 : * chunk size, we make it work even on unfixed kernels. 37 : * 38 : * 64-bit system are not affected by the bug, and so use much larger chunks. 39 : */ 40 : #if SIZEOF_SIZE_T == 4 41 : #define NUMA_QUERY_CHUNK_SIZE 16 42 : #else 43 : #define NUMA_QUERY_CHUNK_SIZE 1024 44 : #endif 45 : 46 : /* libnuma requires initialization as per numa(3) on Linux */ 47 : int 48 : pg_numa_init(void) 49 : { 50 : int r; 51 : 52 : /* 53 : * XXX libnuma versions before 2.0.19 don't handle EPERM by disabling 54 : * NUMA, which then leads to unexpected failures later. This affects 55 : * containers that disable get_mempolicy by a seccomp profile. 56 : */ 57 : if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && (errno == EPERM)) 58 : r = -1; 59 : else 60 : r = numa_available(); 61 : 62 : return r; 63 : } 64 : 65 : /* 66 : * We use move_pages(2) syscall here - instead of get_mempolicy(2) - as the 67 : * first one allows us to batch and query about many memory pages in one single 68 : * giant system call that is way faster. 69 : * 70 : * We call numa_move_pages() for smaller chunks of the whole array. The first 71 : * reason is to work around a kernel bug, but also to allow interrupting the 72 : * query between the calls (for many pointers processing the whole array can 73 : * take a lot of time). 74 : */ 75 : int 76 : pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status) 77 : { 78 : unsigned long next = 0; 79 : int ret = 0; 80 : 81 : /* 82 : * Chunk pointers passed to numa_move_pages to NUMA_QUERY_CHUNK_SIZE 83 : * items, to work around a kernel bug in do_pages_stat(). 84 : */ 85 : while (next < count) 86 : { 87 : unsigned long count_chunk = Min(count - next, 88 : NUMA_QUERY_CHUNK_SIZE); 89 : 90 : CHECK_FOR_INTERRUPTS(); 91 : 92 : /* 93 : * Bail out if any of the chunks errors out (ret<0). We ignore (ret>0) 94 : * which is used to return number of nonmigrated pages, but we're not 95 : * migrating any pages here. 96 : */ 97 : ret = numa_move_pages(pid, count_chunk, &pages[next], NULL, &status[next], 0); 98 : if (ret < 0) 99 : { 100 : /* plain error, return as is */ 101 : return ret; 102 : } 103 : 104 : next += count_chunk; 105 : } 106 : 107 : /* should have consumed the input array exactly */ 108 : Assert(next == count); 109 : 110 : return 0; 111 : } 112 : 113 : int 114 : pg_numa_get_max_node(void) 115 : { 116 : return numa_max_node(); 117 : } 118 : 119 : #else 120 : 121 : /* Empty wrappers */ 122 : int 123 14 : pg_numa_init(void) 124 : { 125 : /* We state that NUMA is not available */ 126 14 : return -1; 127 : } 128 : 129 : int 130 0 : pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status) 131 : { 132 0 : return 0; 133 : } 134 : 135 : int 136 0 : pg_numa_get_max_node(void) 137 : { 138 0 : return 0; 139 : } 140 : 141 : #endif