Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_buffercache_pages.c
4 : * display some contents of the buffer cache
5 : *
6 : * contrib/pg_buffercache/pg_buffercache_pages.c
7 : *-------------------------------------------------------------------------
8 : */
9 : #include "postgres.h"
10 :
11 : #include "access/htup_details.h"
12 : #include "access/relation.h"
13 : #include "catalog/pg_type.h"
14 : #include "funcapi.h"
15 : #include "port/pg_numa.h"
16 : #include "storage/buf_internals.h"
17 : #include "storage/bufmgr.h"
18 : #include "utils/rel.h"
19 :
20 :
/*
 * Output column counts of the SQL-callable functions.
 *
 * pg_buffercache_pages() accepts anywhere between the v1.0 column set
 * (8 columns, no pinning_backends) and the current 9-column set.
 */
#define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8
#define NUM_BUFFERCACHE_PAGES_ELEM 9
#define NUM_BUFFERCACHE_SUMMARY_ELEM 5
#define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
#define NUM_BUFFERCACHE_EVICT_ELEM 2
#define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3
#define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
#define NUM_BUFFERCACHE_MARK_DIRTY_ELEM 2
#define NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM 3
#define NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM 3

#define NUM_BUFFERCACHE_OS_PAGES_ELEM 3
33 :
/* Module magic block carrying the extension's name and version. */
PG_MODULE_MAGIC_EXT(
					.name = "pg_buffercache",
					.version = PG_VERSION
);
38 :
39 : /*
40 : * Record structure holding the to be exposed cache data.
41 : */
typedef struct
{
	uint32		bufferid;		/* buffer number (1-based Buffer id) */
	RelFileNumber relfilenumber;	/* relation file number from buffer tag */
	Oid			reltablespace;	/* tablespace OID from buffer tag */
	Oid			reldatabase;	/* database OID from buffer tag */
	ForkNumber	forknum;		/* relation fork from buffer tag */
	BlockNumber blocknum;		/* block number within the fork */
	bool		isvalid;		/* BM_VALID and BM_TAG_VALID both set? */
	bool		isdirty;		/* BM_DIRTY set when snapshotted? */
	uint16		usagecount;		/* clock-sweep usage count */

	/*
	 * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
	 * being pinned by too many backends and each backend will only pin once
	 * because of bufmgr.c's PrivateRefCount infrastructure.
	 */
	int32		pinning_backends;
} BufferCachePagesRec;
61 :
62 :
63 : /*
64 : * Function context for data persisting over repeated calls.
65 : */
typedef struct
{
	TupleDesc	tupdesc;		/* blessed result-row descriptor */
	BufferCachePagesRec *record;	/* snapshot of all NBuffers headers */
} BufferCachePagesContext;
71 :
72 : /*
73 : * Record structure holding the to be exposed cache data for OS pages. This
74 : * structure is used by pg_buffercache_os_pages(), where NUMA information may
75 : * or may not be included.
76 : */
typedef struct
{
	uint32		bufferid;		/* buffer number (1-based Buffer id) */
	int64		page_num;		/* OS page index within shared buffers */
	int32		numa_node;		/* NUMA node id, or -1 when not queried */
} BufferCacheOsPagesRec;
83 :
84 : /*
85 : * Function context for data persisting over repeated calls.
86 : */
typedef struct
{
	TupleDesc	tupdesc;		/* blessed result-row descriptor */
	bool		include_numa;	/* emit real numa_node values vs. NULL */
	BufferCacheOsPagesRec *record;	/* one entry per (buffer, OS page) pair */
} BufferCacheOsPagesContext;
93 :
94 :
95 : /*
96 : * Function returning data from the shared buffer cache - buffer number,
97 : * relation node/tablespace/database/blocknum and dirty indicator.
98 : */
/* fmgr V1 call-convention glue for the SQL-callable entry points. */
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_summary);
PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
PG_FUNCTION_INFO_V1(pg_buffercache_evict);
PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation);
PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty);
PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_relation);
PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_all);
110 :
111 :
/*
 * Only need to touch memory once per backend process lifetime; this flag
 * records whether the NUMA page-fault pass has already been done.
 */
static bool firstNumaTouch = true;
114 :
115 :
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;
	Datum		result;
	MemoryContext oldcontext;
	BufferCachePagesContext *fctx;	/* User function context. */
	TupleDesc	tupledesc;
	TupleDesc	expected_tupledesc;
	HeapTuple	tuple;

	if (SRF_IS_FIRSTCALL())
	{
		int			i;

		funcctx = SRF_FIRSTCALL_INIT();

		/* Switch context when allocating stuff to be used in later calls */
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

		/* Create a user function context for cross-call persistence */
		fctx = palloc_object(BufferCachePagesContext);

		/*
		 * To smoothly support upgrades from version 1.0 of this extension
		 * transparently handle the (non-)existence of the pinning_backends
		 * column. We unfortunately have to get the result type for that... -
		 * we can't use the result type determined by the function definition
		 * without potentially crashing when somebody uses the old (or even
		 * wrong) function definition though.
		 */
		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
			elog(ERROR, "return type must be a row type");

		if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
			expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
			elog(ERROR, "incorrect number of output arguments");

		/* Construct a tuple descriptor for the result rows. */
		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
						   INT4OID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
						   OIDOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
						   OIDOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
						   OIDOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
						   INT2OID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
						   INT8OID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
						   BOOLOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
						   INT2OID, -1, 0);

		/* 9th column only exists for callers using the >= 1.1 definition */
		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
			TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
							   INT4OID, -1, 0);

		fctx->tupdesc = BlessTupleDesc(tupledesc);

		/*
		 * Allocate NBuffers worth of BufferCachePagesRec records.  Use the
		 * huge-allocation API: with a very large shared_buffers setting this
		 * array can exceed the regular allocation size limit.
		 */
		fctx->record = (BufferCachePagesRec *)
			MemoryContextAllocHuge(CurrentMemoryContext,
								   sizeof(BufferCachePagesRec) * NBuffers);

		/* Set max calls and remember the user function context. */
		funcctx->max_calls = NBuffers;
		funcctx->user_fctx = fctx;

		/* Return to original context when allocating transient memory */
		MemoryContextSwitchTo(oldcontext);

		/*
		 * Scan through all the buffers, saving the relevant fields in the
		 * fctx->record structure.
		 *
		 * We don't hold the partition locks, so we don't get a consistent
		 * snapshot across all buffers, but we do grab the buffer header
		 * locks, so the information of each buffer is self-consistent.
		 */
		for (i = 0; i < NBuffers; i++)
		{
			BufferDesc *bufHdr;
			uint64		buf_state;

			CHECK_FOR_INTERRUPTS();

			bufHdr = GetBufferDescriptor(i);
			/* Lock each buffer header before inspecting. */
			buf_state = LockBufHdr(bufHdr);

			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
			fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
			fctx->record[i].reltablespace = bufHdr->tag.spcOid;
			fctx->record[i].reldatabase = bufHdr->tag.dbOid;
			fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
			fctx->record[i].blocknum = bufHdr->tag.blockNum;
			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);

			if (buf_state & BM_DIRTY)
				fctx->record[i].isdirty = true;
			else
				fctx->record[i].isdirty = false;

			/* Note if the buffer is valid, and has storage created */
			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
				fctx->record[i].isvalid = true;
			else
				fctx->record[i].isvalid = false;

			UnlockBufHdr(bufHdr);
		}
	}

	funcctx = SRF_PERCALL_SETUP();

	/* Get the saved state */
	fctx = funcctx->user_fctx;

	if (funcctx->call_cntr < funcctx->max_calls)
	{
		uint32		i = funcctx->call_cntr;
		Datum		values[NUM_BUFFERCACHE_PAGES_ELEM];
		bool		nulls[NUM_BUFFERCACHE_PAGES_ELEM];

		values[0] = Int32GetDatum(fctx->record[i].bufferid);
		nulls[0] = false;

		/*
		 * Set all fields except the bufferid to null if the buffer is unused
		 * or not valid.
		 */
		if (fctx->record[i].blocknum == InvalidBlockNumber ||
			fctx->record[i].isvalid == false)
		{
			nulls[1] = true;
			nulls[2] = true;
			nulls[3] = true;
			nulls[4] = true;
			nulls[5] = true;
			nulls[6] = true;
			nulls[7] = true;
			/* unused for v1.0 callers, but the array is always long enough */
			nulls[8] = true;
		}
		else
		{
			values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
			nulls[1] = false;
			values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
			nulls[2] = false;
			values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
			nulls[3] = false;
			values[4] = Int16GetDatum(fctx->record[i].forknum);
			nulls[4] = false;
			values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
			nulls[5] = false;
			values[6] = BoolGetDatum(fctx->record[i].isdirty);
			nulls[6] = false;
			values[7] = UInt16GetDatum(fctx->record[i].usagecount);
			nulls[7] = false;
			/* unused for v1.0 callers, but the array is always long enough */
			values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
			nulls[8] = false;
		}

		/* Build and return the tuple. */
		tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
		result = HeapTupleGetDatum(tuple);

		SRF_RETURN_NEXT(funcctx, result);
	}
	else
		SRF_RETURN_DONE(funcctx);
}
295 :
296 : /*
297 : * Inquire about OS pages mappings for shared buffers, with NUMA information,
298 : * optionally.
299 : *
300 : * When "include_numa" is false, this routines ignores everything related
301 : * to NUMA (returned as NULL values), returning mapping information between
302 : * shared buffers and OS pages.
303 : *
304 : * When "include_numa" is true, NUMA is initialized and numa_node values
305 : * are generated. In order to get reliable results we also need to touch
306 : * memory pages, so that the inquiry about NUMA memory node does not return
307 : * -2, indicating unmapped/unallocated pages.
308 : *
309 : * Buffers may be smaller or larger than OS memory pages. For each buffer we
310 : * return one entry for each memory page used by the buffer (if the buffer is
311 : * smaller, it only uses a part of one memory page).
312 : *
313 : * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
314 : * one is always a multiple of the other.
315 : *
316 : */
static Datum
pg_buffercache_os_pages_internal(FunctionCallInfo fcinfo, bool include_numa)
{
	FuncCallContext *funcctx;
	MemoryContext oldcontext;
	BufferCacheOsPagesContext *fctx;	/* User function context. */
	TupleDesc	tupledesc;
	TupleDesc	expected_tupledesc;
	HeapTuple	tuple;
	Datum		result;

	if (SRF_IS_FIRSTCALL())
	{
		int			i,
					idx;
		Size		os_page_size;
		int			pages_per_buffer;
		int		   *os_page_status = NULL;
		uint64		os_page_count = 0;
		int			max_entries;
		char	   *startptr,
				   *endptr;

		/* If NUMA information is requested, initialize NUMA support. */
		if (include_numa && pg_numa_init() == -1)
			elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");

		/*
		 * The database block size and OS memory page size are unlikely to be
		 * the same. The block size is 1-32KB, the memory page size depends on
		 * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
		 * there are also features like THP etc. Moreover, we don't quite know
		 * how the pages and buffers "align" in memory - the buffers may be
		 * shifted in some way, using more memory pages than necessary.
		 *
		 * So we need to be careful about mapping buffers to memory pages. We
		 * calculate the maximum number of pages a buffer might use, so that
		 * we allocate enough space for the entries. And then we count the
		 * actual number of entries as we scan the buffers.
		 *
		 * This information is needed before calling move_pages() for NUMA
		 * node id inquiry.
		 */
		os_page_size = pg_get_shmem_pagesize();

		/*
		 * The pages and block size is expected to be 2^k, so one divides the
		 * other (we don't know in which direction). This does not say
		 * anything about relative alignment of pages/buffers.
		 */
		Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));

		if (include_numa)
		{
			void	  **os_page_ptrs = NULL;

			/*
			 * How many addresses we are going to query? Simply get the page
			 * for the first buffer, and first page after the last buffer, and
			 * count the pages from that.
			 */
			startptr = (char *) TYPEALIGN_DOWN(os_page_size,
											   BufferGetBlock(1));
			endptr = (char *) TYPEALIGN(os_page_size,
										(char *) BufferGetBlock(NBuffers) + BLCKSZ);
			os_page_count = (endptr - startptr) / os_page_size;

			/* Used to determine the NUMA node for all OS pages at once */
			os_page_ptrs = palloc0_array(void *, os_page_count);
			os_page_status = palloc_array(int, os_page_count);

			/*
			 * Fill pointers for all the memory pages. This loop stores and
			 * touches (if needed) addresses into os_page_ptrs[] as input to
			 * one big move_pages(2) inquiry system call, as done in
			 * pg_numa_query_pages().
			 */
			idx = 0;
			for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
			{
				os_page_ptrs[idx++] = ptr;

				/* Only need to touch memory once per backend process lifetime */
				if (firstNumaTouch)
					pg_numa_touch_mem_if_required(ptr);
			}

			Assert(idx == os_page_count);

			elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
				 "os_page_size=%zu", NBuffers, os_page_count, os_page_size);

			/*
			 * If we ever get 0xff back from kernel inquiry, then we probably
			 * have bug in our buffers to OS page mapping code here.
			 */
			memset(os_page_status, 0xff, sizeof(int) * os_page_count);

			/* Query NUMA status for all the pointers */
			if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
				elog(ERROR, "failed NUMA pages inquiry: %m");
		}

		/* Initialize the multi-call context, load entries about buffers */

		funcctx = SRF_FIRSTCALL_INIT();

		/* Switch context when allocating stuff to be used in later calls */
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

		/* Create a user function context for cross-call persistence */
		fctx = palloc_object(BufferCacheOsPagesContext);

		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
			elog(ERROR, "return type must be a row type");

		if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
			elog(ERROR, "incorrect number of output arguments");

		/* Construct a tuple descriptor for the result rows. */
		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
						   INT4OID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
						   INT8OID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 3, "numa_node",
						   INT4OID, -1, 0);

		fctx->tupdesc = BlessTupleDesc(tupledesc);
		fctx->include_numa = include_numa;

		/*
		 * Each buffer needs at least one entry, but it might be offset in
		 * some way, and use one extra entry. So we allocate space for the
		 * maximum number of entries we might need, and then count the exact
		 * number as we're walking buffers. That way we can do it in one pass,
		 * without reallocating memory.
		 */
		pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
		max_entries = NBuffers * pages_per_buffer;

		/* Allocate entries for BufferCacheOsPagesRec records. */
		fctx->record = (BufferCacheOsPagesRec *)
			MemoryContextAllocHuge(CurrentMemoryContext,
								   sizeof(BufferCacheOsPagesRec) * max_entries);

		/* Return to original context when allocating transient memory */
		MemoryContextSwitchTo(oldcontext);

		if (include_numa && firstNumaTouch)
			elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");

		/*
		 * Scan through all the buffers, saving the relevant fields in the
		 * fctx->record structure.
		 *
		 * We don't hold the partition locks, so we don't get a consistent
		 * snapshot across all buffers, but we do grab the buffer header
		 * locks, so the information of each buffer is self-consistent.
		 */
		startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
		idx = 0;
		for (i = 0; i < NBuffers; i++)
		{
			char	   *buffptr = (char *) BufferGetBlock(i + 1);
			BufferDesc *bufHdr;
			uint32		bufferid;

			/*
			 * NOTE(review): page_num is an int32 here while the record field
			 * is int64; with an extremely large shared_buffers the OS page
			 * index could exceed INT32_MAX - confirm this is intended.
			 */
			int32		page_num;
			char	   *startptr_buff,
					   *endptr_buff;

			CHECK_FOR_INTERRUPTS();

			bufHdr = GetBufferDescriptor(i);

			/* Lock each buffer header before inspecting. */
			LockBufHdr(bufHdr);
			bufferid = BufferDescriptorGetBuffer(bufHdr);
			UnlockBufHdr(bufHdr);

			/* start of the first page of this buffer */
			startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);

			/* end of the buffer (no need to align to memory page) */
			endptr_buff = buffptr + BLCKSZ;

			Assert(startptr_buff < endptr_buff);

			/* calculate ID of the first page for this buffer */
			page_num = (startptr_buff - startptr) / os_page_size;

			/* Add an entry for each OS page overlapping with this buffer. */
			for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
			{
				fctx->record[idx].bufferid = bufferid;
				fctx->record[idx].page_num = page_num;
				/* -1 is a "not queried" placeholder, rendered as NULL */
				fctx->record[idx].numa_node = include_numa ? os_page_status[page_num] : -1;

				/* advance to the next entry/page */
				++idx;
				++page_num;
			}
		}

		Assert(idx <= max_entries);

		if (include_numa)
			Assert(idx >= os_page_count);

		/* Set max calls and remember the user function context. */
		funcctx->max_calls = idx;
		funcctx->user_fctx = fctx;

		/* Remember this backend touched the pages (only relevant for NUMA) */
		if (include_numa)
			firstNumaTouch = false;
	}

	funcctx = SRF_PERCALL_SETUP();

	/* Get the saved state */
	fctx = funcctx->user_fctx;

	if (funcctx->call_cntr < funcctx->max_calls)
	{
		uint32		i = funcctx->call_cntr;
		Datum		values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
		bool		nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];

		values[0] = Int32GetDatum(fctx->record[i].bufferid);
		nulls[0] = false;

		values[1] = Int64GetDatum(fctx->record[i].page_num);
		nulls[1] = false;

		if (fctx->include_numa)
		{
			/* status is valid node number */
			if (fctx->record[i].numa_node >= 0)
			{
				values[2] = Int32GetDatum(fctx->record[i].numa_node);
				nulls[2] = false;
			}
			else
			{
				/* some kind of error (e.g. pages moved to swap) */
				values[2] = (Datum) 0;
				nulls[2] = true;
			}
		}
		else
		{
			/* NUMA lookup not requested: numa_node is always NULL */
			values[2] = (Datum) 0;
			nulls[2] = true;
		}

		/* Build and return the tuple. */
		tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
		result = HeapTupleGetDatum(tuple);

		SRF_RETURN_NEXT(funcctx, result);
	}
	else
		SRF_RETURN_DONE(funcctx);
}
582 :
583 : /*
584 : * pg_buffercache_os_pages
585 : *
586 : * Retrieve information about OS pages, with or without NUMA information.
587 : */
588 : Datum
589 131076 : pg_buffercache_os_pages(PG_FUNCTION_ARGS)
590 : {
591 : bool include_numa;
592 :
593 : /* Get the boolean parameter that controls the NUMA behavior. */
594 131076 : include_numa = PG_GETARG_BOOL(0);
595 :
596 131076 : return pg_buffercache_os_pages_internal(fcinfo, include_numa);
597 : }
598 :
599 : /* Backward-compatible wrapper for v1.6. */
Datum
pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
{
	/*
	 * Call internal function with include_numa=true; equivalent to
	 * pg_buffercache_os_pages(true) for callers of the v1.6 API.
	 */
	return pg_buffercache_os_pages_internal(fcinfo, true);
}
606 :
607 : Datum
608 4 : pg_buffercache_summary(PG_FUNCTION_ARGS)
609 : {
610 : Datum result;
611 : TupleDesc tupledesc;
612 : HeapTuple tuple;
613 : Datum values[NUM_BUFFERCACHE_SUMMARY_ELEM];
614 : bool nulls[NUM_BUFFERCACHE_SUMMARY_ELEM];
615 :
616 4 : int32 buffers_used = 0;
617 4 : int32 buffers_unused = 0;
618 4 : int32 buffers_dirty = 0;
619 4 : int32 buffers_pinned = 0;
620 4 : int64 usagecount_total = 0;
621 :
622 4 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
623 0 : elog(ERROR, "return type must be a row type");
624 :
625 65540 : for (int i = 0; i < NBuffers; i++)
626 : {
627 : BufferDesc *bufHdr;
628 : uint64 buf_state;
629 :
630 65536 : CHECK_FOR_INTERRUPTS();
631 :
632 : /*
633 : * This function summarizes the state of all headers. Locking the
634 : * buffer headers wouldn't provide an improved result as the state of
635 : * the buffer can still change after we release the lock and it'd
636 : * noticeably increase the cost of the function.
637 : */
638 65536 : bufHdr = GetBufferDescriptor(i);
639 65536 : buf_state = pg_atomic_read_u64(&bufHdr->state);
640 :
641 65536 : if (buf_state & BM_VALID)
642 : {
643 7992 : buffers_used++;
644 7992 : usagecount_total += BUF_STATE_GET_USAGECOUNT(buf_state);
645 :
646 7992 : if (buf_state & BM_DIRTY)
647 3800 : buffers_dirty++;
648 : }
649 : else
650 57544 : buffers_unused++;
651 :
652 65536 : if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
653 0 : buffers_pinned++;
654 : }
655 :
656 4 : memset(nulls, 0, sizeof(nulls));
657 4 : values[0] = Int32GetDatum(buffers_used);
658 4 : values[1] = Int32GetDatum(buffers_unused);
659 4 : values[2] = Int32GetDatum(buffers_dirty);
660 4 : values[3] = Int32GetDatum(buffers_pinned);
661 :
662 4 : if (buffers_used != 0)
663 4 : values[4] = Float8GetDatum((double) usagecount_total / buffers_used);
664 : else
665 0 : nulls[4] = true;
666 :
667 : /* Build and return the tuple. */
668 4 : tuple = heap_form_tuple(tupledesc, values, nulls);
669 4 : result = HeapTupleGetDatum(tuple);
670 :
671 4 : PG_RETURN_DATUM(result);
672 : }
673 :
674 : Datum
675 4 : pg_buffercache_usage_counts(PG_FUNCTION_ARGS)
676 : {
677 4 : ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
678 4 : int usage_counts[BM_MAX_USAGE_COUNT + 1] = {0};
679 4 : int dirty[BM_MAX_USAGE_COUNT + 1] = {0};
680 4 : int pinned[BM_MAX_USAGE_COUNT + 1] = {0};
681 : Datum values[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM];
682 4 : bool nulls[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM] = {0};
683 :
684 4 : InitMaterializedSRF(fcinfo, 0);
685 :
686 65540 : for (int i = 0; i < NBuffers; i++)
687 : {
688 65536 : BufferDesc *bufHdr = GetBufferDescriptor(i);
689 65536 : uint64 buf_state = pg_atomic_read_u64(&bufHdr->state);
690 : int usage_count;
691 :
692 65536 : CHECK_FOR_INTERRUPTS();
693 :
694 65536 : usage_count = BUF_STATE_GET_USAGECOUNT(buf_state);
695 65536 : usage_counts[usage_count]++;
696 :
697 65536 : if (buf_state & BM_DIRTY)
698 3800 : dirty[usage_count]++;
699 :
700 65536 : if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
701 0 : pinned[usage_count]++;
702 : }
703 :
704 28 : for (int i = 0; i < BM_MAX_USAGE_COUNT + 1; i++)
705 : {
706 24 : values[0] = Int32GetDatum(i);
707 24 : values[1] = Int32GetDatum(usage_counts[i]);
708 24 : values[2] = Int32GetDatum(dirty[i]);
709 24 : values[3] = Int32GetDatum(pinned[i]);
710 :
711 24 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
712 : }
713 :
714 4 : return (Datum) 0;
715 : }
716 :
717 : /*
718 : * Helper function to check if the user has superuser privileges.
719 : */
720 : static void
721 40 : pg_buffercache_superuser_check(char *func_name)
722 : {
723 40 : if (!superuser())
724 12 : ereport(ERROR,
725 : (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
726 : errmsg("must be superuser to use %s()",
727 : func_name)));
728 28 : }
729 :
730 : /*
731 : * Try to evict a shared buffer.
732 : */
733 : Datum
734 10 : pg_buffercache_evict(PG_FUNCTION_ARGS)
735 : {
736 : Datum result;
737 : TupleDesc tupledesc;
738 : HeapTuple tuple;
739 : Datum values[NUM_BUFFERCACHE_EVICT_ELEM];
740 10 : bool nulls[NUM_BUFFERCACHE_EVICT_ELEM] = {0};
741 :
742 10 : Buffer buf = PG_GETARG_INT32(0);
743 : bool buffer_flushed;
744 :
745 10 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
746 0 : elog(ERROR, "return type must be a row type");
747 :
748 10 : pg_buffercache_superuser_check("pg_buffercache_evict");
749 :
750 8 : if (buf < 1 || buf > NBuffers)
751 6 : elog(ERROR, "bad buffer ID: %d", buf);
752 :
753 2 : values[0] = BoolGetDatum(EvictUnpinnedBuffer(buf, &buffer_flushed));
754 2 : values[1] = BoolGetDatum(buffer_flushed);
755 :
756 2 : tuple = heap_form_tuple(tupledesc, values, nulls);
757 2 : result = HeapTupleGetDatum(tuple);
758 :
759 2 : PG_RETURN_DATUM(result);
760 : }
761 :
762 : /*
763 : * Try to evict specified relation.
764 : */
Datum
pg_buffercache_evict_relation(PG_FUNCTION_ARGS)
{
	Datum		result;
	TupleDesc	tupledesc;
	HeapTuple	tuple;
	Datum		values[NUM_BUFFERCACHE_EVICT_RELATION_ELEM];
	bool		nulls[NUM_BUFFERCACHE_EVICT_RELATION_ELEM] = {0};

	Oid			relOid;
	Relation	rel;

	int32		buffers_evicted = 0;
	int32		buffers_flushed = 0;
	int32		buffers_skipped = 0;

	if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
		elog(ERROR, "return type must be a row type");

	pg_buffercache_superuser_check("pg_buffercache_evict_relation");

	relOid = PG_GETARG_OID(0);

	/* Keep the relation open (AccessShareLock) while evicting its buffers. */
	rel = relation_open(relOid, AccessShareLock);

	/* Temporary relations use backend-local buffers, which we can't touch. */
	if (RelationUsesLocalBuffers(rel))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
						"pg_buffercache_evict_relation")));

	EvictRelUnpinnedBuffers(rel, &buffers_evicted, &buffers_flushed,
							&buffers_skipped);

	relation_close(rel, AccessShareLock);

	/* (evicted, flushed, skipped) counts over the relation's buffers */
	values[0] = Int32GetDatum(buffers_evicted);
	values[1] = Int32GetDatum(buffers_flushed);
	values[2] = Int32GetDatum(buffers_skipped);

	tuple = heap_form_tuple(tupledesc, values, nulls);
	result = HeapTupleGetDatum(tuple);

	PG_RETURN_DATUM(result);
}
810 :
811 :
812 : /*
813 : * Try to evict all shared buffers.
814 : */
815 : Datum
816 4 : pg_buffercache_evict_all(PG_FUNCTION_ARGS)
817 : {
818 : Datum result;
819 : TupleDesc tupledesc;
820 : HeapTuple tuple;
821 : Datum values[NUM_BUFFERCACHE_EVICT_ALL_ELEM];
822 4 : bool nulls[NUM_BUFFERCACHE_EVICT_ALL_ELEM] = {0};
823 :
824 4 : int32 buffers_evicted = 0;
825 4 : int32 buffers_flushed = 0;
826 4 : int32 buffers_skipped = 0;
827 :
828 4 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
829 0 : elog(ERROR, "return type must be a row type");
830 :
831 4 : pg_buffercache_superuser_check("pg_buffercache_evict_all");
832 :
833 2 : EvictAllUnpinnedBuffers(&buffers_evicted, &buffers_flushed,
834 : &buffers_skipped);
835 :
836 2 : values[0] = Int32GetDatum(buffers_evicted);
837 2 : values[1] = Int32GetDatum(buffers_flushed);
838 2 : values[2] = Int32GetDatum(buffers_skipped);
839 :
840 2 : tuple = heap_form_tuple(tupledesc, values, nulls);
841 2 : result = HeapTupleGetDatum(tuple);
842 :
843 2 : PG_RETURN_DATUM(result);
844 : }
845 :
846 : /*
847 : * Try to mark a shared buffer as dirty.
848 : */
849 : Datum
850 10 : pg_buffercache_mark_dirty(PG_FUNCTION_ARGS)
851 : {
852 :
853 : Datum result;
854 : TupleDesc tupledesc;
855 : HeapTuple tuple;
856 : Datum values[NUM_BUFFERCACHE_MARK_DIRTY_ELEM];
857 10 : bool nulls[NUM_BUFFERCACHE_MARK_DIRTY_ELEM] = {0};
858 :
859 10 : Buffer buf = PG_GETARG_INT32(0);
860 : bool buffer_already_dirty;
861 :
862 10 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
863 0 : elog(ERROR, "return type must be a row type");
864 :
865 10 : pg_buffercache_superuser_check("pg_buffercache_mark_dirty");
866 :
867 8 : if (buf < 1 || buf > NBuffers)
868 6 : elog(ERROR, "bad buffer ID: %d", buf);
869 :
870 2 : values[0] = BoolGetDatum(MarkDirtyUnpinnedBuffer(buf, &buffer_already_dirty));
871 2 : values[1] = BoolGetDatum(buffer_already_dirty);
872 :
873 2 : tuple = heap_form_tuple(tupledesc, values, nulls);
874 2 : result = HeapTupleGetDatum(tuple);
875 :
876 2 : PG_RETURN_DATUM(result);
877 : }
878 :
879 : /*
880 : * Try to mark all the shared buffers of a relation as dirty.
881 : */
Datum
pg_buffercache_mark_dirty_relation(PG_FUNCTION_ARGS)
{
	Datum		result;
	TupleDesc	tupledesc;
	HeapTuple	tuple;
	Datum		values[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM];
	bool		nulls[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM] = {0};

	Oid			relOid;
	Relation	rel;

	int32		buffers_already_dirty = 0;
	int32		buffers_dirtied = 0;
	int32		buffers_skipped = 0;

	if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
		elog(ERROR, "return type must be a row type");

	pg_buffercache_superuser_check("pg_buffercache_mark_dirty_relation");

	relOid = PG_GETARG_OID(0);

	/* Keep the relation open (AccessShareLock) while dirtying its buffers. */
	rel = relation_open(relOid, AccessShareLock);

	/* Temporary relations use backend-local buffers, which we can't touch. */
	if (RelationUsesLocalBuffers(rel))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
						"pg_buffercache_mark_dirty_relation")));

	MarkDirtyRelUnpinnedBuffers(rel, &buffers_dirtied, &buffers_already_dirty,
								&buffers_skipped);

	relation_close(rel, AccessShareLock);

	/* (dirtied, already dirty, skipped) counts over the relation's buffers */
	values[0] = Int32GetDatum(buffers_dirtied);
	values[1] = Int32GetDatum(buffers_already_dirty);
	values[2] = Int32GetDatum(buffers_skipped);

	tuple = heap_form_tuple(tupledesc, values, nulls);
	result = HeapTupleGetDatum(tuple);

	PG_RETURN_DATUM(result);
}
927 :
928 : /*
929 : * Try to mark all the shared buffers as dirty.
930 : */
931 : Datum
932 4 : pg_buffercache_mark_dirty_all(PG_FUNCTION_ARGS)
933 : {
934 : Datum result;
935 : TupleDesc tupledesc;
936 : HeapTuple tuple;
937 : Datum values[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM];
938 4 : bool nulls[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM] = {0};
939 :
940 4 : int32 buffers_already_dirty = 0;
941 4 : int32 buffers_dirtied = 0;
942 4 : int32 buffers_skipped = 0;
943 :
944 4 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
945 0 : elog(ERROR, "return type must be a row type");
946 :
947 4 : pg_buffercache_superuser_check("pg_buffercache_mark_dirty_all");
948 :
949 2 : MarkDirtyAllUnpinnedBuffers(&buffers_dirtied, &buffers_already_dirty,
950 : &buffers_skipped);
951 :
952 2 : values[0] = Int32GetDatum(buffers_dirtied);
953 2 : values[1] = Int32GetDatum(buffers_already_dirty);
954 2 : values[2] = Int32GetDatum(buffers_skipped);
955 :
956 2 : tuple = heap_form_tuple(tupledesc, values, nulls);
957 2 : result = HeapTupleGetDatum(tuple);
958 :
959 2 : PG_RETURN_DATUM(result);
960 : }
|