Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_buffercache_pages.c
4 : * display some contents of the buffer cache
5 : *
6 : * contrib/pg_buffercache/pg_buffercache_pages.c
7 : *-------------------------------------------------------------------------
8 : */
9 : #include "postgres.h"
10 :
11 : #include "access/htup_details.h"
12 : #include "access/relation.h"
13 : #include "catalog/pg_type.h"
14 : #include "funcapi.h"
15 : #include "port/pg_numa.h"
16 : #include "storage/buf_internals.h"
17 : #include "storage/bufmgr.h"
18 : #include "utils/rel.h"
19 : #include "utils/tuplestore.h"
20 :
21 :
22 : #define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8
23 : #define NUM_BUFFERCACHE_PAGES_ELEM 9
24 : #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
25 : #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
26 : #define NUM_BUFFERCACHE_EVICT_ELEM 2
27 : #define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3
28 : #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
29 : #define NUM_BUFFERCACHE_MARK_DIRTY_ELEM 2
30 : #define NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM 3
31 : #define NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM 3
32 :
33 : #define NUM_BUFFERCACHE_OS_PAGES_ELEM 3
34 :
35 1 : PG_MODULE_MAGIC_EXT(
36 : .name = "pg_buffercache",
37 : .version = PG_VERSION
38 : );
39 :
40 : /*
41 : * Record structure holding the to be exposed cache data.
42 : */
typedef struct
{
	uint32		bufferid;		/* buffer id (from BufferDescriptorGetBuffer) */
	RelFileNumber relfilenumber;	/* relation file number from the buffer tag */
	Oid			reltablespace;	/* tablespace OID from the buffer tag */
	Oid			reldatabase;	/* database OID from the buffer tag */
	ForkNumber	forknum;		/* relation fork from the buffer tag */
	BlockNumber blocknum;		/* block number from the buffer tag */
	bool		isvalid;		/* both BM_VALID and BM_TAG_VALID set? */
	bool		isdirty;		/* BM_DIRTY set? */
	uint16		usagecount;		/* usage count from the buffer state */

	/*
	 * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
	 * being pinned by too many backends and each backend will only pin once
	 * because of bufmgr.c's PrivateRefCount infrastructure.
	 */
	int32		pinning_backends;
} BufferCachePagesRec;
62 :
63 :
64 : /*
65 : * Function context for data persisting over repeated calls.
66 : */
typedef struct
{
	TupleDesc	tupdesc;		/* blessed descriptor for the result rows */
	BufferCachePagesRec *record;	/* snapshot array, one entry per buffer
									 * (NBuffers entries) */
} BufferCachePagesContext;
72 :
73 : /*
74 : * Record structure holding the to be exposed cache data for OS pages. This
75 : * structure is used by pg_buffercache_os_pages(), where NUMA information may
76 : * or may not be included.
77 : */
typedef struct
{
	uint32		bufferid;		/* buffer this OS page (partly) backs */
	int64		page_num;		/* OS page number, counted from the first
								 * page backing shared buffers */
	int32		numa_node;		/* NUMA node id; -1 when NUMA info was not
								 * requested */
} BufferCacheOsPagesRec;
84 :
85 : /*
86 : * Function context for data persisting over repeated calls.
87 : */
typedef struct
{
	TupleDesc	tupdesc;		/* blessed descriptor for the result rows */
	bool		include_numa;	/* emit numa_node values, or NULLs? */
	BufferCacheOsPagesRec *record;	/* one entry per (buffer, OS page) overlap */
} BufferCacheOsPagesContext;
94 :
95 :
96 : /*
97 : * Function returning data from the shared buffer cache - buffer number,
98 : * relation node/tablespace/database/blocknum and dirty indicator.
99 : */
100 2 : PG_FUNCTION_INFO_V1(pg_buffercache_pages);
101 2 : PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
102 1 : PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
103 2 : PG_FUNCTION_INFO_V1(pg_buffercache_summary);
104 2 : PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
105 3 : PG_FUNCTION_INFO_V1(pg_buffercache_evict);
106 2 : PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation);
107 2 : PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
108 2 : PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty);
109 2 : PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_relation);
110 2 : PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_all);
111 :
112 :
113 : /* Only need to touch memory once per backend process lifetime */
114 : static bool firstNumaTouch = true;
115 :
116 :
/*
 * pg_buffercache_pages
 *
 * Set-returning function: one result row per shared buffer.  On the first
 * call every buffer header is copied (under its header spinlock) into a
 * cross-call array; subsequent calls emit one row per saved entry.  Rows
 * for unused/invalid buffers have all columns except bufferid set to NULL.
 */
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;
	Datum		result;
	MemoryContext oldcontext;
	BufferCachePagesContext *fctx;	/* User function context. */
	TupleDesc	tupledesc;
	TupleDesc	expected_tupledesc;
	HeapTuple	tuple;

	if (SRF_IS_FIRSTCALL())
	{
		int			i;

		funcctx = SRF_FIRSTCALL_INIT();

		/* Switch context when allocating stuff to be used in later calls */
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

		/* Create a user function context for cross-call persistence */
		fctx = palloc_object(BufferCachePagesContext);

		/*
		 * To smoothly support upgrades from version 1.0 of this extension
		 * transparently handle the (non-)existence of the pinning_backends
		 * column. We unfortunately have to get the result type for that... -
		 * we can't use the result type determined by the function definition
		 * without potentially crashing when somebody uses the old (or even
		 * wrong) function definition though.
		 */
		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
			elog(ERROR, "return type must be a row type");

		if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
			expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
			elog(ERROR, "incorrect number of output arguments");

		/* Construct a tuple descriptor for the result rows. */
		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
						   INT4OID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
						   OIDOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
						   OIDOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
						   OIDOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
						   INT2OID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
						   INT8OID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
						   BOOLOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
						   INT2OID, -1, 0);

		/* The ninth column only exists for post-1.0 callers. */
		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
			TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
							   INT4OID, -1, 0);

		TupleDescFinalize(tupledesc);
		fctx->tupdesc = BlessTupleDesc(tupledesc);

		/* Allocate NBuffers worth of BufferCachePagesRec records. */
		fctx->record = (BufferCachePagesRec *)
			MemoryContextAllocHuge(CurrentMemoryContext,
								   sizeof(BufferCachePagesRec) * NBuffers);

		/* Set max calls and remember the user function context. */
		funcctx->max_calls = NBuffers;
		funcctx->user_fctx = fctx;

		/* Return to original context when allocating transient memory */
		MemoryContextSwitchTo(oldcontext);

		/*
		 * Scan through all the buffers, saving the relevant fields in the
		 * fctx->record structure.
		 *
		 * We don't hold the partition locks, so we don't get a consistent
		 * snapshot across all buffers, but we do grab the buffer header
		 * locks, so the information of each buffer is self-consistent.
		 */
		for (i = 0; i < NBuffers; i++)
		{
			BufferDesc *bufHdr;
			uint64		buf_state;

			CHECK_FOR_INTERRUPTS();

			bufHdr = GetBufferDescriptor(i);
			/* Lock each buffer header before inspecting. */
			buf_state = LockBufHdr(bufHdr);

			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
			fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
			fctx->record[i].reltablespace = bufHdr->tag.spcOid;
			fctx->record[i].reldatabase = bufHdr->tag.dbOid;
			fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
			fctx->record[i].blocknum = bufHdr->tag.blockNum;
			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);

			if (buf_state & BM_DIRTY)
				fctx->record[i].isdirty = true;
			else
				fctx->record[i].isdirty = false;

			/* Note if the buffer is valid, and has storage created */
			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
				fctx->record[i].isvalid = true;
			else
				fctx->record[i].isvalid = false;

			UnlockBufHdr(bufHdr);
		}
	}

	funcctx = SRF_PERCALL_SETUP();

	/* Get the saved state */
	fctx = funcctx->user_fctx;

	if (funcctx->call_cntr < funcctx->max_calls)
	{
		uint32		i = funcctx->call_cntr;
		Datum		values[NUM_BUFFERCACHE_PAGES_ELEM];
		bool		nulls[NUM_BUFFERCACHE_PAGES_ELEM];

		values[0] = Int32GetDatum(fctx->record[i].bufferid);
		nulls[0] = false;

		/*
		 * Set all fields except the bufferid to null if the buffer is unused
		 * or not valid.
		 */
		if (fctx->record[i].blocknum == InvalidBlockNumber ||
			fctx->record[i].isvalid == false)
		{
			nulls[1] = true;
			nulls[2] = true;
			nulls[3] = true;
			nulls[4] = true;
			nulls[5] = true;
			nulls[6] = true;
			nulls[7] = true;
			/* unused for v1.0 callers, but the array is always long enough */
			nulls[8] = true;
		}
		else
		{
			values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
			nulls[1] = false;
			values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
			nulls[2] = false;
			values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
			nulls[3] = false;
			values[4] = Int16GetDatum(fctx->record[i].forknum);
			nulls[4] = false;
			values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
			nulls[5] = false;
			values[6] = BoolGetDatum(fctx->record[i].isdirty);
			nulls[6] = false;
			values[7] = UInt16GetDatum(fctx->record[i].usagecount);
			nulls[7] = false;
			/* unused for v1.0 callers, but the array is always long enough */
			values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
			nulls[8] = false;
		}

		/* Build and return the tuple. */
		tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
		result = HeapTupleGetDatum(tuple);

		SRF_RETURN_NEXT(funcctx, result);
	}
	else
		SRF_RETURN_DONE(funcctx);
}
297 :
298 : /*
299 : * Inquire about OS pages mappings for shared buffers, with NUMA information,
300 : * optionally.
301 : *
302 : * When "include_numa" is false, this routines ignores everything related
303 : * to NUMA (returned as NULL values), returning mapping information between
304 : * shared buffers and OS pages.
305 : *
306 : * When "include_numa" is true, NUMA is initialized and numa_node values
307 : * are generated. In order to get reliable results we also need to touch
308 : * memory pages, so that the inquiry about NUMA memory node does not return
309 : * -2, indicating unmapped/unallocated pages.
310 : *
311 : * Buffers may be smaller or larger than OS memory pages. For each buffer we
312 : * return one entry for each memory page used by the buffer (if the buffer is
313 : * smaller, it only uses a part of one memory page).
314 : *
315 : * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
316 : * one is always a multiple of the other.
317 : *
318 : */
319 : static Datum
320 65538 : pg_buffercache_os_pages_internal(FunctionCallInfo fcinfo, bool include_numa)
321 : {
322 : FuncCallContext *funcctx;
323 : MemoryContext oldcontext;
324 : BufferCacheOsPagesContext *fctx; /* User function context. */
325 : TupleDesc tupledesc;
326 : TupleDesc expected_tupledesc;
327 : HeapTuple tuple;
328 : Datum result;
329 :
330 65538 : if (SRF_IS_FIRSTCALL())
331 : {
332 : int i,
333 : idx;
334 : Size os_page_size;
335 : int pages_per_buffer;
336 2 : int *os_page_status = NULL;
337 2 : uint64 os_page_count = 0;
338 : int max_entries;
339 : char *startptr,
340 : *endptr;
341 :
342 : /* If NUMA information is requested, initialize NUMA support. */
343 2 : if (include_numa && pg_numa_init() == -1)
344 0 : elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
345 :
346 : /*
347 : * The database block size and OS memory page size are unlikely to be
348 : * the same. The block size is 1-32KB, the memory page size depends on
349 : * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
350 : * there are also features like THP etc. Moreover, we don't quite know
351 : * how the pages and buffers "align" in memory - the buffers may be
352 : * shifted in some way, using more memory pages than necessary.
353 : *
354 : * So we need to be careful about mapping buffers to memory pages. We
355 : * calculate the maximum number of pages a buffer might use, so that
356 : * we allocate enough space for the entries. And then we count the
357 : * actual number of entries as we scan the buffers.
358 : *
359 : * This information is needed before calling move_pages() for NUMA
360 : * node id inquiry.
361 : */
362 2 : os_page_size = pg_get_shmem_pagesize();
363 :
364 : /*
365 : * The pages and block size is expected to be 2^k, so one divides the
366 : * other (we don't know in which direction). This does not say
367 : * anything about relative alignment of pages/buffers.
368 : */
369 : Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));
370 :
371 2 : if (include_numa)
372 : {
373 0 : void **os_page_ptrs = NULL;
374 :
375 : /*
376 : * How many addresses we are going to query? Simply get the page
377 : * for the first buffer, and first page after the last buffer, and
378 : * count the pages from that.
379 : */
380 0 : startptr = (char *) TYPEALIGN_DOWN(os_page_size,
381 : BufferGetBlock(1));
382 0 : endptr = (char *) TYPEALIGN(os_page_size,
383 : (char *) BufferGetBlock(NBuffers) + BLCKSZ);
384 0 : os_page_count = (endptr - startptr) / os_page_size;
385 :
386 : /* Used to determine the NUMA node for all OS pages at once */
387 0 : os_page_ptrs = palloc0_array(void *, os_page_count);
388 0 : os_page_status = palloc_array(int, os_page_count);
389 :
390 : /*
391 : * Fill pointers for all the memory pages. This loop stores and
392 : * touches (if needed) addresses into os_page_ptrs[] as input to
393 : * one big move_pages(2) inquiry system call, as done in
394 : * pg_numa_query_pages().
395 : */
396 0 : idx = 0;
397 0 : for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
398 : {
399 0 : os_page_ptrs[idx++] = ptr;
400 :
401 : /* Only need to touch memory once per backend process lifetime */
402 0 : if (firstNumaTouch)
403 : pg_numa_touch_mem_if_required(ptr);
404 : }
405 :
406 : Assert(idx == os_page_count);
407 :
408 0 : elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
409 : "os_page_size=%zu", NBuffers, os_page_count, os_page_size);
410 :
411 : /*
412 : * If we ever get 0xff back from kernel inquiry, then we probably
413 : * have bug in our buffers to OS page mapping code here.
414 : */
415 0 : memset(os_page_status, 0xff, sizeof(int) * os_page_count);
416 :
417 : /* Query NUMA status for all the pointers */
418 0 : if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
419 0 : elog(ERROR, "failed NUMA pages inquiry: %m");
420 : }
421 :
422 : /* Initialize the multi-call context, load entries about buffers */
423 :
424 2 : funcctx = SRF_FIRSTCALL_INIT();
425 :
426 : /* Switch context when allocating stuff to be used in later calls */
427 2 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
428 :
429 : /* Create a user function context for cross-call persistence */
430 2 : fctx = palloc_object(BufferCacheOsPagesContext);
431 :
432 2 : if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
433 0 : elog(ERROR, "return type must be a row type");
434 :
435 2 : if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
436 0 : elog(ERROR, "incorrect number of output arguments");
437 :
438 : /* Construct a tuple descriptor for the result rows. */
439 2 : tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
440 2 : TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
441 : INT4OID, -1, 0);
442 2 : TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
443 : INT8OID, -1, 0);
444 2 : TupleDescInitEntry(tupledesc, (AttrNumber) 3, "numa_node",
445 : INT4OID, -1, 0);
446 :
447 2 : TupleDescFinalize(tupledesc);
448 2 : fctx->tupdesc = BlessTupleDesc(tupledesc);
449 2 : fctx->include_numa = include_numa;
450 :
451 : /*
452 : * Each buffer needs at least one entry, but it might be offset in
453 : * some way, and use one extra entry. So we allocate space for the
454 : * maximum number of entries we might need, and then count the exact
455 : * number as we're walking buffers. That way we can do it in one pass,
456 : * without reallocating memory.
457 : */
458 2 : pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
459 2 : max_entries = NBuffers * pages_per_buffer;
460 :
461 : /* Allocate entries for BufferCacheOsPagesRec records. */
462 2 : fctx->record = (BufferCacheOsPagesRec *)
463 2 : MemoryContextAllocHuge(CurrentMemoryContext,
464 : sizeof(BufferCacheOsPagesRec) * max_entries);
465 :
466 : /* Return to original context when allocating transient memory */
467 2 : MemoryContextSwitchTo(oldcontext);
468 :
469 2 : if (include_numa && firstNumaTouch)
470 0 : elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
471 :
472 : /*
473 : * Scan through all the buffers, saving the relevant fields in the
474 : * fctx->record structure.
475 : *
476 : * We don't hold the partition locks, so we don't get a consistent
477 : * snapshot across all buffers, but we do grab the buffer header
478 : * locks, so the information of each buffer is self-consistent.
479 : */
480 2 : startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
481 2 : idx = 0;
482 32770 : for (i = 0; i < NBuffers; i++)
483 : {
484 32768 : char *buffptr = (char *) BufferGetBlock(i + 1);
485 : BufferDesc *bufHdr;
486 : uint32 bufferid;
487 : int32 page_num;
488 : char *startptr_buff,
489 : *endptr_buff;
490 :
491 32768 : CHECK_FOR_INTERRUPTS();
492 :
493 32768 : bufHdr = GetBufferDescriptor(i);
494 :
495 : /* Lock each buffer header before inspecting. */
496 32768 : LockBufHdr(bufHdr);
497 32768 : bufferid = BufferDescriptorGetBuffer(bufHdr);
498 32768 : UnlockBufHdr(bufHdr);
499 :
500 : /* start of the first page of this buffer */
501 32768 : startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
502 :
503 : /* end of the buffer (no need to align to memory page) */
504 32768 : endptr_buff = buffptr + BLCKSZ;
505 :
506 : Assert(startptr_buff < endptr_buff);
507 :
508 : /* calculate ID of the first page for this buffer */
509 32768 : page_num = (startptr_buff - startptr) / os_page_size;
510 :
511 : /* Add an entry for each OS page overlapping with this buffer. */
512 98304 : for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
513 : {
514 65536 : fctx->record[idx].bufferid = bufferid;
515 65536 : fctx->record[idx].page_num = page_num;
516 65536 : fctx->record[idx].numa_node = include_numa ? os_page_status[page_num] : -1;
517 :
518 : /* advance to the next entry/page */
519 65536 : ++idx;
520 65536 : ++page_num;
521 : }
522 : }
523 :
524 : Assert(idx <= max_entries);
525 :
526 : if (include_numa)
527 : Assert(idx >= os_page_count);
528 :
529 : /* Set max calls and remember the user function context. */
530 2 : funcctx->max_calls = idx;
531 2 : funcctx->user_fctx = fctx;
532 :
533 : /* Remember this backend touched the pages (only relevant for NUMA) */
534 2 : if (include_numa)
535 0 : firstNumaTouch = false;
536 : }
537 :
538 65538 : funcctx = SRF_PERCALL_SETUP();
539 :
540 : /* Get the saved state */
541 65538 : fctx = funcctx->user_fctx;
542 :
543 65538 : if (funcctx->call_cntr < funcctx->max_calls)
544 : {
545 65536 : uint32 i = funcctx->call_cntr;
546 : Datum values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
547 : bool nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
548 :
549 65536 : values[0] = Int32GetDatum(fctx->record[i].bufferid);
550 65536 : nulls[0] = false;
551 :
552 65536 : values[1] = Int64GetDatum(fctx->record[i].page_num);
553 65536 : nulls[1] = false;
554 :
555 65536 : if (fctx->include_numa)
556 : {
557 : /* status is valid node number */
558 0 : if (fctx->record[i].numa_node >= 0)
559 : {
560 0 : values[2] = Int32GetDatum(fctx->record[i].numa_node);
561 0 : nulls[2] = false;
562 : }
563 : else
564 : {
565 : /* some kind of error (e.g. pages moved to swap) */
566 0 : values[2] = (Datum) 0;
567 0 : nulls[2] = true;
568 : }
569 : }
570 : else
571 : {
572 65536 : values[2] = (Datum) 0;
573 65536 : nulls[2] = true;
574 : }
575 :
576 : /* Build and return the tuple. */
577 65536 : tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
578 65536 : result = HeapTupleGetDatum(tuple);
579 :
580 65536 : SRF_RETURN_NEXT(funcctx, result);
581 : }
582 : else
583 2 : SRF_RETURN_DONE(funcctx);
584 : }
585 :
586 : /*
587 : * pg_buffercache_os_pages
588 : *
589 : * Retrieve information about OS pages, with or without NUMA information.
590 : */
591 : Datum
592 65538 : pg_buffercache_os_pages(PG_FUNCTION_ARGS)
593 : {
594 : bool include_numa;
595 :
596 : /* Get the boolean parameter that controls the NUMA behavior. */
597 65538 : include_numa = PG_GETARG_BOOL(0);
598 :
599 65538 : return pg_buffercache_os_pages_internal(fcinfo, include_numa);
600 : }
601 :
602 : /* Backward-compatible wrapper for v1.6. */
603 : Datum
604 0 : pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
605 : {
606 : /* Call internal function with include_numa=true */
607 0 : return pg_buffercache_os_pages_internal(fcinfo, true);
608 : }
609 :
610 : Datum
611 2 : pg_buffercache_summary(PG_FUNCTION_ARGS)
612 : {
613 : Datum result;
614 : TupleDesc tupledesc;
615 : HeapTuple tuple;
616 : Datum values[NUM_BUFFERCACHE_SUMMARY_ELEM];
617 : bool nulls[NUM_BUFFERCACHE_SUMMARY_ELEM];
618 :
619 2 : int32 buffers_used = 0;
620 2 : int32 buffers_unused = 0;
621 2 : int32 buffers_dirty = 0;
622 2 : int32 buffers_pinned = 0;
623 2 : int64 usagecount_total = 0;
624 :
625 2 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
626 0 : elog(ERROR, "return type must be a row type");
627 :
628 32770 : for (int i = 0; i < NBuffers; i++)
629 : {
630 : BufferDesc *bufHdr;
631 : uint64 buf_state;
632 :
633 32768 : CHECK_FOR_INTERRUPTS();
634 :
635 : /*
636 : * This function summarizes the state of all headers. Locking the
637 : * buffer headers wouldn't provide an improved result as the state of
638 : * the buffer can still change after we release the lock and it'd
639 : * noticeably increase the cost of the function.
640 : */
641 32768 : bufHdr = GetBufferDescriptor(i);
642 32768 : buf_state = pg_atomic_read_u64(&bufHdr->state);
643 :
644 32768 : if (buf_state & BM_VALID)
645 : {
646 4106 : buffers_used++;
647 4106 : usagecount_total += BUF_STATE_GET_USAGECOUNT(buf_state);
648 :
649 4106 : if (buf_state & BM_DIRTY)
650 1954 : buffers_dirty++;
651 : }
652 : else
653 28662 : buffers_unused++;
654 :
655 32768 : if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
656 0 : buffers_pinned++;
657 : }
658 :
659 2 : memset(nulls, 0, sizeof(nulls));
660 2 : values[0] = Int32GetDatum(buffers_used);
661 2 : values[1] = Int32GetDatum(buffers_unused);
662 2 : values[2] = Int32GetDatum(buffers_dirty);
663 2 : values[3] = Int32GetDatum(buffers_pinned);
664 :
665 2 : if (buffers_used != 0)
666 2 : values[4] = Float8GetDatum((double) usagecount_total / buffers_used);
667 : else
668 0 : nulls[4] = true;
669 :
670 : /* Build and return the tuple. */
671 2 : tuple = heap_form_tuple(tupledesc, values, nulls);
672 2 : result = HeapTupleGetDatum(tuple);
673 :
674 2 : PG_RETURN_DATUM(result);
675 : }
676 :
677 : Datum
678 2 : pg_buffercache_usage_counts(PG_FUNCTION_ARGS)
679 : {
680 2 : ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
681 2 : int usage_counts[BM_MAX_USAGE_COUNT + 1] = {0};
682 2 : int dirty[BM_MAX_USAGE_COUNT + 1] = {0};
683 2 : int pinned[BM_MAX_USAGE_COUNT + 1] = {0};
684 : Datum values[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM];
685 2 : bool nulls[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM] = {0};
686 :
687 2 : InitMaterializedSRF(fcinfo, 0);
688 :
689 32770 : for (int i = 0; i < NBuffers; i++)
690 : {
691 32768 : BufferDesc *bufHdr = GetBufferDescriptor(i);
692 32768 : uint64 buf_state = pg_atomic_read_u64(&bufHdr->state);
693 : int usage_count;
694 :
695 32768 : CHECK_FOR_INTERRUPTS();
696 :
697 32768 : usage_count = BUF_STATE_GET_USAGECOUNT(buf_state);
698 32768 : usage_counts[usage_count]++;
699 :
700 32768 : if (buf_state & BM_DIRTY)
701 1954 : dirty[usage_count]++;
702 :
703 32768 : if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
704 0 : pinned[usage_count]++;
705 : }
706 :
707 14 : for (int i = 0; i < BM_MAX_USAGE_COUNT + 1; i++)
708 : {
709 12 : values[0] = Int32GetDatum(i);
710 12 : values[1] = Int32GetDatum(usage_counts[i]);
711 12 : values[2] = Int32GetDatum(dirty[i]);
712 12 : values[3] = Int32GetDatum(pinned[i]);
713 :
714 12 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
715 : }
716 :
717 2 : return (Datum) 0;
718 : }
719 :
720 : /*
721 : * Helper function to check if the user has superuser privileges.
722 : */
723 : static void
724 20 : pg_buffercache_superuser_check(char *func_name)
725 : {
726 20 : if (!superuser())
727 6 : ereport(ERROR,
728 : (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
729 : errmsg("must be superuser to use %s()",
730 : func_name)));
731 14 : }
732 :
733 : /*
734 : * Try to evict a shared buffer.
735 : */
736 : Datum
737 5 : pg_buffercache_evict(PG_FUNCTION_ARGS)
738 : {
739 : Datum result;
740 : TupleDesc tupledesc;
741 : HeapTuple tuple;
742 : Datum values[NUM_BUFFERCACHE_EVICT_ELEM];
743 5 : bool nulls[NUM_BUFFERCACHE_EVICT_ELEM] = {0};
744 :
745 5 : Buffer buf = PG_GETARG_INT32(0);
746 : bool buffer_flushed;
747 :
748 5 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
749 0 : elog(ERROR, "return type must be a row type");
750 :
751 5 : pg_buffercache_superuser_check("pg_buffercache_evict");
752 :
753 4 : if (buf < 1 || buf > NBuffers)
754 3 : elog(ERROR, "bad buffer ID: %d", buf);
755 :
756 1 : values[0] = BoolGetDatum(EvictUnpinnedBuffer(buf, &buffer_flushed));
757 1 : values[1] = BoolGetDatum(buffer_flushed);
758 :
759 1 : tuple = heap_form_tuple(tupledesc, values, nulls);
760 1 : result = HeapTupleGetDatum(tuple);
761 :
762 1 : PG_RETURN_DATUM(result);
763 : }
764 :
765 : /*
766 : * Try to evict specified relation.
767 : */
768 : Datum
769 3 : pg_buffercache_evict_relation(PG_FUNCTION_ARGS)
770 : {
771 : Datum result;
772 : TupleDesc tupledesc;
773 : HeapTuple tuple;
774 : Datum values[NUM_BUFFERCACHE_EVICT_RELATION_ELEM];
775 3 : bool nulls[NUM_BUFFERCACHE_EVICT_RELATION_ELEM] = {0};
776 :
777 : Oid relOid;
778 : Relation rel;
779 :
780 3 : int32 buffers_evicted = 0;
781 3 : int32 buffers_flushed = 0;
782 3 : int32 buffers_skipped = 0;
783 :
784 3 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
785 0 : elog(ERROR, "return type must be a row type");
786 :
787 3 : pg_buffercache_superuser_check("pg_buffercache_evict_relation");
788 :
789 2 : relOid = PG_GETARG_OID(0);
790 :
791 2 : rel = relation_open(relOid, AccessShareLock);
792 :
793 2 : if (RelationUsesLocalBuffers(rel))
794 1 : ereport(ERROR,
795 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
796 : errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
797 : "pg_buffercache_evict_relation")));
798 :
799 1 : EvictRelUnpinnedBuffers(rel, &buffers_evicted, &buffers_flushed,
800 : &buffers_skipped);
801 :
802 1 : relation_close(rel, AccessShareLock);
803 :
804 1 : values[0] = Int32GetDatum(buffers_evicted);
805 1 : values[1] = Int32GetDatum(buffers_flushed);
806 1 : values[2] = Int32GetDatum(buffers_skipped);
807 :
808 1 : tuple = heap_form_tuple(tupledesc, values, nulls);
809 1 : result = HeapTupleGetDatum(tuple);
810 :
811 1 : PG_RETURN_DATUM(result);
812 : }
813 :
814 :
815 : /*
816 : * Try to evict all shared buffers.
817 : */
818 : Datum
819 2 : pg_buffercache_evict_all(PG_FUNCTION_ARGS)
820 : {
821 : Datum result;
822 : TupleDesc tupledesc;
823 : HeapTuple tuple;
824 : Datum values[NUM_BUFFERCACHE_EVICT_ALL_ELEM];
825 2 : bool nulls[NUM_BUFFERCACHE_EVICT_ALL_ELEM] = {0};
826 :
827 2 : int32 buffers_evicted = 0;
828 2 : int32 buffers_flushed = 0;
829 2 : int32 buffers_skipped = 0;
830 :
831 2 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
832 0 : elog(ERROR, "return type must be a row type");
833 :
834 2 : pg_buffercache_superuser_check("pg_buffercache_evict_all");
835 :
836 1 : EvictAllUnpinnedBuffers(&buffers_evicted, &buffers_flushed,
837 : &buffers_skipped);
838 :
839 1 : values[0] = Int32GetDatum(buffers_evicted);
840 1 : values[1] = Int32GetDatum(buffers_flushed);
841 1 : values[2] = Int32GetDatum(buffers_skipped);
842 :
843 1 : tuple = heap_form_tuple(tupledesc, values, nulls);
844 1 : result = HeapTupleGetDatum(tuple);
845 :
846 1 : PG_RETURN_DATUM(result);
847 : }
848 :
849 : /*
850 : * Try to mark a shared buffer as dirty.
851 : */
852 : Datum
853 5 : pg_buffercache_mark_dirty(PG_FUNCTION_ARGS)
854 : {
855 :
856 : Datum result;
857 : TupleDesc tupledesc;
858 : HeapTuple tuple;
859 : Datum values[NUM_BUFFERCACHE_MARK_DIRTY_ELEM];
860 5 : bool nulls[NUM_BUFFERCACHE_MARK_DIRTY_ELEM] = {0};
861 :
862 5 : Buffer buf = PG_GETARG_INT32(0);
863 : bool buffer_already_dirty;
864 :
865 5 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
866 0 : elog(ERROR, "return type must be a row type");
867 :
868 5 : pg_buffercache_superuser_check("pg_buffercache_mark_dirty");
869 :
870 4 : if (buf < 1 || buf > NBuffers)
871 3 : elog(ERROR, "bad buffer ID: %d", buf);
872 :
873 1 : values[0] = BoolGetDatum(MarkDirtyUnpinnedBuffer(buf, &buffer_already_dirty));
874 1 : values[1] = BoolGetDatum(buffer_already_dirty);
875 :
876 1 : tuple = heap_form_tuple(tupledesc, values, nulls);
877 1 : result = HeapTupleGetDatum(tuple);
878 :
879 1 : PG_RETURN_DATUM(result);
880 : }
881 :
882 : /*
883 : * Try to mark all the shared buffers of a relation as dirty.
884 : */
885 : Datum
886 3 : pg_buffercache_mark_dirty_relation(PG_FUNCTION_ARGS)
887 : {
888 : Datum result;
889 : TupleDesc tupledesc;
890 : HeapTuple tuple;
891 : Datum values[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM];
892 3 : bool nulls[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM] = {0};
893 :
894 : Oid relOid;
895 : Relation rel;
896 :
897 3 : int32 buffers_already_dirty = 0;
898 3 : int32 buffers_dirtied = 0;
899 3 : int32 buffers_skipped = 0;
900 :
901 3 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
902 0 : elog(ERROR, "return type must be a row type");
903 :
904 3 : pg_buffercache_superuser_check("pg_buffercache_mark_dirty_relation");
905 :
906 2 : relOid = PG_GETARG_OID(0);
907 :
908 2 : rel = relation_open(relOid, AccessShareLock);
909 :
910 2 : if (RelationUsesLocalBuffers(rel))
911 1 : ereport(ERROR,
912 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
913 : errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
914 : "pg_buffercache_mark_dirty_relation")));
915 :
916 1 : MarkDirtyRelUnpinnedBuffers(rel, &buffers_dirtied, &buffers_already_dirty,
917 : &buffers_skipped);
918 :
919 1 : relation_close(rel, AccessShareLock);
920 :
921 1 : values[0] = Int32GetDatum(buffers_dirtied);
922 1 : values[1] = Int32GetDatum(buffers_already_dirty);
923 1 : values[2] = Int32GetDatum(buffers_skipped);
924 :
925 1 : tuple = heap_form_tuple(tupledesc, values, nulls);
926 1 : result = HeapTupleGetDatum(tuple);
927 :
928 1 : PG_RETURN_DATUM(result);
929 : }
930 :
931 : /*
932 : * Try to mark all the shared buffers as dirty.
933 : */
934 : Datum
935 2 : pg_buffercache_mark_dirty_all(PG_FUNCTION_ARGS)
936 : {
937 : Datum result;
938 : TupleDesc tupledesc;
939 : HeapTuple tuple;
940 : Datum values[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM];
941 2 : bool nulls[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM] = {0};
942 :
943 2 : int32 buffers_already_dirty = 0;
944 2 : int32 buffers_dirtied = 0;
945 2 : int32 buffers_skipped = 0;
946 :
947 2 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
948 0 : elog(ERROR, "return type must be a row type");
949 :
950 2 : pg_buffercache_superuser_check("pg_buffercache_mark_dirty_all");
951 :
952 1 : MarkDirtyAllUnpinnedBuffers(&buffers_dirtied, &buffers_already_dirty,
953 : &buffers_skipped);
954 :
955 1 : values[0] = Int32GetDatum(buffers_dirtied);
956 1 : values[1] = Int32GetDatum(buffers_already_dirty);
957 1 : values[2] = Int32GetDatum(buffers_skipped);
958 :
959 1 : tuple = heap_form_tuple(tupledesc, values, nulls);
960 1 : result = HeapTupleGetDatum(tuple);
961 :
962 1 : PG_RETURN_DATUM(result);
963 : }
|