Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_buffercache_pages.c
4 : * display some contents of the buffer cache
5 : *
6 : * contrib/pg_buffercache/pg_buffercache_pages.c
7 : *-------------------------------------------------------------------------
8 : */
9 : #include "postgres.h"
10 :
11 : #include "access/htup_details.h"
12 : #include "access/relation.h"
13 : #include "catalog/pg_type.h"
14 : #include "funcapi.h"
15 : #include "port/pg_numa.h"
16 : #include "storage/buf_internals.h"
17 : #include "storage/bufmgr.h"
18 : #include "utils/rel.h"
19 :
20 :
/*
 * Numbers of output columns for the SQL-callable functions below.  The
 * PAGES function accepts a range because v1.0 callers lack the
 * pinning_backends column (see pg_buffercache_pages()).
 */
#define NUM_BUFFERCACHE_PAGES_MIN_ELEM	8	/* v1.0, without pinning_backends */
#define NUM_BUFFERCACHE_PAGES_ELEM	9
#define NUM_BUFFERCACHE_SUMMARY_ELEM 5
#define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
#define NUM_BUFFERCACHE_EVICT_ELEM	2
#define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3
#define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3

#define NUM_BUFFERCACHE_NUMA_ELEM	3
30 :
31 2 : PG_MODULE_MAGIC_EXT(
32 : .name = "pg_buffercache",
33 : .version = PG_VERSION
34 : );
35 :
/*
 * Record structure holding the cache data to be exposed, one entry per
 * shared buffer (filled under the buffer header lock).
 */
typedef struct
{
	uint32		bufferid;		/* buffer number (1-based) */
	RelFileNumber relfilenumber;	/* relation file number from buffer tag */
	Oid			reltablespace;	/* tablespace OID from buffer tag */
	Oid			reldatabase;	/* database OID from buffer tag */
	ForkNumber	forknum;		/* relation fork from buffer tag */
	BlockNumber blocknum;		/* block number within the fork */
	bool		isvalid;		/* BM_VALID and BM_TAG_VALID both set? */
	bool		isdirty;		/* BM_DIRTY set? */
	uint16		usagecount;		/* BUF_STATE_GET_USAGECOUNT() of the buffer */

	/*
	 * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
	 * being pinned by too many backends and each backend will only pin once
	 * because of bufmgr.c's PrivateRefCount infrastructure.
	 */
	int32		pinning_backends;
} BufferCachePagesRec;
58 :
59 :
/*
 * Function context for data persisting over repeated calls to
 * pg_buffercache_pages().
 */
typedef struct
{
	TupleDesc	tupdesc;		/* blessed descriptor for the result rows */
	BufferCachePagesRec *record;	/* snapshot array, one entry per buffer */
} BufferCachePagesContext;
68 :
/*
 * Record structure holding the NUMA data to be exposed, one entry per
 * (buffer, OS memory page) combination.
 */
typedef struct
{
	uint32		bufferid;		/* buffer number (1-based) */
	int64		page_num;		/* OS memory page number, relative to the
								 * first page of shared buffers */
	int32		numa_node;		/* NUMA node ID reported for that page */
} BufferCacheNumaRec;
78 :
/*
 * Function context for data persisting over repeated calls to
 * pg_buffercache_numa_pages().
 */
typedef struct
{
	TupleDesc	tupdesc;		/* blessed descriptor for the result rows */
	/*
	 * NOTE(review): buffers_per_page, pages_per_buffer and os_page_size are
	 * never assigned in this file — possibly vestigial; verify before use.
	 */
	int			buffers_per_page;
	int			pages_per_buffer;
	int			os_page_size;
	BufferCacheNumaRec *record;	/* entries built on the first SRF call */
} BufferCacheNumaContext;
90 :
91 :
/*
 * SQL-callable entry points for this module.  pg_buffercache_pages() returns
 * per-buffer data from the shared buffer cache - buffer number, relation
 * node/tablespace/database/blocknum and dirty indicator; the others provide
 * summaries, usage-count histograms, NUMA mappings, and buffer eviction.
 */
96 4 : PG_FUNCTION_INFO_V1(pg_buffercache_pages);
97 2 : PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
98 4 : PG_FUNCTION_INFO_V1(pg_buffercache_summary);
99 4 : PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
100 6 : PG_FUNCTION_INFO_V1(pg_buffercache_evict);
101 4 : PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation);
102 4 : PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
103 :
104 :
105 : /* Only need to touch memory once per backend process lifetime */
106 : static bool firstNumaTouch = true;
107 :
108 :
/*
 * pg_buffercache_pages
 *
 * Set-returning function yielding one row per shared buffer: buffer id,
 * relation identity (relfilenode/tablespace/database/fork/block), dirty
 * flag, usage count and - when the caller's result type has 9 columns -
 * the number of pinning backends.
 *
 * On the first call all NBuffers headers are snapshotted into an array in
 * the multi-call memory context; later calls only emit one tuple apiece.
 */
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;
	Datum		result;
	MemoryContext oldcontext;
	BufferCachePagesContext *fctx;	/* User function context. */
	TupleDesc	tupledesc;
	TupleDesc	expected_tupledesc;
	HeapTuple	tuple;

	if (SRF_IS_FIRSTCALL())
	{
		int			i;

		funcctx = SRF_FIRSTCALL_INIT();

		/* Switch context when allocating stuff to be used in later calls */
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

		/* Create a user function context for cross-call persistence */
		fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));

		/*
		 * To smoothly support upgrades from version 1.0 of this extension
		 * transparently handle the (non-)existence of the pinning_backends
		 * column. We unfortunately have to get the result type for that... -
		 * we can't use the result type determined by the function definition
		 * without potentially crashing when somebody uses the old (or even
		 * wrong) function definition though.
		 */
		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
			elog(ERROR, "return type must be a row type");

		if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
			expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
			elog(ERROR, "incorrect number of output arguments");

		/* Construct a tuple descriptor for the result rows. */
		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
						   INT4OID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
						   OIDOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
						   OIDOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
						   OIDOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
						   INT2OID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
						   INT8OID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
						   BOOLOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
						   INT2OID, -1, 0);

		/* The ninth column exists only for post-1.0 callers. */
		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
			TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
							   INT4OID, -1, 0);

		fctx->tupdesc = BlessTupleDesc(tupledesc);

		/*
		 * Allocate NBuffers worth of BufferCachePagesRec records.  The
		 * "huge" variant is used because with a very large shared_buffers
		 * this can exceed the normal palloc allocation limit.
		 */
		fctx->record = (BufferCachePagesRec *)
			MemoryContextAllocHuge(CurrentMemoryContext,
								   sizeof(BufferCachePagesRec) * NBuffers);

		/* Set max calls and remember the user function context. */
		funcctx->max_calls = NBuffers;
		funcctx->user_fctx = fctx;

		/* Return to original context when allocating transient memory */
		MemoryContextSwitchTo(oldcontext);

		/*
		 * Scan through all the buffers, saving the relevant fields in the
		 * fctx->record structure.
		 *
		 * We don't hold the partition locks, so we don't get a consistent
		 * snapshot across all buffers, but we do grab the buffer header
		 * locks, so the information of each buffer is self-consistent.
		 */
		for (i = 0; i < NBuffers; i++)
		{
			BufferDesc *bufHdr;
			uint32		buf_state;

			CHECK_FOR_INTERRUPTS();

			bufHdr = GetBufferDescriptor(i);
			/* Lock each buffer header before inspecting. */
			buf_state = LockBufHdr(bufHdr);

			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
			fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
			fctx->record[i].reltablespace = bufHdr->tag.spcOid;
			fctx->record[i].reldatabase = bufHdr->tag.dbOid;
			fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
			fctx->record[i].blocknum = bufHdr->tag.blockNum;
			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);

			if (buf_state & BM_DIRTY)
				fctx->record[i].isdirty = true;
			else
				fctx->record[i].isdirty = false;

			/* Note if the buffer is valid, and has storage created */
			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
				fctx->record[i].isvalid = true;
			else
				fctx->record[i].isvalid = false;

			UnlockBufHdr(bufHdr);
		}
	}

	funcctx = SRF_PERCALL_SETUP();

	/* Get the saved state */
	fctx = funcctx->user_fctx;

	if (funcctx->call_cntr < funcctx->max_calls)
	{
		uint32		i = funcctx->call_cntr;
		Datum		values[NUM_BUFFERCACHE_PAGES_ELEM];
		bool		nulls[NUM_BUFFERCACHE_PAGES_ELEM];

		values[0] = Int32GetDatum(fctx->record[i].bufferid);
		nulls[0] = false;

		/*
		 * Set all fields except the bufferid to null if the buffer is unused
		 * or not valid.
		 */
		if (fctx->record[i].blocknum == InvalidBlockNumber ||
			fctx->record[i].isvalid == false)
		{
			nulls[1] = true;
			nulls[2] = true;
			nulls[3] = true;
			nulls[4] = true;
			nulls[5] = true;
			nulls[6] = true;
			nulls[7] = true;
			/* unused for v1.0 callers, but the array is always long enough */
			nulls[8] = true;
		}
		else
		{
			values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
			nulls[1] = false;
			values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
			nulls[2] = false;
			values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
			nulls[3] = false;
			values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
			nulls[4] = false;
			values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
			nulls[5] = false;
			values[6] = BoolGetDatum(fctx->record[i].isdirty);
			nulls[6] = false;
			values[7] = Int16GetDatum(fctx->record[i].usagecount);
			nulls[7] = false;
			/* unused for v1.0 callers, but the array is always long enough */
			values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
			nulls[8] = false;
		}

		/* Build and return the tuple. */
		tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
		result = HeapTupleGetDatum(tuple);

		SRF_RETURN_NEXT(funcctx, result);
	}
	else
		SRF_RETURN_DONE(funcctx);
}
288 :
289 : /*
290 : * Inquire about NUMA memory mappings for shared buffers.
291 : *
292 : * Returns NUMA node ID for each memory page used by the buffer. Buffers may
293 : * be smaller or larger than OS memory pages. For each buffer we return one
294 : * entry for each memory page used by the buffer (if the buffer is smaller,
295 : * it only uses a part of one memory page).
296 : *
297 : * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
298 : * one is always a multiple of the other.
299 : *
300 : * In order to get reliable results we also need to touch memory pages, so
301 : * that the inquiry about NUMA memory node doesn't return -2 (which indicates
302 : * unmapped/unallocated pages).
303 : */
304 : Datum
305 0 : pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
306 : {
307 : FuncCallContext *funcctx;
308 : MemoryContext oldcontext;
309 : BufferCacheNumaContext *fctx; /* User function context. */
310 : TupleDesc tupledesc;
311 : TupleDesc expected_tupledesc;
312 : HeapTuple tuple;
313 : Datum result;
314 :
315 0 : if (SRF_IS_FIRSTCALL())
316 : {
317 : int i,
318 : idx;
319 : Size os_page_size;
320 : void **os_page_ptrs;
321 : int *os_page_status;
322 : uint64 os_page_count;
323 : int pages_per_buffer;
324 : int max_entries;
325 : char *startptr,
326 : *endptr;
327 :
328 0 : if (pg_numa_init() == -1)
329 0 : elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
330 :
331 : /*
332 : * The database block size and OS memory page size are unlikely to be
333 : * the same. The block size is 1-32KB, the memory page size depends on
334 : * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
335 : * there are also features like THP etc. Moreover, we don't quite know
336 : * how the pages and buffers "align" in memory - the buffers may be
337 : * shifted in some way, using more memory pages than necessary.
338 : *
339 : * So we need to be careful about mapping buffers to memory pages. We
340 : * calculate the maximum number of pages a buffer might use, so that
341 : * we allocate enough space for the entries. And then we count the
342 : * actual number of entries as we scan the buffers.
343 : *
344 : * This information is needed before calling move_pages() for NUMA
345 : * node id inquiry.
346 : */
347 0 : os_page_size = pg_get_shmem_pagesize();
348 :
349 : /*
350 : * The pages and block size is expected to be 2^k, so one divides the
351 : * other (we don't know in which direction). This does not say
352 : * anything about relative alignment of pages/buffers.
353 : */
354 : Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));
355 :
356 : /*
357 : * How many addresses we are going to query? Simply get the page for
358 : * the first buffer, and first page after the last buffer, and count
359 : * the pages from that.
360 : */
361 0 : startptr = (char *) TYPEALIGN_DOWN(os_page_size,
362 : BufferGetBlock(1));
363 0 : endptr = (char *) TYPEALIGN(os_page_size,
364 : (char *) BufferGetBlock(NBuffers) + BLCKSZ);
365 0 : os_page_count = (endptr - startptr) / os_page_size;
366 :
367 : /* Used to determine the NUMA node for all OS pages at once */
368 0 : os_page_ptrs = palloc0(sizeof(void *) * os_page_count);
369 0 : os_page_status = palloc(sizeof(uint64) * os_page_count);
370 :
371 : /* Fill pointers for all the memory pages. */
372 0 : idx = 0;
373 0 : for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
374 : {
375 0 : os_page_ptrs[idx++] = ptr;
376 :
377 : /* Only need to touch memory once per backend process lifetime */
378 0 : if (firstNumaTouch)
379 : pg_numa_touch_mem_if_required(ptr);
380 : }
381 :
382 : Assert(idx == os_page_count);
383 :
384 0 : elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
385 : "os_page_size=%zu", NBuffers, os_page_count, os_page_size);
386 :
387 : /*
388 : * If we ever get 0xff back from kernel inquiry, then we probably have
389 : * bug in our buffers to OS page mapping code here.
390 : */
391 0 : memset(os_page_status, 0xff, sizeof(int) * os_page_count);
392 :
393 : /* Query NUMA status for all the pointers */
394 0 : if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
395 0 : elog(ERROR, "failed NUMA pages inquiry: %m");
396 :
397 : /* Initialize the multi-call context, load entries about buffers */
398 :
399 0 : funcctx = SRF_FIRSTCALL_INIT();
400 :
401 : /* Switch context when allocating stuff to be used in later calls */
402 0 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
403 :
404 : /* Create a user function context for cross-call persistence */
405 0 : fctx = (BufferCacheNumaContext *) palloc(sizeof(BufferCacheNumaContext));
406 :
407 0 : if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
408 0 : elog(ERROR, "return type must be a row type");
409 :
410 0 : if (expected_tupledesc->natts != NUM_BUFFERCACHE_NUMA_ELEM)
411 0 : elog(ERROR, "incorrect number of output arguments");
412 :
413 : /* Construct a tuple descriptor for the result rows. */
414 0 : tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
415 0 : TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
416 : INT4OID, -1, 0);
417 0 : TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
418 : INT8OID, -1, 0);
419 0 : TupleDescInitEntry(tupledesc, (AttrNumber) 3, "numa_node",
420 : INT4OID, -1, 0);
421 :
422 0 : fctx->tupdesc = BlessTupleDesc(tupledesc);
423 :
424 : /*
425 : * Each buffer needs at least one entry, but it might be offset in
426 : * some way, and use one extra entry. So we allocate space for the
427 : * maximum number of entries we might need, and then count the exact
428 : * number as we're walking buffers. That way we can do it in one pass,
429 : * without reallocating memory.
430 : */
431 0 : pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
432 0 : max_entries = NBuffers * pages_per_buffer;
433 :
434 : /* Allocate entries for BufferCachePagesRec records. */
435 0 : fctx->record = (BufferCacheNumaRec *)
436 0 : MemoryContextAllocHuge(CurrentMemoryContext,
437 : sizeof(BufferCacheNumaRec) * max_entries);
438 :
439 : /* Return to original context when allocating transient memory */
440 0 : MemoryContextSwitchTo(oldcontext);
441 :
442 0 : if (firstNumaTouch)
443 0 : elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
444 :
445 : /*
446 : * Scan through all the buffers, saving the relevant fields in the
447 : * fctx->record structure.
448 : *
449 : * We don't hold the partition locks, so we don't get a consistent
450 : * snapshot across all buffers, but we do grab the buffer header
451 : * locks, so the information of each buffer is self-consistent.
452 : *
453 : * This loop touches and stores addresses into os_page_ptrs[] as input
454 : * to one big move_pages(2) inquiry system call. Basically we ask for
455 : * all memory pages for NBuffers.
456 : */
457 0 : startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
458 0 : idx = 0;
459 0 : for (i = 0; i < NBuffers; i++)
460 : {
461 0 : char *buffptr = (char *) BufferGetBlock(i + 1);
462 : BufferDesc *bufHdr;
463 : uint32 bufferid;
464 : int32 page_num;
465 : char *startptr_buff,
466 : *endptr_buff;
467 :
468 0 : CHECK_FOR_INTERRUPTS();
469 :
470 0 : bufHdr = GetBufferDescriptor(i);
471 :
472 : /* Lock each buffer header before inspecting. */
473 0 : LockBufHdr(bufHdr);
474 0 : bufferid = BufferDescriptorGetBuffer(bufHdr);
475 0 : UnlockBufHdr(bufHdr);
476 :
477 : /* start of the first page of this buffer */
478 0 : startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
479 :
480 : /* end of the buffer (no need to align to memory page) */
481 0 : endptr_buff = buffptr + BLCKSZ;
482 :
483 : Assert(startptr_buff < endptr_buff);
484 :
485 : /* calculate ID of the first page for this buffer */
486 0 : page_num = (startptr_buff - startptr) / os_page_size;
487 :
488 : /* Add an entry for each OS page overlapping with this buffer. */
489 0 : for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
490 : {
491 0 : fctx->record[idx].bufferid = bufferid;
492 0 : fctx->record[idx].page_num = page_num;
493 0 : fctx->record[idx].numa_node = os_page_status[page_num];
494 :
495 : /* advance to the next entry/page */
496 0 : ++idx;
497 0 : ++page_num;
498 : }
499 : }
500 :
501 : Assert((idx >= os_page_count) && (idx <= max_entries));
502 :
503 : /* Set max calls and remember the user function context. */
504 0 : funcctx->max_calls = idx;
505 0 : funcctx->user_fctx = fctx;
506 :
507 : /* Remember this backend touched the pages */
508 0 : firstNumaTouch = false;
509 : }
510 :
511 0 : funcctx = SRF_PERCALL_SETUP();
512 :
513 : /* Get the saved state */
514 0 : fctx = funcctx->user_fctx;
515 :
516 0 : if (funcctx->call_cntr < funcctx->max_calls)
517 : {
518 0 : uint32 i = funcctx->call_cntr;
519 : Datum values[NUM_BUFFERCACHE_NUMA_ELEM];
520 : bool nulls[NUM_BUFFERCACHE_NUMA_ELEM];
521 :
522 0 : values[0] = Int32GetDatum(fctx->record[i].bufferid);
523 0 : nulls[0] = false;
524 :
525 0 : values[1] = Int64GetDatum(fctx->record[i].page_num);
526 0 : nulls[1] = false;
527 :
528 0 : values[2] = Int32GetDatum(fctx->record[i].numa_node);
529 0 : nulls[2] = false;
530 :
531 : /* Build and return the tuple. */
532 0 : tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
533 0 : result = HeapTupleGetDatum(tuple);
534 :
535 0 : SRF_RETURN_NEXT(funcctx, result);
536 : }
537 : else
538 0 : SRF_RETURN_DONE(funcctx);
539 : }
540 :
541 : Datum
542 4 : pg_buffercache_summary(PG_FUNCTION_ARGS)
543 : {
544 : Datum result;
545 : TupleDesc tupledesc;
546 : HeapTuple tuple;
547 : Datum values[NUM_BUFFERCACHE_SUMMARY_ELEM];
548 : bool nulls[NUM_BUFFERCACHE_SUMMARY_ELEM];
549 :
550 4 : int32 buffers_used = 0;
551 4 : int32 buffers_unused = 0;
552 4 : int32 buffers_dirty = 0;
553 4 : int32 buffers_pinned = 0;
554 4 : int64 usagecount_total = 0;
555 :
556 4 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
557 0 : elog(ERROR, "return type must be a row type");
558 :
559 65540 : for (int i = 0; i < NBuffers; i++)
560 : {
561 : BufferDesc *bufHdr;
562 : uint32 buf_state;
563 :
564 65536 : CHECK_FOR_INTERRUPTS();
565 :
566 : /*
567 : * This function summarizes the state of all headers. Locking the
568 : * buffer headers wouldn't provide an improved result as the state of
569 : * the buffer can still change after we release the lock and it'd
570 : * noticeably increase the cost of the function.
571 : */
572 65536 : bufHdr = GetBufferDescriptor(i);
573 65536 : buf_state = pg_atomic_read_u32(&bufHdr->state);
574 :
575 65536 : if (buf_state & BM_VALID)
576 : {
577 7980 : buffers_used++;
578 7980 : usagecount_total += BUF_STATE_GET_USAGECOUNT(buf_state);
579 :
580 7980 : if (buf_state & BM_DIRTY)
581 3792 : buffers_dirty++;
582 : }
583 : else
584 57556 : buffers_unused++;
585 :
586 65536 : if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
587 0 : buffers_pinned++;
588 : }
589 :
590 4 : memset(nulls, 0, sizeof(nulls));
591 4 : values[0] = Int32GetDatum(buffers_used);
592 4 : values[1] = Int32GetDatum(buffers_unused);
593 4 : values[2] = Int32GetDatum(buffers_dirty);
594 4 : values[3] = Int32GetDatum(buffers_pinned);
595 :
596 4 : if (buffers_used != 0)
597 4 : values[4] = Float8GetDatum((double) usagecount_total / buffers_used);
598 : else
599 0 : nulls[4] = true;
600 :
601 : /* Build and return the tuple. */
602 4 : tuple = heap_form_tuple(tupledesc, values, nulls);
603 4 : result = HeapTupleGetDatum(tuple);
604 :
605 4 : PG_RETURN_DATUM(result);
606 : }
607 :
608 : Datum
609 4 : pg_buffercache_usage_counts(PG_FUNCTION_ARGS)
610 : {
611 4 : ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
612 4 : int usage_counts[BM_MAX_USAGE_COUNT + 1] = {0};
613 4 : int dirty[BM_MAX_USAGE_COUNT + 1] = {0};
614 4 : int pinned[BM_MAX_USAGE_COUNT + 1] = {0};
615 : Datum values[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM];
616 4 : bool nulls[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM] = {0};
617 :
618 4 : InitMaterializedSRF(fcinfo, 0);
619 :
620 65540 : for (int i = 0; i < NBuffers; i++)
621 : {
622 65536 : BufferDesc *bufHdr = GetBufferDescriptor(i);
623 65536 : uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
624 : int usage_count;
625 :
626 65536 : CHECK_FOR_INTERRUPTS();
627 :
628 65536 : usage_count = BUF_STATE_GET_USAGECOUNT(buf_state);
629 65536 : usage_counts[usage_count]++;
630 :
631 65536 : if (buf_state & BM_DIRTY)
632 3792 : dirty[usage_count]++;
633 :
634 65536 : if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
635 0 : pinned[usage_count]++;
636 : }
637 :
638 28 : for (int i = 0; i < BM_MAX_USAGE_COUNT + 1; i++)
639 : {
640 24 : values[0] = Int32GetDatum(i);
641 24 : values[1] = Int32GetDatum(usage_counts[i]);
642 24 : values[2] = Int32GetDatum(dirty[i]);
643 24 : values[3] = Int32GetDatum(pinned[i]);
644 :
645 24 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
646 : }
647 :
648 4 : return (Datum) 0;
649 : }
650 :
651 : /*
652 : * Helper function to check if the user has superuser privileges.
653 : */
654 : static void
655 20 : pg_buffercache_superuser_check(char *func_name)
656 : {
657 20 : if (!superuser())
658 6 : ereport(ERROR,
659 : (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
660 : errmsg("must be superuser to use %s()",
661 : func_name)));
662 14 : }
663 :
664 : /*
665 : * Try to evict a shared buffer.
666 : */
667 : Datum
668 10 : pg_buffercache_evict(PG_FUNCTION_ARGS)
669 : {
670 : Datum result;
671 : TupleDesc tupledesc;
672 : HeapTuple tuple;
673 : Datum values[NUM_BUFFERCACHE_EVICT_ELEM];
674 10 : bool nulls[NUM_BUFFERCACHE_EVICT_ELEM] = {0};
675 :
676 10 : Buffer buf = PG_GETARG_INT32(0);
677 : bool buffer_flushed;
678 :
679 10 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
680 0 : elog(ERROR, "return type must be a row type");
681 :
682 10 : pg_buffercache_superuser_check("pg_buffercache_evict");
683 :
684 8 : if (buf < 1 || buf > NBuffers)
685 6 : elog(ERROR, "bad buffer ID: %d", buf);
686 :
687 2 : values[0] = BoolGetDatum(EvictUnpinnedBuffer(buf, &buffer_flushed));
688 2 : values[1] = BoolGetDatum(buffer_flushed);
689 :
690 2 : tuple = heap_form_tuple(tupledesc, values, nulls);
691 2 : result = HeapTupleGetDatum(tuple);
692 :
693 2 : PG_RETURN_DATUM(result);
694 : }
695 :
696 : /*
697 : * Try to evict specified relation.
698 : */
699 : Datum
700 6 : pg_buffercache_evict_relation(PG_FUNCTION_ARGS)
701 : {
702 : Datum result;
703 : TupleDesc tupledesc;
704 : HeapTuple tuple;
705 : Datum values[NUM_BUFFERCACHE_EVICT_RELATION_ELEM];
706 6 : bool nulls[NUM_BUFFERCACHE_EVICT_RELATION_ELEM] = {0};
707 :
708 : Oid relOid;
709 : Relation rel;
710 :
711 6 : int32 buffers_evicted = 0;
712 6 : int32 buffers_flushed = 0;
713 6 : int32 buffers_skipped = 0;
714 :
715 6 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
716 0 : elog(ERROR, "return type must be a row type");
717 :
718 6 : pg_buffercache_superuser_check("pg_buffercache_evict_relation");
719 :
720 4 : relOid = PG_GETARG_OID(0);
721 :
722 4 : rel = relation_open(relOid, AccessShareLock);
723 :
724 4 : if (RelationUsesLocalBuffers(rel))
725 2 : ereport(ERROR,
726 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
727 : errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
728 : "pg_buffercache_evict_relation")));
729 :
730 2 : EvictRelUnpinnedBuffers(rel, &buffers_evicted, &buffers_flushed,
731 : &buffers_skipped);
732 :
733 2 : relation_close(rel, AccessShareLock);
734 :
735 2 : values[0] = Int32GetDatum(buffers_evicted);
736 2 : values[1] = Int32GetDatum(buffers_flushed);
737 2 : values[2] = Int32GetDatum(buffers_skipped);
738 :
739 2 : tuple = heap_form_tuple(tupledesc, values, nulls);
740 2 : result = HeapTupleGetDatum(tuple);
741 :
742 2 : PG_RETURN_DATUM(result);
743 : }
744 :
745 :
746 : /*
747 : * Try to evict all shared buffers.
748 : */
749 : Datum
750 4 : pg_buffercache_evict_all(PG_FUNCTION_ARGS)
751 : {
752 : Datum result;
753 : TupleDesc tupledesc;
754 : HeapTuple tuple;
755 : Datum values[NUM_BUFFERCACHE_EVICT_ALL_ELEM];
756 4 : bool nulls[NUM_BUFFERCACHE_EVICT_ALL_ELEM] = {0};
757 :
758 4 : int32 buffers_evicted = 0;
759 4 : int32 buffers_flushed = 0;
760 4 : int32 buffers_skipped = 0;
761 :
762 4 : if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
763 0 : elog(ERROR, "return type must be a row type");
764 :
765 4 : pg_buffercache_superuser_check("pg_buffercache_evict_all");
766 :
767 2 : EvictAllUnpinnedBuffers(&buffers_evicted, &buffers_flushed,
768 : &buffers_skipped);
769 :
770 2 : values[0] = Int32GetDatum(buffers_evicted);
771 2 : values[1] = Int32GetDatum(buffers_flushed);
772 2 : values[2] = Int32GetDatum(buffers_skipped);
773 :
774 2 : tuple = heap_form_tuple(tupledesc, values, nulls);
775 2 : result = HeapTupleGetDatum(tuple);
776 :
777 2 : PG_RETURN_DATUM(result);
778 : }
|