LCOV - code coverage report
Current view: top level - contrib/pg_buffercache - pg_buffercache_pages.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 188 269 69.9 %
Date: 2025-09-10 22:18:18 Functions: 15 16 93.8 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * pg_buffercache_pages.c
       4             :  *    display some contents of the buffer cache
       5             :  *
       6             :  *    contrib/pg_buffercache/pg_buffercache_pages.c
       7             :  *-------------------------------------------------------------------------
       8             :  */
       9             : #include "postgres.h"
      10             : 
      11             : #include "access/htup_details.h"
      12             : #include "access/relation.h"
      13             : #include "catalog/pg_type.h"
      14             : #include "funcapi.h"
      15             : #include "port/pg_numa.h"
      16             : #include "storage/buf_internals.h"
      17             : #include "storage/bufmgr.h"
      18             : #include "utils/rel.h"
      19             : 
      20             : 
      21             : #define NUM_BUFFERCACHE_PAGES_MIN_ELEM  8
      22             : #define NUM_BUFFERCACHE_PAGES_ELEM  9
      23             : #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
      24             : #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
      25             : #define NUM_BUFFERCACHE_EVICT_ELEM 2
      26             : #define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3
      27             : #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
      28             : 
      29             : #define NUM_BUFFERCACHE_NUMA_ELEM   3
      30             : 
      31           2 : PG_MODULE_MAGIC_EXT(
      32             :                     .name = "pg_buffercache",
      33             :                     .version = PG_VERSION
      34             : );
      35             : 
/*
 * Record structure holding the to-be-exposed buffer cache data, one entry
 * per shared buffer.  Filled from the buffer headers on the first call of
 * pg_buffercache_pages() and converted into result tuples on later calls.
 */
typedef struct
{
    uint32      bufferid;       /* buffer number (BufferDescriptorGetBuffer) */
    RelFileNumber relfilenumber;    /* relation file number, from buffer tag */
    Oid         reltablespace;  /* tablespace OID, from buffer tag */
    Oid         reldatabase;    /* database OID, from buffer tag */
    ForkNumber  forknum;        /* relation fork, from buffer tag */
    BlockNumber blocknum;       /* block number within the fork */
    bool        isvalid;        /* BM_VALID and BM_TAG_VALID both set */
    bool        isdirty;        /* BM_DIRTY set */
    uint16      usagecount;     /* usage count from the buffer state word */

    /*
     * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
     * being pinned by too many backends and each backend will only pin once
     * because of bufmgr.c's PrivateRefCount infrastructure.
     */
    int32       pinning_backends;   /* refcount from the buffer state word */
} BufferCachePagesRec;
      58             : 
      59             : 
/*
 * Function context for pg_buffercache_pages(), persisting over repeated
 * SRF calls.  Allocated in the multi-call memory context on the first call.
 */
typedef struct
{
    TupleDesc   tupdesc;        /* blessed descriptor for result tuples */
    BufferCachePagesRec *record;    /* snapshot array, one entry per buffer */
} BufferCachePagesContext;
      68             : 
/*
 * Record structure holding the to-be-exposed NUMA mapping data: one entry
 * per (buffer, OS memory page) combination overlapping that buffer.
 */
typedef struct
{
    uint32      bufferid;       /* buffer number */
    int64       page_num;       /* OS page index within the buffer pool */
    int32       numa_node;      /* NUMA node ID reported for that OS page */
} BufferCacheNumaRec;
      78             : 
/*
 * Function context for pg_buffercache_numa_pages(), persisting over
 * repeated SRF calls.
 */
typedef struct
{
    TupleDesc   tupdesc;        /* blessed descriptor for result tuples */
    int         buffers_per_page;   /* NOTE(review): never assigned in this
                                     * file — verify before relying on it */
    int         pages_per_buffer;   /* NOTE(review): never assigned here;
                                     * a same-named local is used instead */
    int         os_page_size;   /* NOTE(review): never assigned here */
    BufferCacheNumaRec *record; /* entries collected on the first call */
} BufferCacheNumaContext;
      90             : 
      91             : 
      92             : /*
      93             :  * Function returning data from the shared buffer cache - buffer number,
      94             :  * relation node/tablespace/database/blocknum and dirty indicator.
      95             :  */
      96           4 : PG_FUNCTION_INFO_V1(pg_buffercache_pages);
      97           2 : PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
      98           4 : PG_FUNCTION_INFO_V1(pg_buffercache_summary);
      99           4 : PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
     100           6 : PG_FUNCTION_INFO_V1(pg_buffercache_evict);
     101           4 : PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation);
     102           4 : PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
     103             : 
     104             : 
     105             : /* Only need to touch memory once per backend process lifetime */
     106             : static bool firstNumaTouch = true;
     107             : 
     108             : 
/*
 * pg_buffercache_pages
 *
 * Set-returning function: emits one row per shared buffer, giving the
 * buffer's identity (relfilenode/tablespace/database/fork/block) and state
 * (dirty flag, usage count, and - for v1.1+ callers - pinning backends).
 * All buffer headers are scanned and snapshotted on the first call; later
 * calls only convert the saved records into tuples.
 */
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
    FuncCallContext *funcctx;
    Datum       result;
    MemoryContext oldcontext;
    BufferCachePagesContext *fctx;  /* User function context. */
    TupleDesc   tupledesc;
    TupleDesc   expected_tupledesc;
    HeapTuple   tuple;

    if (SRF_IS_FIRSTCALL())
    {
        int         i;

        funcctx = SRF_FIRSTCALL_INIT();

        /* Switch context when allocating stuff to be used in later calls */
        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

        /* Create a user function context for cross-call persistence */
        fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));

        /*
         * To smoothly support upgrades from version 1.0 of this extension
         * transparently handle the (non-)existence of the pinning_backends
         * column. We unfortunately have to get the result type for that... -
         * we can't use the result type determined by the function definition
         * without potentially crashing when somebody uses the old (or even
         * wrong) function definition though.
         */
        if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
            elog(ERROR, "return type must be a row type");

        /* Accept either the 8-column (v1.0) or 9-column (v1.1+) signature. */
        if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
            expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
            elog(ERROR, "incorrect number of output arguments");

        /* Construct a tuple descriptor for the result rows. */
        tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
        TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
                           INT4OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
                           INT2OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
                           INT8OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
                           BOOLOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
                           INT2OID, -1, 0);

        /* The ninth column only exists for v1.1+ callers. */
        if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
            TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
                               INT4OID, -1, 0);

        fctx->tupdesc = BlessTupleDesc(tupledesc);

        /* Allocate NBuffers worth of BufferCachePagesRec records. */
        fctx->record = (BufferCachePagesRec *)
            MemoryContextAllocHuge(CurrentMemoryContext,
                                   sizeof(BufferCachePagesRec) * NBuffers);

        /* Set max calls and remember the user function context. */
        funcctx->max_calls = NBuffers;
        funcctx->user_fctx = fctx;

        /* Return to original context when allocating transient memory */
        MemoryContextSwitchTo(oldcontext);

        /*
         * Scan through all the buffers, saving the relevant fields in the
         * fctx->record structure.
         *
         * We don't hold the partition locks, so we don't get a consistent
         * snapshot across all buffers, but we do grab the buffer header
         * locks, so the information of each buffer is self-consistent.
         */
        for (i = 0; i < NBuffers; i++)
        {
            BufferDesc *bufHdr;
            uint32      buf_state;

            CHECK_FOR_INTERRUPTS();

            bufHdr = GetBufferDescriptor(i);
            /* Lock each buffer header before inspecting. */
            buf_state = LockBufHdr(bufHdr);

            fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
            fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
            fctx->record[i].reltablespace = bufHdr->tag.spcOid;
            fctx->record[i].reldatabase = bufHdr->tag.dbOid;
            fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
            fctx->record[i].blocknum = bufHdr->tag.blockNum;
            fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
            fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);

            if (buf_state & BM_DIRTY)
                fctx->record[i].isdirty = true;
            else
                fctx->record[i].isdirty = false;

            /* Note if the buffer is valid, and has storage created */
            if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
                fctx->record[i].isvalid = true;
            else
                fctx->record[i].isvalid = false;

            UnlockBufHdr(bufHdr, buf_state);
        }
    }

    funcctx = SRF_PERCALL_SETUP();

    /* Get the saved state */
    fctx = funcctx->user_fctx;

    if (funcctx->call_cntr < funcctx->max_calls)
    {
        uint32      i = funcctx->call_cntr;
        Datum       values[NUM_BUFFERCACHE_PAGES_ELEM];
        bool        nulls[NUM_BUFFERCACHE_PAGES_ELEM];

        values[0] = Int32GetDatum(fctx->record[i].bufferid);
        nulls[0] = false;

        /*
         * Set all fields except the bufferid to null if the buffer is unused
         * or not valid.
         */
        if (fctx->record[i].blocknum == InvalidBlockNumber ||
            fctx->record[i].isvalid == false)
        {
            nulls[1] = true;
            nulls[2] = true;
            nulls[3] = true;
            nulls[4] = true;
            nulls[5] = true;
            nulls[6] = true;
            nulls[7] = true;
            /* unused for v1.0 callers, but the array is always long enough */
            nulls[8] = true;
        }
        else
        {
            values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
            nulls[1] = false;
            values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
            nulls[2] = false;
            values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
            nulls[3] = false;
            values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
            nulls[4] = false;
            values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
            nulls[5] = false;
            values[6] = BoolGetDatum(fctx->record[i].isdirty);
            nulls[6] = false;
            values[7] = Int16GetDatum(fctx->record[i].usagecount);
            nulls[7] = false;
            /* unused for v1.0 callers, but the array is always long enough */
            values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
            nulls[8] = false;
        }

        /* Build and return the tuple. */
        tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
        result = HeapTupleGetDatum(tuple);

        SRF_RETURN_NEXT(funcctx, result);
    }
    else
        SRF_RETURN_DONE(funcctx);
}
     288             : 
     289             : /*
     290             :  * Inquire about NUMA memory mappings for shared buffers.
     291             :  *
     292             :  * Returns NUMA node ID for each memory page used by the buffer. Buffers may
     293             :  * be smaller or larger than OS memory pages. For each buffer we return one
     294             :  * entry for each memory page used by the buffer (if the buffer is smaller,
     295             :  * it only uses a part of one memory page).
     296             :  *
     297             :  * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
     298             :  * one is always a multiple of the other.
     299             :  *
     300             :  * In order to get reliable results we also need to touch memory pages, so
     301             :  * that the inquiry about NUMA memory node doesn't return -2 (which indicates
     302             :  * unmapped/unallocated pages).
     303             :  */
     304             : Datum
     305           0 : pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
     306             : {
     307             :     FuncCallContext *funcctx;
     308             :     MemoryContext oldcontext;
     309             :     BufferCacheNumaContext *fctx;   /* User function context. */
     310             :     TupleDesc   tupledesc;
     311             :     TupleDesc   expected_tupledesc;
     312             :     HeapTuple   tuple;
     313             :     Datum       result;
     314             : 
     315           0 :     if (SRF_IS_FIRSTCALL())
     316             :     {
     317             :         int         i,
     318             :                     idx;
     319             :         Size        os_page_size;
     320             :         void      **os_page_ptrs;
     321             :         int        *os_page_status;
     322             :         uint64      os_page_count;
     323             :         int         pages_per_buffer;
     324             :         int         max_entries;
     325             :         char       *startptr,
     326             :                    *endptr;
     327             : 
     328           0 :         if (pg_numa_init() == -1)
     329           0 :             elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
     330             : 
     331             :         /*
     332             :          * The database block size and OS memory page size are unlikely to be
     333             :          * the same. The block size is 1-32KB, the memory page size depends on
     334             :          * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
     335             :          * there are also features like THP etc. Moreover, we don't quite know
     336             :          * how the pages and buffers "align" in memory - the buffers may be
     337             :          * shifted in some way, using more memory pages than necessary.
     338             :          *
     339             :          * So we need to be careful about mapping buffers to memory pages. We
     340             :          * calculate the maximum number of pages a buffer might use, so that
     341             :          * we allocate enough space for the entries. And then we count the
     342             :          * actual number of entries as we scan the buffers.
     343             :          *
     344             :          * This information is needed before calling move_pages() for NUMA
     345             :          * node id inquiry.
     346             :          */
     347           0 :         os_page_size = pg_get_shmem_pagesize();
     348             : 
     349             :         /*
     350             :          * The pages and block size is expected to be 2^k, so one divides the
     351             :          * other (we don't know in which direction). This does not say
     352             :          * anything about relative alignment of pages/buffers.
     353             :          */
     354             :         Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));
     355             : 
     356             :         /*
     357             :          * How many addresses we are going to query? Simply get the page for
     358             :          * the first buffer, and first page after the last buffer, and count
     359             :          * the pages from that.
     360             :          */
     361           0 :         startptr = (char *) TYPEALIGN_DOWN(os_page_size,
     362             :                                            BufferGetBlock(1));
     363           0 :         endptr = (char *) TYPEALIGN(os_page_size,
     364             :                                     (char *) BufferGetBlock(NBuffers) + BLCKSZ);
     365           0 :         os_page_count = (endptr - startptr) / os_page_size;
     366             : 
     367             :         /* Used to determine the NUMA node for all OS pages at once */
     368           0 :         os_page_ptrs = palloc0(sizeof(void *) * os_page_count);
     369           0 :         os_page_status = palloc(sizeof(uint64) * os_page_count);
     370             : 
     371             :         /* Fill pointers for all the memory pages. */
     372           0 :         idx = 0;
     373           0 :         for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
     374             :         {
     375           0 :             os_page_ptrs[idx++] = ptr;
     376             : 
     377             :             /* Only need to touch memory once per backend process lifetime */
     378           0 :             if (firstNumaTouch)
     379             :                 pg_numa_touch_mem_if_required(ptr);
     380             :         }
     381             : 
     382             :         Assert(idx == os_page_count);
     383             : 
     384           0 :         elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
     385             :              "os_page_size=%zu", NBuffers, os_page_count, os_page_size);
     386             : 
     387             :         /*
     388             :          * If we ever get 0xff back from kernel inquiry, then we probably have
     389             :          * bug in our buffers to OS page mapping code here.
     390             :          */
     391           0 :         memset(os_page_status, 0xff, sizeof(int) * os_page_count);
     392             : 
     393             :         /* Query NUMA status for all the pointers */
     394           0 :         if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
     395           0 :             elog(ERROR, "failed NUMA pages inquiry: %m");
     396             : 
     397             :         /* Initialize the multi-call context, load entries about buffers */
     398             : 
     399           0 :         funcctx = SRF_FIRSTCALL_INIT();
     400             : 
     401             :         /* Switch context when allocating stuff to be used in later calls */
     402           0 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     403             : 
     404             :         /* Create a user function context for cross-call persistence */
     405           0 :         fctx = (BufferCacheNumaContext *) palloc(sizeof(BufferCacheNumaContext));
     406             : 
     407           0 :         if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
     408           0 :             elog(ERROR, "return type must be a row type");
     409             : 
     410           0 :         if (expected_tupledesc->natts != NUM_BUFFERCACHE_NUMA_ELEM)
     411           0 :             elog(ERROR, "incorrect number of output arguments");
     412             : 
     413             :         /* Construct a tuple descriptor for the result rows. */
     414           0 :         tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
     415           0 :         TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
     416             :                            INT4OID, -1, 0);
     417           0 :         TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
     418             :                            INT8OID, -1, 0);
     419           0 :         TupleDescInitEntry(tupledesc, (AttrNumber) 3, "numa_node",
     420             :                            INT4OID, -1, 0);
     421             : 
     422           0 :         fctx->tupdesc = BlessTupleDesc(tupledesc);
     423             : 
     424             :         /*
     425             :          * Each buffer needs at least one entry, but it might be offset in
     426             :          * some way, and use one extra entry. So we allocate space for the
     427             :          * maximum number of entries we might need, and then count the exact
     428             :          * number as we're walking buffers. That way we can do it in one pass,
     429             :          * without reallocating memory.
     430             :          */
     431           0 :         pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
     432           0 :         max_entries = NBuffers * pages_per_buffer;
     433             : 
     434             :         /* Allocate entries for BufferCachePagesRec records. */
     435           0 :         fctx->record = (BufferCacheNumaRec *)
     436           0 :             MemoryContextAllocHuge(CurrentMemoryContext,
     437             :                                    sizeof(BufferCacheNumaRec) * max_entries);
     438             : 
     439             :         /* Return to original context when allocating transient memory */
     440           0 :         MemoryContextSwitchTo(oldcontext);
     441             : 
     442           0 :         if (firstNumaTouch)
     443           0 :             elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
     444             : 
     445             :         /*
     446             :          * Scan through all the buffers, saving the relevant fields in the
     447             :          * fctx->record structure.
     448             :          *
     449             :          * We don't hold the partition locks, so we don't get a consistent
     450             :          * snapshot across all buffers, but we do grab the buffer header
     451             :          * locks, so the information of each buffer is self-consistent.
     452             :          *
     453             :          * This loop touches and stores addresses into os_page_ptrs[] as input
     454             :          * to one big move_pages(2) inquiry system call. Basically we ask for
     455             :          * all memory pages for NBuffers.
     456             :          */
     457           0 :         startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
     458           0 :         idx = 0;
     459           0 :         for (i = 0; i < NBuffers; i++)
     460             :         {
     461           0 :             char       *buffptr = (char *) BufferGetBlock(i + 1);
     462             :             BufferDesc *bufHdr;
     463             :             uint32      buf_state;
     464             :             uint32      bufferid;
     465             :             int32       page_num;
     466             :             char       *startptr_buff,
     467             :                        *endptr_buff;
     468             : 
     469           0 :             CHECK_FOR_INTERRUPTS();
     470             : 
     471           0 :             bufHdr = GetBufferDescriptor(i);
     472             : 
     473             :             /* Lock each buffer header before inspecting. */
     474           0 :             buf_state = LockBufHdr(bufHdr);
     475           0 :             bufferid = BufferDescriptorGetBuffer(bufHdr);
     476           0 :             UnlockBufHdr(bufHdr, buf_state);
     477             : 
     478             :             /* start of the first page of this buffer */
     479           0 :             startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
     480             : 
     481             :             /* end of the buffer (no need to align to memory page) */
     482           0 :             endptr_buff = buffptr + BLCKSZ;
     483             : 
     484             :             Assert(startptr_buff < endptr_buff);
     485             : 
     486             :             /* calculate ID of the first page for this buffer */
     487           0 :             page_num = (startptr_buff - startptr) / os_page_size;
     488             : 
     489             :             /* Add an entry for each OS page overlapping with this buffer. */
     490           0 :             for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
     491             :             {
     492           0 :                 fctx->record[idx].bufferid = bufferid;
     493           0 :                 fctx->record[idx].page_num = page_num;
     494           0 :                 fctx->record[idx].numa_node = os_page_status[page_num];
     495             : 
     496             :                 /* advance to the next entry/page */
     497           0 :                 ++idx;
     498           0 :                 ++page_num;
     499             :             }
     500             :         }
     501             : 
     502             :         Assert((idx >= os_page_count) && (idx <= max_entries));
     503             : 
     504             :         /* Set max calls and remember the user function context. */
     505           0 :         funcctx->max_calls = idx;
     506           0 :         funcctx->user_fctx = fctx;
     507             : 
     508             :         /* Remember this backend touched the pages */
     509           0 :         firstNumaTouch = false;
     510             :     }
     511             : 
     512           0 :     funcctx = SRF_PERCALL_SETUP();
     513             : 
     514             :     /* Get the saved state */
     515           0 :     fctx = funcctx->user_fctx;
     516             : 
     517           0 :     if (funcctx->call_cntr < funcctx->max_calls)
     518             :     {
     519           0 :         uint32      i = funcctx->call_cntr;
     520             :         Datum       values[NUM_BUFFERCACHE_NUMA_ELEM];
     521             :         bool        nulls[NUM_BUFFERCACHE_NUMA_ELEM];
     522             : 
     523           0 :         values[0] = Int32GetDatum(fctx->record[i].bufferid);
     524           0 :         nulls[0] = false;
     525             : 
     526           0 :         values[1] = Int64GetDatum(fctx->record[i].page_num);
     527           0 :         nulls[1] = false;
     528             : 
     529           0 :         values[2] = Int32GetDatum(fctx->record[i].numa_node);
     530           0 :         nulls[2] = false;
     531             : 
     532             :         /* Build and return the tuple. */
     533           0 :         tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
     534           0 :         result = HeapTupleGetDatum(tuple);
     535             : 
     536           0 :         SRF_RETURN_NEXT(funcctx, result);
     537             :     }
     538             :     else
     539           0 :         SRF_RETURN_DONE(funcctx);
     540             : }
     541             : 
     542             : Datum
     543           4 : pg_buffercache_summary(PG_FUNCTION_ARGS)
     544             : {
     545             :     Datum       result;
     546             :     TupleDesc   tupledesc;
     547             :     HeapTuple   tuple;
     548             :     Datum       values[NUM_BUFFERCACHE_SUMMARY_ELEM];
     549             :     bool        nulls[NUM_BUFFERCACHE_SUMMARY_ELEM];
     550             : 
     551           4 :     int32       buffers_used = 0;
     552           4 :     int32       buffers_unused = 0;
     553           4 :     int32       buffers_dirty = 0;
     554           4 :     int32       buffers_pinned = 0;
     555           4 :     int64       usagecount_total = 0;
     556             : 
     557           4 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     558           0 :         elog(ERROR, "return type must be a row type");
     559             : 
     560       65540 :     for (int i = 0; i < NBuffers; i++)
     561             :     {
     562             :         BufferDesc *bufHdr;
     563             :         uint32      buf_state;
     564             : 
     565       65536 :         CHECK_FOR_INTERRUPTS();
     566             : 
     567             :         /*
     568             :          * This function summarizes the state of all headers. Locking the
     569             :          * buffer headers wouldn't provide an improved result as the state of
     570             :          * the buffer can still change after we release the lock and it'd
     571             :          * noticeably increase the cost of the function.
     572             :          */
     573       65536 :         bufHdr = GetBufferDescriptor(i);
     574       65536 :         buf_state = pg_atomic_read_u32(&bufHdr->state);
     575             : 
     576       65536 :         if (buf_state & BM_VALID)
     577             :         {
     578        7976 :             buffers_used++;
     579        7976 :             usagecount_total += BUF_STATE_GET_USAGECOUNT(buf_state);
     580             : 
     581        7976 :             if (buf_state & BM_DIRTY)
     582        3788 :                 buffers_dirty++;
     583             :         }
     584             :         else
     585       57560 :             buffers_unused++;
     586             : 
     587       65536 :         if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
     588           0 :             buffers_pinned++;
     589             :     }
     590             : 
     591           4 :     memset(nulls, 0, sizeof(nulls));
     592           4 :     values[0] = Int32GetDatum(buffers_used);
     593           4 :     values[1] = Int32GetDatum(buffers_unused);
     594           4 :     values[2] = Int32GetDatum(buffers_dirty);
     595           4 :     values[3] = Int32GetDatum(buffers_pinned);
     596             : 
     597           4 :     if (buffers_used != 0)
     598           4 :         values[4] = Float8GetDatum((double) usagecount_total / buffers_used);
     599             :     else
     600           0 :         nulls[4] = true;
     601             : 
     602             :     /* Build and return the tuple. */
     603           4 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     604           4 :     result = HeapTupleGetDatum(tuple);
     605             : 
     606           4 :     PG_RETURN_DATUM(result);
     607             : }
     608             : 
     609             : Datum
     610           4 : pg_buffercache_usage_counts(PG_FUNCTION_ARGS)
     611             : {
     612           4 :     ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
     613           4 :     int         usage_counts[BM_MAX_USAGE_COUNT + 1] = {0};
     614           4 :     int         dirty[BM_MAX_USAGE_COUNT + 1] = {0};
     615           4 :     int         pinned[BM_MAX_USAGE_COUNT + 1] = {0};
     616             :     Datum       values[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM];
     617           4 :     bool        nulls[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM] = {0};
     618             : 
     619           4 :     InitMaterializedSRF(fcinfo, 0);
     620             : 
     621       65540 :     for (int i = 0; i < NBuffers; i++)
     622             :     {
     623       65536 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
     624       65536 :         uint32      buf_state = pg_atomic_read_u32(&bufHdr->state);
     625             :         int         usage_count;
     626             : 
     627       65536 :         CHECK_FOR_INTERRUPTS();
     628             : 
     629       65536 :         usage_count = BUF_STATE_GET_USAGECOUNT(buf_state);
     630       65536 :         usage_counts[usage_count]++;
     631             : 
     632       65536 :         if (buf_state & BM_DIRTY)
     633        3788 :             dirty[usage_count]++;
     634             : 
     635       65536 :         if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
     636           0 :             pinned[usage_count]++;
     637             :     }
     638             : 
     639          28 :     for (int i = 0; i < BM_MAX_USAGE_COUNT + 1; i++)
     640             :     {
     641          24 :         values[0] = Int32GetDatum(i);
     642          24 :         values[1] = Int32GetDatum(usage_counts[i]);
     643          24 :         values[2] = Int32GetDatum(dirty[i]);
     644          24 :         values[3] = Int32GetDatum(pinned[i]);
     645             : 
     646          24 :         tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
     647             :     }
     648             : 
     649           4 :     return (Datum) 0;
     650             : }
     651             : 
     652             : /*
     653             :  * Helper function to check if the user has superuser privileges.
     654             :  */
     655             : static void
     656          20 : pg_buffercache_superuser_check(char *func_name)
     657             : {
     658          20 :     if (!superuser())
     659           6 :         ereport(ERROR,
     660             :                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
     661             :                  errmsg("must be superuser to use %s()",
     662             :                         func_name)));
     663          14 : }
     664             : 
     665             : /*
     666             :  * Try to evict a shared buffer.
     667             :  */
     668             : Datum
     669          10 : pg_buffercache_evict(PG_FUNCTION_ARGS)
     670             : {
     671             :     Datum       result;
     672             :     TupleDesc   tupledesc;
     673             :     HeapTuple   tuple;
     674             :     Datum       values[NUM_BUFFERCACHE_EVICT_ELEM];
     675          10 :     bool        nulls[NUM_BUFFERCACHE_EVICT_ELEM] = {0};
     676             : 
     677          10 :     Buffer      buf = PG_GETARG_INT32(0);
     678             :     bool        buffer_flushed;
     679             : 
     680          10 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     681           0 :         elog(ERROR, "return type must be a row type");
     682             : 
     683          10 :     pg_buffercache_superuser_check("pg_buffercache_evict");
     684             : 
     685           8 :     if (buf < 1 || buf > NBuffers)
     686           6 :         elog(ERROR, "bad buffer ID: %d", buf);
     687             : 
     688           2 :     values[0] = BoolGetDatum(EvictUnpinnedBuffer(buf, &buffer_flushed));
     689           2 :     values[1] = BoolGetDatum(buffer_flushed);
     690             : 
     691           2 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     692           2 :     result = HeapTupleGetDatum(tuple);
     693             : 
     694           2 :     PG_RETURN_DATUM(result);
     695             : }
     696             : 
     697             : /*
     698             :  * Try to evict specified relation.
     699             :  */
     700             : Datum
     701           6 : pg_buffercache_evict_relation(PG_FUNCTION_ARGS)
     702             : {
     703             :     Datum       result;
     704             :     TupleDesc   tupledesc;
     705             :     HeapTuple   tuple;
     706             :     Datum       values[NUM_BUFFERCACHE_EVICT_RELATION_ELEM];
     707           6 :     bool        nulls[NUM_BUFFERCACHE_EVICT_RELATION_ELEM] = {0};
     708             : 
     709             :     Oid         relOid;
     710             :     Relation    rel;
     711             : 
     712           6 :     int32       buffers_evicted = 0;
     713           6 :     int32       buffers_flushed = 0;
     714           6 :     int32       buffers_skipped = 0;
     715             : 
     716           6 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     717           0 :         elog(ERROR, "return type must be a row type");
     718             : 
     719           6 :     pg_buffercache_superuser_check("pg_buffercache_evict_relation");
     720             : 
     721           4 :     relOid = PG_GETARG_OID(0);
     722             : 
     723           4 :     rel = relation_open(relOid, AccessShareLock);
     724             : 
     725           4 :     if (RelationUsesLocalBuffers(rel))
     726           2 :         ereport(ERROR,
     727             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     728             :                  errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
     729             :                         "pg_buffercache_evict_relation")));
     730             : 
     731           2 :     EvictRelUnpinnedBuffers(rel, &buffers_evicted, &buffers_flushed,
     732             :                             &buffers_skipped);
     733             : 
     734           2 :     relation_close(rel, AccessShareLock);
     735             : 
     736           2 :     values[0] = Int32GetDatum(buffers_evicted);
     737           2 :     values[1] = Int32GetDatum(buffers_flushed);
     738           2 :     values[2] = Int32GetDatum(buffers_skipped);
     739             : 
     740           2 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     741           2 :     result = HeapTupleGetDatum(tuple);
     742             : 
     743           2 :     PG_RETURN_DATUM(result);
     744             : }
     745             : 
     746             : 
     747             : /*
     748             :  * Try to evict all shared buffers.
     749             :  */
     750             : Datum
     751           4 : pg_buffercache_evict_all(PG_FUNCTION_ARGS)
     752             : {
     753             :     Datum       result;
     754             :     TupleDesc   tupledesc;
     755             :     HeapTuple   tuple;
     756             :     Datum       values[NUM_BUFFERCACHE_EVICT_ALL_ELEM];
     757           4 :     bool        nulls[NUM_BUFFERCACHE_EVICT_ALL_ELEM] = {0};
     758             : 
     759           4 :     int32       buffers_evicted = 0;
     760           4 :     int32       buffers_flushed = 0;
     761           4 :     int32       buffers_skipped = 0;
     762             : 
     763           4 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     764           0 :         elog(ERROR, "return type must be a row type");
     765             : 
     766           4 :     pg_buffercache_superuser_check("pg_buffercache_evict_all");
     767             : 
     768           2 :     EvictAllUnpinnedBuffers(&buffers_evicted, &buffers_flushed,
     769             :                             &buffers_skipped);
     770             : 
     771           2 :     values[0] = Int32GetDatum(buffers_evicted);
     772           2 :     values[1] = Int32GetDatum(buffers_flushed);
     773           2 :     values[2] = Int32GetDatum(buffers_skipped);
     774             : 
     775           2 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     776           2 :     result = HeapTupleGetDatum(tuple);
     777             : 
     778           2 :     PG_RETURN_DATUM(result);
     779             : }

Generated by: LCOV version 1.16