LCOV - code coverage report
Current view: top level - contrib/pg_buffercache - pg_buffercache_pages.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19beta1 Lines: 88.5 % 331 293
Test Date: 2026-06-15 18:16:44 Functions: 96.2 % 26 25
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * pg_buffercache_pages.c
       4              :  *    display some contents of the buffer cache
       5              :  *
       6              :  *    contrib/pg_buffercache/pg_buffercache_pages.c
       7              :  *-------------------------------------------------------------------------
       8              :  */
       9              : #include "postgres.h"
      10              : 
      11              : #include "access/htup_details.h"
      12              : #include "access/relation.h"
      13              : #include "catalog/pg_type.h"
      14              : #include "funcapi.h"
      15              : #include "port/pg_numa.h"
      16              : #include "storage/buf_internals.h"
      17              : #include "storage/bufmgr.h"
      18              : #include "utils/rel.h"
      19              : #include "utils/tuplestore.h"
      20              : 
      21              : 
      22              : #define NUM_BUFFERCACHE_PAGES_MIN_ELEM  8
      23              : #define NUM_BUFFERCACHE_PAGES_ELEM  9
      24              : #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
      25              : #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
      26              : #define NUM_BUFFERCACHE_EVICT_ELEM 2
      27              : #define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3
      28              : #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
      29              : #define NUM_BUFFERCACHE_MARK_DIRTY_ELEM 2
      30              : #define NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM 3
      31              : #define NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM 3
      32              : 
      33              : #define NUM_BUFFERCACHE_OS_PAGES_ELEM   3
      34              : 
/*
 * Module magic block for this library.  The extended form used here also
 * records the module's name ("pg_buffercache") and its version (PG_VERSION).
 */
PG_MODULE_MAGIC_EXT(
                    .name = "pg_buffercache",
                    .version = PG_VERSION
);
      39              : 
/*
 * Record structure holding the to-be-exposed cache data for OS pages.  This
 * structure is used by pg_buffercache_os_pages(), where NUMA information may
 * or may not be included.
 *
 * One record is built for every (buffer, OS page) pair, so a buffer that
 * overlaps several OS memory pages produces several records.
 */
typedef struct
{
    uint32      bufferid;       /* buffer ID, as given by
                                 * BufferDescriptorGetBuffer() */
    int64       page_num;       /* OS page index, counted from the OS page
                                 * that contains the first shared buffer */
    int32       numa_node;      /* status reported by the NUMA inquiry for
                                 * this OS page, or -1 when NUMA information
                                 * was not requested */
} BufferCacheOsPagesRec;
      51              : 
/*
 * Function context for data persisting over repeated calls.
 *
 * It is allocated in the SRF's multi-call memory context during the first
 * call and fetched back through funcctx->user_fctx on every later call.
 */
typedef struct
{
    TupleDesc   tupdesc;        /* blessed descriptor used to form result rows */
    bool        include_numa;   /* was NUMA information requested? */
    BufferCacheOsPagesRec *record;  /* array of collected entries, one per
                                     * result row */
} BufferCacheOsPagesContext;
      61              : 
/* Builds the row descriptor (natts columns) used by pg_buffercache_pages(). */
static TupleDesc build_buffercache_pages_tupledesc(int natts);
      63              : 
      64              : 
/*
 * PG_FUNCTION_INFO_V1 declarations for the fmgr-callable entry points of this
 * module.
 *
 * pg_buffercache_pages() is the function returning data from the shared
 * buffer cache - buffer number, relation node/tablespace/database/blocknum
 * and dirty indicator.
 */
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_summary);
PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
PG_FUNCTION_INFO_V1(pg_buffercache_evict);
PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation);
PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty);
PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_relation);
PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_all);
      80              : 
      81              : 
/*
 * Only need to touch memory once per backend process lifetime.
 *
 * The flag starts out true.  While it is set,
 * pg_buffercache_os_pages_internal() touches every OS page it is about to
 * query; the flag is cleared at the end of that function's first-call setup
 * when NUMA information was requested.
 */
static bool firstNumaTouch = true;
      84              : 
      85              : 
      86              : Datum
      87            3 : pg_buffercache_pages(PG_FUNCTION_ARGS)
      88              : {
      89            3 :     ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
      90              :     TupleDesc   expected_tupledesc;
      91              :     TupleDesc   actual_tupledesc;
      92              :     MemoryContext oldcontext;
      93              :     int         i;
      94              : 
      95              :     /*
      96              :      * To smoothly support upgrades from version 1.0 of this extension
      97              :      * transparently handle the (non-)existence of the pinning_backends
      98              :      * column. We unfortunately have to get the result type for that... - we
      99              :      * can't use the result type determined by the function definition without
     100              :      * potentially crashing when somebody uses the old (or even wrong)
     101              :      * function definition though.
     102              :      */
     103            3 :     if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
     104            0 :         elog(ERROR, "return type must be a row type");
     105              : 
     106            3 :     if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
     107            3 :         expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
     108            0 :         elog(ERROR, "incorrect number of output arguments");
     109              : 
     110            3 :     InitMaterializedSRF(fcinfo, 0);
     111              : 
     112            3 :     oldcontext = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory);
     113            3 :     actual_tupledesc = build_buffercache_pages_tupledesc(expected_tupledesc->natts);
     114            3 :     MemoryContextSwitchTo(oldcontext);
     115              : 
     116              :     /*
     117              :      * Override the caller-supplied descriptor with the tuple descriptor that
     118              :      * matches the values we actually return, so executor-side
     119              :      * tupledesc_match() can verify the caller's row definition.
     120              :      *
     121              :      * Do not free the previous rsinfo->setDesc here: for RECORD results it
     122              :      * can alias rsinfo->expectedDesc, which the executor still needs to
     123              :      * reference.
     124              :      */
     125            3 :     rsinfo->setDesc = actual_tupledesc;
     126              : 
     127              :     /*
     128              :      * Scan through all the buffers, adding one row for each of the buffers to
     129              :      * the tuplestore.
     130              :      *
     131              :      * We don't hold the partition locks, so we don't get a consistent
     132              :      * snapshot across all buffers, but we do grab the buffer header locks, so
     133              :      * the information of each buffer is self-consistent.
     134              :      */
     135        49155 :     for (i = 0; i < NBuffers; i++)
     136              :     {
     137              :         BufferDesc *bufHdr;
     138              :         uint64      buf_state;
     139              :         uint32      bufferid;
     140              :         RelFileNumber relfilenumber;
     141              :         Oid         reltablespace;
     142              :         Oid         reldatabase;
     143              :         ForkNumber  forknum;
     144              :         BlockNumber blocknum;
     145              :         bool        isvalid;
     146              :         bool        isdirty;
     147              :         uint16      usagecount;
     148              :         int32       pinning_backends;
     149              :         Datum       values[NUM_BUFFERCACHE_PAGES_ELEM];
     150              :         bool        nulls[NUM_BUFFERCACHE_PAGES_ELEM];
     151              : 
     152        49152 :         CHECK_FOR_INTERRUPTS();
     153              : 
     154        49152 :         bufHdr = GetBufferDescriptor(i);
     155              :         /* Lock each buffer header before inspecting. */
     156        49152 :         buf_state = LockBufHdr(bufHdr);
     157              : 
     158        49152 :         bufferid = BufferDescriptorGetBuffer(bufHdr);
     159        49152 :         relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
     160        49152 :         reltablespace = bufHdr->tag.spcOid;
     161        49152 :         reldatabase = bufHdr->tag.dbOid;
     162        49152 :         forknum = BufTagGetForkNum(&bufHdr->tag);
     163        49152 :         blocknum = bufHdr->tag.blockNum;
     164        49152 :         usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
     165        49152 :         pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
     166              : 
     167        49152 :         if (buf_state & BM_DIRTY)
     168         2925 :             isdirty = true;
     169              :         else
     170        46227 :             isdirty = false;
     171              : 
     172              :         /* Note if the buffer is valid, and has storage created */
     173        49152 :         if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
     174         6070 :             isvalid = true;
     175              :         else
     176        43082 :             isvalid = false;
     177              : 
     178        49152 :         UnlockBufHdr(bufHdr);
     179              : 
     180              :         /* Build the tuple and add it to tuplestore */
     181        49152 :         values[0] = Int32GetDatum(bufferid);
     182        49152 :         nulls[0] = false;
     183              : 
     184              :         /*
     185              :          * Set all fields except the bufferid to null if the buffer is unused
     186              :          * or not valid.
     187              :          */
     188        49152 :         if (blocknum == InvalidBlockNumber || isvalid == false)
     189              :         {
     190        43082 :             nulls[1] = true;
     191        43082 :             nulls[2] = true;
     192        43082 :             nulls[3] = true;
     193        43082 :             nulls[4] = true;
     194        43082 :             nulls[5] = true;
     195        43082 :             nulls[6] = true;
     196        43082 :             nulls[7] = true;
     197              :             /* unused for v1.0 callers, but the array is always long enough */
     198        43082 :             nulls[8] = true;
     199              :         }
     200              :         else
     201              :         {
     202         6070 :             values[1] = ObjectIdGetDatum(relfilenumber);
     203         6070 :             nulls[1] = false;
     204         6070 :             values[2] = ObjectIdGetDatum(reltablespace);
     205         6070 :             nulls[2] = false;
     206         6070 :             values[3] = ObjectIdGetDatum(reldatabase);
     207         6070 :             nulls[3] = false;
     208         6070 :             values[4] = Int16GetDatum(forknum);
     209         6070 :             nulls[4] = false;
     210         6070 :             values[5] = Int64GetDatum((int64) blocknum);
     211         6070 :             nulls[5] = false;
     212         6070 :             values[6] = BoolGetDatum(isdirty);
     213         6070 :             nulls[6] = false;
     214         6070 :             values[7] = Int16GetDatum(usagecount);
     215         6070 :             nulls[7] = false;
     216              :             /* unused for v1.0 callers, but the array is always long enough */
     217         6070 :             values[8] = Int32GetDatum(pinning_backends);
     218         6070 :             nulls[8] = false;
     219              :         }
     220              : 
     221        49152 :         tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
     222              :     }
     223              : 
     224            3 :     return (Datum) 0;
     225              : }
     226              : 
     227              : static TupleDesc
     228            3 : build_buffercache_pages_tupledesc(int natts)
     229              : {
     230              :     TupleDesc   tupledesc;
     231              : 
     232            3 :     tupledesc = CreateTemplateTupleDesc(natts);
     233            3 :     TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
     234              :                        INT4OID, -1, 0);
     235            3 :     TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
     236              :                        OIDOID, -1, 0);
     237            3 :     TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
     238              :                        OIDOID, -1, 0);
     239            3 :     TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
     240              :                        OIDOID, -1, 0);
     241            3 :     TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
     242              :                        INT2OID, -1, 0);
     243            3 :     TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
     244              :                        INT8OID, -1, 0);
     245            3 :     TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
     246              :                        BOOLOID, -1, 0);
     247            3 :     TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usagecount",
     248              :                        INT2OID, -1, 0);
     249              : 
     250            3 :     if (natts == NUM_BUFFERCACHE_PAGES_ELEM)
     251            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
     252              :                            INT4OID, -1, 0);
     253              : 
     254            3 :     TupleDescFinalize(tupledesc);
     255              : 
     256            3 :     return BlessTupleDesc(tupledesc);
     257              : }
     258              : 
     259              : /*
     260              :  * Inquire about OS pages mappings for shared buffers, with NUMA information,
     261              :  * optionally.
     262              :  *
     263              :  * When "include_numa" is false, this routines ignores everything related
     264              :  * to NUMA (returned as NULL values), returning mapping information between
     265              :  * shared buffers and OS pages.
     266              :  *
     267              :  * When "include_numa" is true, NUMA is initialized and numa_node values
     268              :  * are generated.  In order to get reliable results we also need to touch
     269              :  * memory pages, so that the inquiry about NUMA memory node does not return
     270              :  * -2, indicating unmapped/unallocated pages.
     271              :  *
     272              :  * Buffers may be smaller or larger than OS memory pages. For each buffer we
     273              :  * return one entry for each memory page used by the buffer (if the buffer is
     274              :  * smaller, it only uses a part of one memory page).
     275              :  *
     276              :  * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
     277              :  * one is always a multiple of the other.
     278              :  *
     279              :  */
     280              : static Datum
     281        65538 : pg_buffercache_os_pages_internal(FunctionCallInfo fcinfo, bool include_numa)
     282              : {
     283              :     FuncCallContext *funcctx;
     284              :     MemoryContext oldcontext;
     285              :     BufferCacheOsPagesContext *fctx;    /* User function context. */
     286              :     TupleDesc   tupledesc;
     287              :     TupleDesc   expected_tupledesc;
     288              :     HeapTuple   tuple;
     289              :     Datum       result;
     290              : 
     291        65538 :     if (SRF_IS_FIRSTCALL())
     292              :     {
     293              :         int         i,
     294              :                     idx;
     295              :         Size        os_page_size;
     296              :         int         pages_per_buffer;
     297            2 :         int        *os_page_status = NULL;
     298            2 :         uint64      os_page_count = 0;
     299              :         int         max_entries;
     300              :         char       *startptr,
     301              :                    *endptr;
     302              : 
     303              :         /* If NUMA information is requested, initialize NUMA support. */
     304            2 :         if (include_numa && pg_numa_init() == -1)
     305            0 :             elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
     306              : 
     307              :         /*
     308              :          * The database block size and OS memory page size are unlikely to be
     309              :          * the same. The block size is 1-32KB, the memory page size depends on
     310              :          * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
     311              :          * there are also features like THP etc. Moreover, we don't quite know
     312              :          * how the pages and buffers "align" in memory - the buffers may be
     313              :          * shifted in some way, using more memory pages than necessary.
     314              :          *
     315              :          * So we need to be careful about mapping buffers to memory pages. We
     316              :          * calculate the maximum number of pages a buffer might use, so that
     317              :          * we allocate enough space for the entries. And then we count the
     318              :          * actual number of entries as we scan the buffers.
     319              :          *
     320              :          * This information is needed before calling move_pages() for NUMA
     321              :          * node id inquiry.
     322              :          */
     323            2 :         os_page_size = pg_get_shmem_pagesize();
     324              : 
     325              :         /*
     326              :          * The pages and block size is expected to be 2^k, so one divides the
     327              :          * other (we don't know in which direction). This does not say
     328              :          * anything about relative alignment of pages/buffers.
     329              :          */
     330              :         Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));
     331              : 
     332            2 :         if (include_numa)
     333              :         {
     334            0 :             void      **os_page_ptrs = NULL;
     335              : 
     336              :             /*
     337              :              * How many addresses we are going to query?  Simply get the page
     338              :              * for the first buffer, and first page after the last buffer, and
     339              :              * count the pages from that.
     340              :              */
     341            0 :             startptr = (char *) TYPEALIGN_DOWN(os_page_size,
     342              :                                                BufferGetBlock(1));
     343            0 :             endptr = (char *) TYPEALIGN(os_page_size,
     344              :                                         (char *) BufferGetBlock(NBuffers) + BLCKSZ);
     345            0 :             os_page_count = (endptr - startptr) / os_page_size;
     346              : 
     347              :             /* Used to determine the NUMA node for all OS pages at once */
     348            0 :             os_page_ptrs = palloc0_array(void *, os_page_count);
     349            0 :             os_page_status = palloc_array(int, os_page_count);
     350              : 
     351              :             /*
     352              :              * Fill pointers for all the memory pages.  This loop stores and
     353              :              * touches (if needed) addresses into os_page_ptrs[] as input to
     354              :              * one big move_pages(2) inquiry system call, as done in
     355              :              * pg_numa_query_pages().
     356              :              */
     357            0 :             idx = 0;
     358            0 :             for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
     359              :             {
     360            0 :                 os_page_ptrs[idx++] = ptr;
     361              : 
     362              :                 /* Only need to touch memory once per backend process lifetime */
     363            0 :                 if (firstNumaTouch)
     364              :                     pg_numa_touch_mem_if_required(ptr);
     365              :             }
     366              : 
     367              :             Assert(idx == os_page_count);
     368              : 
     369            0 :             elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
     370              :                  "os_page_size=%zu", NBuffers, os_page_count, os_page_size);
     371              : 
     372              :             /*
     373              :              * If we ever get 0xff back from kernel inquiry, then we probably
     374              :              * have bug in our buffers to OS page mapping code here.
     375              :              */
     376            0 :             memset(os_page_status, 0xff, sizeof(int) * os_page_count);
     377              : 
     378              :             /* Query NUMA status for all the pointers */
     379            0 :             if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
     380            0 :                 elog(ERROR, "failed NUMA pages inquiry: %m");
     381              :         }
     382              : 
     383              :         /* Initialize the multi-call context, load entries about buffers */
     384              : 
     385            2 :         funcctx = SRF_FIRSTCALL_INIT();
     386              : 
     387              :         /* Switch context when allocating stuff to be used in later calls */
     388            2 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     389              : 
     390              :         /* Create a user function context for cross-call persistence */
     391            2 :         fctx = palloc_object(BufferCacheOsPagesContext);
     392              : 
     393            2 :         if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
     394            0 :             elog(ERROR, "return type must be a row type");
     395              : 
     396            2 :         if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
     397            0 :             elog(ERROR, "incorrect number of output arguments");
     398              : 
     399              :         /* Construct a tuple descriptor for the result rows. */
     400            2 :         tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
     401            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
     402              :                            INT4OID, -1, 0);
     403            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
     404              :                            INT8OID, -1, 0);
     405            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 3, "numa_node",
     406              :                            INT4OID, -1, 0);
     407              : 
     408            2 :         TupleDescFinalize(tupledesc);
     409            2 :         fctx->tupdesc = BlessTupleDesc(tupledesc);
     410            2 :         fctx->include_numa = include_numa;
     411              : 
     412              :         /*
     413              :          * Each buffer needs at least one entry, but it might be offset in
     414              :          * some way, and use one extra entry. So we allocate space for the
     415              :          * maximum number of entries we might need, and then count the exact
     416              :          * number as we're walking buffers. That way we can do it in one pass,
     417              :          * without reallocating memory.
     418              :          */
     419            2 :         pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
     420            2 :         max_entries = NBuffers * pages_per_buffer;
     421              : 
     422              :         /* Allocate entries for BufferCacheOsPagesRec records. */
     423            2 :         fctx->record = (BufferCacheOsPagesRec *)
     424            2 :             MemoryContextAllocHuge(CurrentMemoryContext,
     425              :                                    sizeof(BufferCacheOsPagesRec) * max_entries);
     426              : 
     427              :         /* Return to original context when allocating transient memory */
     428            2 :         MemoryContextSwitchTo(oldcontext);
     429              : 
     430            2 :         if (include_numa && firstNumaTouch)
     431            0 :             elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
     432              : 
     433              :         /*
     434              :          * Scan through all the buffers, saving the relevant fields in the
     435              :          * fctx->record structure.
     436              :          *
     437              :          * We don't hold the partition locks, so we don't get a consistent
     438              :          * snapshot across all buffers, but we do grab the buffer header
     439              :          * locks, so the information of each buffer is self-consistent.
     440              :          */
     441            2 :         startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
     442            2 :         idx = 0;
     443        32770 :         for (i = 0; i < NBuffers; i++)
     444              :         {
     445        32768 :             char       *buffptr = (char *) BufferGetBlock(i + 1);
     446              :             BufferDesc *bufHdr;
     447              :             uint32      bufferid;
     448              :             int32       page_num;
     449              :             char       *startptr_buff,
     450              :                        *endptr_buff;
     451              : 
     452        32768 :             CHECK_FOR_INTERRUPTS();
     453              : 
     454        32768 :             bufHdr = GetBufferDescriptor(i);
     455              : 
     456              :             /* Lock each buffer header before inspecting. */
     457        32768 :             LockBufHdr(bufHdr);
     458        32768 :             bufferid = BufferDescriptorGetBuffer(bufHdr);
     459        32768 :             UnlockBufHdr(bufHdr);
     460              : 
     461              :             /* start of the first page of this buffer */
     462        32768 :             startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
     463              : 
     464              :             /* end of the buffer (no need to align to memory page) */
     465        32768 :             endptr_buff = buffptr + BLCKSZ;
     466              : 
     467              :             Assert(startptr_buff < endptr_buff);
     468              : 
     469              :             /* calculate ID of the first page for this buffer */
     470        32768 :             page_num = (startptr_buff - startptr) / os_page_size;
     471              : 
     472              :             /* Add an entry for each OS page overlapping with this buffer. */
     473        98304 :             for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
     474              :             {
     475        65536 :                 fctx->record[idx].bufferid = bufferid;
     476        65536 :                 fctx->record[idx].page_num = page_num;
     477        65536 :                 fctx->record[idx].numa_node = include_numa ? os_page_status[page_num] : -1;
     478              : 
     479              :                 /* advance to the next entry/page */
     480        65536 :                 ++idx;
     481        65536 :                 ++page_num;
     482              :             }
     483              :         }
     484              : 
     485              :         Assert(idx <= max_entries);
     486              : 
     487              :         if (include_numa)
     488              :             Assert(idx >= os_page_count);
     489              : 
     490              :         /* Set max calls and remember the user function context. */
     491            2 :         funcctx->max_calls = idx;
     492            2 :         funcctx->user_fctx = fctx;
     493              : 
     494              :         /* Remember this backend touched the pages (only relevant for NUMA) */
     495            2 :         if (include_numa)
     496            0 :             firstNumaTouch = false;
     497              :     }
     498              : 
     499        65538 :     funcctx = SRF_PERCALL_SETUP();
     500              : 
     501              :     /* Get the saved state */
     502        65538 :     fctx = funcctx->user_fctx;
     503              : 
     504        65538 :     if (funcctx->call_cntr < funcctx->max_calls)
     505              :     {
     506        65536 :         uint32      i = funcctx->call_cntr;
     507              :         Datum       values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
     508              :         bool        nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
     509              : 
     510        65536 :         values[0] = Int32GetDatum(fctx->record[i].bufferid);
     511        65536 :         nulls[0] = false;
     512              : 
     513        65536 :         values[1] = Int64GetDatum(fctx->record[i].page_num);
     514        65536 :         nulls[1] = false;
     515              : 
     516        65536 :         if (fctx->include_numa)
     517              :         {
     518              :             /* status is valid node number */
     519            0 :             if (fctx->record[i].numa_node >= 0)
     520              :             {
     521            0 :                 values[2] = Int32GetDatum(fctx->record[i].numa_node);
     522            0 :                 nulls[2] = false;
     523              :             }
     524              :             else
     525              :             {
     526              :                 /* some kind of error (e.g. pages moved to swap) */
     527            0 :                 values[2] = (Datum) 0;
     528            0 :                 nulls[2] = true;
     529              :             }
     530              :         }
     531              :         else
     532              :         {
     533        65536 :             values[2] = (Datum) 0;
     534        65536 :             nulls[2] = true;
     535              :         }
     536              : 
     537              :         /* Build and return the tuple. */
     538        65536 :         tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
     539        65536 :         result = HeapTupleGetDatum(tuple);
     540              : 
     541        65536 :         SRF_RETURN_NEXT(funcctx, result);
     542              :     }
     543              :     else
     544            2 :         SRF_RETURN_DONE(funcctx);
     545              : }
     546              : 
     547              : /*
     548              :  * pg_buffercache_os_pages
     549              :  *
     550              :  * Retrieve information about OS pages, with or without NUMA information.
     551              :  */
     552              : Datum
     553        65538 : pg_buffercache_os_pages(PG_FUNCTION_ARGS)
     554              : {
     555              :     bool        include_numa;
     556              : 
     557              :     /* Get the boolean parameter that controls the NUMA behavior. */
     558        65538 :     include_numa = PG_GETARG_BOOL(0);
     559              : 
     560        65538 :     return pg_buffercache_os_pages_internal(fcinfo, include_numa);
     561              : }
     562              : 
     563              : /* Backward-compatible wrapper for v1.6. */
     564              : Datum
     565            0 : pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
     566              : {
     567              :     /* Call internal function with include_numa=true */
     568            0 :     return pg_buffercache_os_pages_internal(fcinfo, true);
     569              : }
     570              : 
     571              : Datum
     572            2 : pg_buffercache_summary(PG_FUNCTION_ARGS)
     573              : {
     574              :     Datum       result;
     575              :     TupleDesc   tupledesc;
     576              :     HeapTuple   tuple;
     577              :     Datum       values[NUM_BUFFERCACHE_SUMMARY_ELEM];
     578              :     bool        nulls[NUM_BUFFERCACHE_SUMMARY_ELEM];
     579              : 
     580            2 :     int32       buffers_used = 0;
     581            2 :     int32       buffers_unused = 0;
     582            2 :     int32       buffers_dirty = 0;
     583            2 :     int32       buffers_pinned = 0;
     584            2 :     int64       usagecount_total = 0;
     585              : 
     586            2 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     587            0 :         elog(ERROR, "return type must be a row type");
     588              : 
     589        32770 :     for (int i = 0; i < NBuffers; i++)
     590              :     {
     591              :         BufferDesc *bufHdr;
     592              :         uint64      buf_state;
     593              : 
     594        32768 :         CHECK_FOR_INTERRUPTS();
     595              : 
     596              :         /*
     597              :          * This function summarizes the state of all headers. Locking the
     598              :          * buffer headers wouldn't provide an improved result as the state of
     599              :          * the buffer can still change after we release the lock and it'd
     600              :          * noticeably increase the cost of the function.
     601              :          */
     602        32768 :         bufHdr = GetBufferDescriptor(i);
     603        32768 :         buf_state = pg_atomic_read_u64(&bufHdr->state);
     604              : 
     605        32768 :         if (buf_state & BM_VALID)
     606              :         {
     607         4046 :             buffers_used++;
     608         4046 :             usagecount_total += BUF_STATE_GET_USAGECOUNT(buf_state);
     609              : 
     610         4046 :             if (buf_state & BM_DIRTY)
     611         1950 :                 buffers_dirty++;
     612              :         }
     613              :         else
     614        28722 :             buffers_unused++;
     615              : 
     616        32768 :         if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
     617            0 :             buffers_pinned++;
     618              :     }
     619              : 
     620            2 :     memset(nulls, 0, sizeof(nulls));
     621            2 :     values[0] = Int32GetDatum(buffers_used);
     622            2 :     values[1] = Int32GetDatum(buffers_unused);
     623            2 :     values[2] = Int32GetDatum(buffers_dirty);
     624            2 :     values[3] = Int32GetDatum(buffers_pinned);
     625              : 
     626            2 :     if (buffers_used != 0)
     627            2 :         values[4] = Float8GetDatum((double) usagecount_total / buffers_used);
     628              :     else
     629            0 :         nulls[4] = true;
     630              : 
     631              :     /* Build and return the tuple. */
     632            2 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     633            2 :     result = HeapTupleGetDatum(tuple);
     634              : 
     635            2 :     PG_RETURN_DATUM(result);
     636              : }
     637              : 
     638              : Datum
     639            2 : pg_buffercache_usage_counts(PG_FUNCTION_ARGS)
     640              : {
     641            2 :     ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
     642            2 :     int         usage_counts[BM_MAX_USAGE_COUNT + 1] = {0};
     643            2 :     int         dirty[BM_MAX_USAGE_COUNT + 1] = {0};
     644            2 :     int         pinned[BM_MAX_USAGE_COUNT + 1] = {0};
     645              :     Datum       values[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM];
     646            2 :     bool        nulls[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM] = {0};
     647              : 
     648            2 :     InitMaterializedSRF(fcinfo, 0);
     649              : 
     650        32770 :     for (int i = 0; i < NBuffers; i++)
     651              :     {
     652        32768 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
     653        32768 :         uint64      buf_state = pg_atomic_read_u64(&bufHdr->state);
     654              :         int         usage_count;
     655              : 
     656        32768 :         CHECK_FOR_INTERRUPTS();
     657              : 
     658        32768 :         usage_count = BUF_STATE_GET_USAGECOUNT(buf_state);
     659        32768 :         usage_counts[usage_count]++;
     660              : 
     661        32768 :         if (buf_state & BM_DIRTY)
     662         1950 :             dirty[usage_count]++;
     663              : 
     664        32768 :         if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
     665            0 :             pinned[usage_count]++;
     666              :     }
     667              : 
     668           14 :     for (int i = 0; i < BM_MAX_USAGE_COUNT + 1; i++)
     669              :     {
     670           12 :         values[0] = Int32GetDatum(i);
     671           12 :         values[1] = Int32GetDatum(usage_counts[i]);
     672           12 :         values[2] = Int32GetDatum(dirty[i]);
     673           12 :         values[3] = Int32GetDatum(pinned[i]);
     674              : 
     675           12 :         tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
     676              :     }
     677              : 
     678            2 :     return (Datum) 0;
     679              : }
     680              : 
     681              : /*
     682              :  * Helper function to check if the user has superuser privileges.
     683              :  */
     684              : static void
     685           20 : pg_buffercache_superuser_check(char *func_name)
     686              : {
     687           20 :     if (!superuser())
     688            6 :         ereport(ERROR,
     689              :                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
     690              :                  errmsg("must be superuser to use %s()",
     691              :                         func_name)));
     692           14 : }
     693              : 
     694              : /*
     695              :  * Try to evict a shared buffer.
     696              :  */
     697              : Datum
     698            5 : pg_buffercache_evict(PG_FUNCTION_ARGS)
     699              : {
     700              :     Datum       result;
     701              :     TupleDesc   tupledesc;
     702              :     HeapTuple   tuple;
     703              :     Datum       values[NUM_BUFFERCACHE_EVICT_ELEM];
     704            5 :     bool        nulls[NUM_BUFFERCACHE_EVICT_ELEM] = {0};
     705              : 
     706            5 :     Buffer      buf = PG_GETARG_INT32(0);
     707              :     bool        buffer_flushed;
     708              : 
     709            5 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     710            0 :         elog(ERROR, "return type must be a row type");
     711              : 
     712            5 :     pg_buffercache_superuser_check("pg_buffercache_evict");
     713              : 
     714            4 :     if (buf < 1 || buf > NBuffers)
     715            3 :         elog(ERROR, "bad buffer ID: %d", buf);
     716              : 
     717            1 :     values[0] = BoolGetDatum(EvictUnpinnedBuffer(buf, &buffer_flushed));
     718            1 :     values[1] = BoolGetDatum(buffer_flushed);
     719              : 
     720            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     721            1 :     result = HeapTupleGetDatum(tuple);
     722              : 
     723            1 :     PG_RETURN_DATUM(result);
     724              : }
     725              : 
     726              : /*
     727              :  * Try to evict specified relation.
     728              :  */
     729              : Datum
     730            3 : pg_buffercache_evict_relation(PG_FUNCTION_ARGS)
     731              : {
     732              :     Datum       result;
     733              :     TupleDesc   tupledesc;
     734              :     HeapTuple   tuple;
     735              :     Datum       values[NUM_BUFFERCACHE_EVICT_RELATION_ELEM];
     736            3 :     bool        nulls[NUM_BUFFERCACHE_EVICT_RELATION_ELEM] = {0};
     737              : 
     738              :     Oid         relOid;
     739              :     Relation    rel;
     740              : 
     741            3 :     int32       buffers_evicted = 0;
     742            3 :     int32       buffers_flushed = 0;
     743            3 :     int32       buffers_skipped = 0;
     744              : 
     745            3 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     746            0 :         elog(ERROR, "return type must be a row type");
     747              : 
     748            3 :     pg_buffercache_superuser_check("pg_buffercache_evict_relation");
     749              : 
     750            2 :     relOid = PG_GETARG_OID(0);
     751              : 
     752            2 :     rel = relation_open(relOid, AccessShareLock);
     753              : 
     754            2 :     if (RelationUsesLocalBuffers(rel))
     755            1 :         ereport(ERROR,
     756              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     757              :                  errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
     758              :                         "pg_buffercache_evict_relation")));
     759              : 
     760            1 :     EvictRelUnpinnedBuffers(rel, &buffers_evicted, &buffers_flushed,
     761              :                             &buffers_skipped);
     762              : 
     763            1 :     relation_close(rel, AccessShareLock);
     764              : 
     765            1 :     values[0] = Int32GetDatum(buffers_evicted);
     766            1 :     values[1] = Int32GetDatum(buffers_flushed);
     767            1 :     values[2] = Int32GetDatum(buffers_skipped);
     768              : 
     769            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     770            1 :     result = HeapTupleGetDatum(tuple);
     771              : 
     772            1 :     PG_RETURN_DATUM(result);
     773              : }
     774              : 
     775              : 
     776              : /*
     777              :  * Try to evict all shared buffers.
     778              :  */
     779              : Datum
     780            2 : pg_buffercache_evict_all(PG_FUNCTION_ARGS)
     781              : {
     782              :     Datum       result;
     783              :     TupleDesc   tupledesc;
     784              :     HeapTuple   tuple;
     785              :     Datum       values[NUM_BUFFERCACHE_EVICT_ALL_ELEM];
     786            2 :     bool        nulls[NUM_BUFFERCACHE_EVICT_ALL_ELEM] = {0};
     787              : 
     788            2 :     int32       buffers_evicted = 0;
     789            2 :     int32       buffers_flushed = 0;
     790            2 :     int32       buffers_skipped = 0;
     791              : 
     792            2 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     793            0 :         elog(ERROR, "return type must be a row type");
     794              : 
     795            2 :     pg_buffercache_superuser_check("pg_buffercache_evict_all");
     796              : 
     797            1 :     EvictAllUnpinnedBuffers(&buffers_evicted, &buffers_flushed,
     798              :                             &buffers_skipped);
     799              : 
     800            1 :     values[0] = Int32GetDatum(buffers_evicted);
     801            1 :     values[1] = Int32GetDatum(buffers_flushed);
     802            1 :     values[2] = Int32GetDatum(buffers_skipped);
     803              : 
     804            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     805            1 :     result = HeapTupleGetDatum(tuple);
     806              : 
     807            1 :     PG_RETURN_DATUM(result);
     808              : }
     809              : 
     810              : /*
     811              :  * Try to mark a shared buffer as dirty.
     812              :  */
     813              : Datum
     814            5 : pg_buffercache_mark_dirty(PG_FUNCTION_ARGS)
     815              : {
     816              : 
     817              :     Datum       result;
     818              :     TupleDesc   tupledesc;
     819              :     HeapTuple   tuple;
     820              :     Datum       values[NUM_BUFFERCACHE_MARK_DIRTY_ELEM];
     821            5 :     bool        nulls[NUM_BUFFERCACHE_MARK_DIRTY_ELEM] = {0};
     822              : 
     823            5 :     Buffer      buf = PG_GETARG_INT32(0);
     824              :     bool        buffer_already_dirty;
     825              : 
     826            5 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     827            0 :         elog(ERROR, "return type must be a row type");
     828              : 
     829            5 :     pg_buffercache_superuser_check("pg_buffercache_mark_dirty");
     830              : 
     831            4 :     if (buf < 1 || buf > NBuffers)
     832            3 :         elog(ERROR, "bad buffer ID: %d", buf);
     833              : 
     834            1 :     values[0] = BoolGetDatum(MarkDirtyUnpinnedBuffer(buf, &buffer_already_dirty));
     835            1 :     values[1] = BoolGetDatum(buffer_already_dirty);
     836              : 
     837            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     838            1 :     result = HeapTupleGetDatum(tuple);
     839              : 
     840            1 :     PG_RETURN_DATUM(result);
     841              : }
     842              : 
     843              : /*
     844              :  * Try to mark all the shared buffers of a relation as dirty.
     845              :  */
     846              : Datum
     847            3 : pg_buffercache_mark_dirty_relation(PG_FUNCTION_ARGS)
     848              : {
     849              :     Datum       result;
     850              :     TupleDesc   tupledesc;
     851              :     HeapTuple   tuple;
     852              :     Datum       values[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM];
     853            3 :     bool        nulls[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM] = {0};
     854              : 
     855              :     Oid         relOid;
     856              :     Relation    rel;
     857              : 
     858            3 :     int32       buffers_already_dirty = 0;
     859            3 :     int32       buffers_dirtied = 0;
     860            3 :     int32       buffers_skipped = 0;
     861              : 
     862            3 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     863            0 :         elog(ERROR, "return type must be a row type");
     864              : 
     865            3 :     pg_buffercache_superuser_check("pg_buffercache_mark_dirty_relation");
     866              : 
     867            2 :     relOid = PG_GETARG_OID(0);
     868              : 
     869            2 :     rel = relation_open(relOid, AccessShareLock);
     870              : 
     871            2 :     if (RelationUsesLocalBuffers(rel))
     872            1 :         ereport(ERROR,
     873              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     874              :                  errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
     875              :                         "pg_buffercache_mark_dirty_relation")));
     876              : 
     877            1 :     MarkDirtyRelUnpinnedBuffers(rel, &buffers_dirtied, &buffers_already_dirty,
     878              :                                 &buffers_skipped);
     879              : 
     880            1 :     relation_close(rel, AccessShareLock);
     881              : 
     882            1 :     values[0] = Int32GetDatum(buffers_dirtied);
     883            1 :     values[1] = Int32GetDatum(buffers_already_dirty);
     884            1 :     values[2] = Int32GetDatum(buffers_skipped);
     885              : 
     886            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     887            1 :     result = HeapTupleGetDatum(tuple);
     888              : 
     889            1 :     PG_RETURN_DATUM(result);
     890              : }
     891              : 
     892              : /*
     893              :  * Try to mark all the shared buffers as dirty.
     894              :  */
     895              : Datum
     896            2 : pg_buffercache_mark_dirty_all(PG_FUNCTION_ARGS)
     897              : {
     898              :     Datum       result;
     899              :     TupleDesc   tupledesc;
     900              :     HeapTuple   tuple;
     901              :     Datum       values[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM];
     902            2 :     bool        nulls[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM] = {0};
     903              : 
     904            2 :     int32       buffers_already_dirty = 0;
     905            2 :     int32       buffers_dirtied = 0;
     906            2 :     int32       buffers_skipped = 0;
     907              : 
     908            2 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     909            0 :         elog(ERROR, "return type must be a row type");
     910              : 
     911            2 :     pg_buffercache_superuser_check("pg_buffercache_mark_dirty_all");
     912              : 
     913            1 :     MarkDirtyAllUnpinnedBuffers(&buffers_dirtied, &buffers_already_dirty,
     914              :                                 &buffers_skipped);
     915              : 
     916            1 :     values[0] = Int32GetDatum(buffers_dirtied);
     917            1 :     values[1] = Int32GetDatum(buffers_already_dirty);
     918            1 :     values[2] = Int32GetDatum(buffers_skipped);
     919              : 
     920            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     921            1 :     result = HeapTupleGetDatum(tuple);
     922              : 
     923            1 :     PG_RETURN_DATUM(result);
     924              : }
        

Generated by: LCOV version 2.0-1