LCOV - code coverage report
Current view: top level - contrib/pg_buffercache - pg_buffercache_pages.c (source / functions)
Test: PostgreSQL 19devel
Date: 2026-02-02 14:17:46
Coverage:           Hit    Total    Coverage
  Lines:            300      338      88.8 %
  Functions:         24       25      96.0 %
Legend: Lines: hit / not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * pg_buffercache_pages.c
       4             :  *    display some contents of the buffer cache
       5             :  *
       6             :  *    contrib/pg_buffercache/pg_buffercache_pages.c
       7             :  *-------------------------------------------------------------------------
       8             :  */
       9             : #include "postgres.h"
      10             : 
      11             : #include "access/htup_details.h"
      12             : #include "access/relation.h"
      13             : #include "catalog/pg_type.h"
      14             : #include "funcapi.h"
      15             : #include "port/pg_numa.h"
      16             : #include "storage/buf_internals.h"
      17             : #include "storage/bufmgr.h"
      18             : #include "utils/rel.h"
      19             : 
      20             : 
      21             : #define NUM_BUFFERCACHE_PAGES_MIN_ELEM  8
      22             : #define NUM_BUFFERCACHE_PAGES_ELEM  9
      23             : #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
      24             : #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
      25             : #define NUM_BUFFERCACHE_EVICT_ELEM 2
      26             : #define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3
      27             : #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
      28             : #define NUM_BUFFERCACHE_MARK_DIRTY_ELEM 2
      29             : #define NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM 3
      30             : #define NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM 3
      31             : 
      32             : #define NUM_BUFFERCACHE_OS_PAGES_ELEM   3
      33             : 
      34           2 : PG_MODULE_MAGIC_EXT(
      35             :                     .name = "pg_buffercache",
      36             :                     .version = PG_VERSION
      37             : );
      38             : 
      39             : /*
       40             :  * Record structure holding the buffer cache data to be exposed.
      41             :  */
      42             : typedef struct
      43             : {
      44             :     uint32      bufferid;
      45             :     RelFileNumber relfilenumber;
      46             :     Oid         reltablespace;
      47             :     Oid         reldatabase;
      48             :     ForkNumber  forknum;
      49             :     BlockNumber blocknum;
      50             :     bool        isvalid;
      51             :     bool        isdirty;
      52             :     uint16      usagecount;
      53             : 
      54             :     /*
      55             :      * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
      56             :      * being pinned by too many backends and each backend will only pin once
      57             :      * because of bufmgr.c's PrivateRefCount infrastructure.
      58             :      */
      59             :     int32       pinning_backends;
      60             : } BufferCachePagesRec;
      61             : 
      62             : 
      63             : /*
      64             :  * Function context for data persisting over repeated calls.
      65             :  */
      66             : typedef struct
      67             : {
      68             :     TupleDesc   tupdesc;
      69             :     BufferCachePagesRec *record;
      70             : } BufferCachePagesContext;
      71             : 
      72             : /*
       73             :  * Record structure holding the OS page data to be exposed.  This
      74             :  * structure is used by pg_buffercache_os_pages(), where NUMA information may
      75             :  * or may not be included.
      76             :  */
      77             : typedef struct
      78             : {
      79             :     uint32      bufferid;
      80             :     int64       page_num;
      81             :     int32       numa_node;
      82             : } BufferCacheOsPagesRec;
      83             : 
      84             : /*
      85             :  * Function context for data persisting over repeated calls.
      86             :  */
      87             : typedef struct
      88             : {
      89             :     TupleDesc   tupdesc;
      90             :     bool        include_numa;
      91             :     BufferCacheOsPagesRec *record;
      92             : } BufferCacheOsPagesContext;
      93             : 
      94             : 
      95             : /*
      96             :  * Function returning data from the shared buffer cache - buffer number,
      97             :  * relation node/tablespace/database/blocknum and dirty indicator.
      98             :  */
      99           4 : PG_FUNCTION_INFO_V1(pg_buffercache_pages);
     100           4 : PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
     101           2 : PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
     102           4 : PG_FUNCTION_INFO_V1(pg_buffercache_summary);
     103           4 : PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
     104           6 : PG_FUNCTION_INFO_V1(pg_buffercache_evict);
     105           4 : PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation);
     106           4 : PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
     107           4 : PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty);
     108           4 : PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_relation);
     109           4 : PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_all);
     110             : 
     111             : 
     112             : /* Only need to touch memory once per backend process lifetime */
     113             : static bool firstNumaTouch = true;
     114             : 
     115             : 
     116             : Datum
     117       65540 : pg_buffercache_pages(PG_FUNCTION_ARGS)
     118             : {
     119             :     FuncCallContext *funcctx;
     120             :     Datum       result;
     121             :     MemoryContext oldcontext;
     122             :     BufferCachePagesContext *fctx;  /* User function context. */
     123             :     TupleDesc   tupledesc;
     124             :     TupleDesc   expected_tupledesc;
     125             :     HeapTuple   tuple;
     126             : 
     127       65540 :     if (SRF_IS_FIRSTCALL())
     128             :     {
     129             :         int         i;
     130             : 
     131           4 :         funcctx = SRF_FIRSTCALL_INIT();
     132             : 
     133             :         /* Switch context when allocating stuff to be used in later calls */
     134           4 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     135             : 
     136             :         /* Create a user function context for cross-call persistence */
     137           4 :         fctx = palloc_object(BufferCachePagesContext);
     138             : 
     139             :         /*
      140             :          * To smoothly support upgrades from version 1.0 of this extension,
      141             :          * transparently handle the (non-)existence of the pinning_backends
      142             :          * column.  Unfortunately we have to get the call's result type for
      143             :          * that: we can't simply use the result type determined by the
      144             :          * function definition, since that could crash when somebody is still
      145             :          * using the old (or even a wrong) function definition.
     146             :          */
     147           4 :         if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
     148           0 :             elog(ERROR, "return type must be a row type");
     149             : 
     150           4 :         if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
     151           4 :             expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
     152           0 :             elog(ERROR, "incorrect number of output arguments");
     153             : 
     154             :         /* Construct a tuple descriptor for the result rows. */
     155           4 :         tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
     156           4 :         TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
     157             :                            INT4OID, -1, 0);
     158           4 :         TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
     159             :                            OIDOID, -1, 0);
     160           4 :         TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
     161             :                            OIDOID, -1, 0);
     162           4 :         TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
     163             :                            OIDOID, -1, 0);
     164           4 :         TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
     165             :                            INT2OID, -1, 0);
     166           4 :         TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
     167             :                            INT8OID, -1, 0);
     168           4 :         TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
     169             :                            BOOLOID, -1, 0);
     170           4 :         TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
     171             :                            INT2OID, -1, 0);
     172             : 
     173           4 :         if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
     174           4 :             TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
     175             :                                INT4OID, -1, 0);
     176             : 
     177           4 :         fctx->tupdesc = BlessTupleDesc(tupledesc);
     178             : 
     179             :         /* Allocate NBuffers worth of BufferCachePagesRec records. */
     180           4 :         fctx->record = (BufferCachePagesRec *)
     181           4 :             MemoryContextAllocHuge(CurrentMemoryContext,
     182             :                                    sizeof(BufferCachePagesRec) * NBuffers);
     183             : 
     184             :         /* Set max calls and remember the user function context. */
     185           4 :         funcctx->max_calls = NBuffers;
     186           4 :         funcctx->user_fctx = fctx;
     187             : 
     188             :         /* Return to original context when allocating transient memory */
     189           4 :         MemoryContextSwitchTo(oldcontext);
     190             : 
     191             :         /*
     192             :          * Scan through all the buffers, saving the relevant fields in the
     193             :          * fctx->record structure.
     194             :          *
     195             :          * We don't hold the partition locks, so we don't get a consistent
     196             :          * snapshot across all buffers, but we do grab the buffer header
     197             :          * locks, so the information of each buffer is self-consistent.
     198             :          */
     199       65540 :         for (i = 0; i < NBuffers; i++)
     200             :         {
     201             :             BufferDesc *bufHdr;
     202             :             uint64      buf_state;
     203             : 
     204       65536 :             CHECK_FOR_INTERRUPTS();
     205             : 
     206       65536 :             bufHdr = GetBufferDescriptor(i);
     207             :             /* Lock each buffer header before inspecting. */
     208       65536 :             buf_state = LockBufHdr(bufHdr);
     209             : 
     210       65536 :             fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
     211       65536 :             fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
     212       65536 :             fctx->record[i].reltablespace = bufHdr->tag.spcOid;
     213       65536 :             fctx->record[i].reldatabase = bufHdr->tag.dbOid;
     214       65536 :             fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
     215       65536 :             fctx->record[i].blocknum = bufHdr->tag.blockNum;
     216       65536 :             fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
     217       65536 :             fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
     218             : 
     219       65536 :             if (buf_state & BM_DIRTY)
     220        3800 :                 fctx->record[i].isdirty = true;
     221             :             else
     222       61736 :                 fctx->record[i].isdirty = false;
     223             : 
     224             :             /* Note if the buffer is valid, and has storage created */
     225       65536 :             if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
     226        7992 :                 fctx->record[i].isvalid = true;
     227             :             else
     228       57544 :                 fctx->record[i].isvalid = false;
     229             : 
     230       65536 :             UnlockBufHdr(bufHdr);
     231             :         }
     232             :     }
     233             : 
     234       65540 :     funcctx = SRF_PERCALL_SETUP();
     235             : 
     236             :     /* Get the saved state */
     237       65540 :     fctx = funcctx->user_fctx;
     238             : 
     239       65540 :     if (funcctx->call_cntr < funcctx->max_calls)
     240             :     {
     241       65536 :         uint32      i = funcctx->call_cntr;
     242             :         Datum       values[NUM_BUFFERCACHE_PAGES_ELEM];
     243             :         bool        nulls[NUM_BUFFERCACHE_PAGES_ELEM];
     244             : 
     245       65536 :         values[0] = Int32GetDatum(fctx->record[i].bufferid);
     246       65536 :         nulls[0] = false;
     247             : 
     248             :         /*
     249             :          * Set all fields except the bufferid to null if the buffer is unused
     250             :          * or not valid.
     251             :          */
     252       65536 :         if (fctx->record[i].blocknum == InvalidBlockNumber ||
     253        7992 :             fctx->record[i].isvalid == false)
     254             :         {
     255       57544 :             nulls[1] = true;
     256       57544 :             nulls[2] = true;
     257       57544 :             nulls[3] = true;
     258       57544 :             nulls[4] = true;
     259       57544 :             nulls[5] = true;
     260       57544 :             nulls[6] = true;
     261       57544 :             nulls[7] = true;
     262             :             /* unused for v1.0 callers, but the array is always long enough */
     263       57544 :             nulls[8] = true;
     264             :         }
     265             :         else
     266             :         {
     267        7992 :             values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
     268        7992 :             nulls[1] = false;
     269        7992 :             values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
     270        7992 :             nulls[2] = false;
     271        7992 :             values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
     272        7992 :             nulls[3] = false;
     273        7992 :             values[4] = Int16GetDatum(fctx->record[i].forknum);
     274        7992 :             nulls[4] = false;
     275        7992 :             values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
     276        7992 :             nulls[5] = false;
     277        7992 :             values[6] = BoolGetDatum(fctx->record[i].isdirty);
     278        7992 :             nulls[6] = false;
     279        7992 :             values[7] = UInt16GetDatum(fctx->record[i].usagecount);
     280        7992 :             nulls[7] = false;
     281             :             /* unused for v1.0 callers, but the array is always long enough */
     282        7992 :             values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
     283        7992 :             nulls[8] = false;
     284             :         }
     285             : 
     286             :         /* Build and return the tuple. */
     287       65536 :         tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
     288       65536 :         result = HeapTupleGetDatum(tuple);
     289             : 
     290       65536 :         SRF_RETURN_NEXT(funcctx, result);
     291             :     }
     292             :     else
     293           4 :         SRF_RETURN_DONE(funcctx);
     294             : }
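
pg_buffercache_pages() follows PostgreSQL's value-per-call set-returning-function
protocol from funcapi.h: all cross-call state is built on the first call inside
multi_call_memory_ctx, then one row is emitted per call until SRF_RETURN_DONE().
A minimal sketch of that pattern is shown below; the function name
srf_pattern_sketch is made up, and a matching CREATE FUNCTION declaration, the
module's PG_MODULE_MAGIC, and the usual extension build setup are assumed.

    #include "postgres.h"
    #include "funcapi.h"

    PG_FUNCTION_INFO_V1(srf_pattern_sketch);

    Datum
    srf_pattern_sketch(PG_FUNCTION_ARGS)
    {
        FuncCallContext *funcctx;

        if (SRF_IS_FIRSTCALL())
        {
            MemoryContext oldcontext;

            funcctx = SRF_FIRSTCALL_INIT();

            /* Anything that must survive across calls is allocated here. */
            oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
            funcctx->max_calls = 3;     /* emit three rows, then stop */
            MemoryContextSwitchTo(oldcontext);
        }

        funcctx = SRF_PERCALL_SETUP();

        if (funcctx->call_cntr < funcctx->max_calls)
            SRF_RETURN_NEXT(funcctx, Int64GetDatum((int64) funcctx->call_cntr));
        else
            SRF_RETURN_DONE(funcctx);
    }

The per-call memory context is reset between calls, which is why anything that
must be referenced later (like fctx->record above) has to live in
multi_call_memory_ctx.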
     295             : 
     296             : /*
      297             :  * Inquire about OS page mappings for shared buffers, optionally with
      298             :  * NUMA information.
      299             :  *
      300             :  * When "include_numa" is false, this routine ignores everything related
      301             :  * to NUMA (the corresponding values are returned as NULL) and returns only
      302             :  * the mapping between shared buffers and OS pages.
     303             :  *
     304             :  * When "include_numa" is true, NUMA is initialized and numa_node values
     305             :  * are generated.  In order to get reliable results we also need to touch
     306             :  * memory pages, so that the inquiry about NUMA memory node does not return
     307             :  * -2, indicating unmapped/unallocated pages.
     308             :  *
     309             :  * Buffers may be smaller or larger than OS memory pages. For each buffer we
     310             :  * return one entry for each memory page used by the buffer (if the buffer is
     311             :  * smaller, it only uses a part of one memory page).
     312             :  *
     313             :  * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
     314             :  * one is always a multiple of the other.
     315             :  *
     316             :  */
     317             : static Datum
     318      131076 : pg_buffercache_os_pages_internal(FunctionCallInfo fcinfo, bool include_numa)
     319             : {
     320             :     FuncCallContext *funcctx;
     321             :     MemoryContext oldcontext;
     322             :     BufferCacheOsPagesContext *fctx;    /* User function context. */
     323             :     TupleDesc   tupledesc;
     324             :     TupleDesc   expected_tupledesc;
     325             :     HeapTuple   tuple;
     326             :     Datum       result;
     327             : 
     328      131076 :     if (SRF_IS_FIRSTCALL())
     329             :     {
     330             :         int         i,
     331             :                     idx;
     332             :         Size        os_page_size;
     333             :         int         pages_per_buffer;
     334           4 :         int        *os_page_status = NULL;
     335           4 :         uint64      os_page_count = 0;
     336             :         int         max_entries;
     337             :         char       *startptr,
     338             :                    *endptr;
     339             : 
     340             :         /* If NUMA information is requested, initialize NUMA support. */
     341           4 :         if (include_numa && pg_numa_init() == -1)
     342           0 :             elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
     343             : 
     344             :         /*
     345             :          * The database block size and OS memory page size are unlikely to be
     346             :          * the same. The block size is 1-32KB, the memory page size depends on
     347             :          * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
     348             :          * there are also features like THP etc. Moreover, we don't quite know
     349             :          * how the pages and buffers "align" in memory - the buffers may be
     350             :          * shifted in some way, using more memory pages than necessary.
     351             :          *
     352             :          * So we need to be careful about mapping buffers to memory pages. We
     353             :          * calculate the maximum number of pages a buffer might use, so that
     354             :          * we allocate enough space for the entries. And then we count the
     355             :          * actual number of entries as we scan the buffers.
     356             :          *
     357             :          * This information is needed before calling move_pages() for NUMA
     358             :          * node id inquiry.
     359             :          */
     360           4 :         os_page_size = pg_get_shmem_pagesize();
     361             : 
     362             :         /*
      363             :          * The page and block sizes are expected to be 2^k, so one divides the
     364             :          * other (we don't know in which direction). This does not say
     365             :          * anything about relative alignment of pages/buffers.
     366             :          */
     367             :         Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));
     368             : 
     369           4 :         if (include_numa)
     370             :         {
     371           0 :             void      **os_page_ptrs = NULL;
     372             : 
     373             :             /*
      374             :              * How many addresses are we going to query?  Simply get the page
      375             :              * for the first buffer and the first page after the last buffer, and
     376             :              * count the pages from that.
     377             :              */
     378           0 :             startptr = (char *) TYPEALIGN_DOWN(os_page_size,
     379             :                                                BufferGetBlock(1));
     380           0 :             endptr = (char *) TYPEALIGN(os_page_size,
     381             :                                         (char *) BufferGetBlock(NBuffers) + BLCKSZ);
     382           0 :             os_page_count = (endptr - startptr) / os_page_size;
     383             : 
     384             :             /* Used to determine the NUMA node for all OS pages at once */
     385           0 :             os_page_ptrs = palloc0_array(void *, os_page_count);
     386           0 :             os_page_status = palloc_array(int, os_page_count);
     387             : 
     388             :             /*
     389             :              * Fill pointers for all the memory pages.  This loop stores and
     390             :              * touches (if needed) addresses into os_page_ptrs[] as input to
     391             :              * one big move_pages(2) inquiry system call, as done in
     392             :              * pg_numa_query_pages().
     393             :              */
     394           0 :             idx = 0;
     395           0 :             for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
     396             :             {
     397           0 :                 os_page_ptrs[idx++] = ptr;
     398             : 
     399             :                 /* Only need to touch memory once per backend process lifetime */
     400           0 :                 if (firstNumaTouch)
     401             :                     pg_numa_touch_mem_if_required(ptr);
     402             :             }
     403             : 
     404             :             Assert(idx == os_page_count);
     405             : 
     406           0 :             elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
     407             :                  "os_page_size=%zu", NBuffers, os_page_count, os_page_size);
     408             : 
     409             :             /*
      410             :              * If we ever get 0xff back from the kernel inquiry, then we
      411             :              * probably have a bug in our buffer-to-OS-page mapping code here.
     412             :              */
     413           0 :             memset(os_page_status, 0xff, sizeof(int) * os_page_count);
     414             : 
     415             :             /* Query NUMA status for all the pointers */
     416           0 :             if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
     417           0 :                 elog(ERROR, "failed NUMA pages inquiry: %m");
     418             :         }
     419             : 
     420             :         /* Initialize the multi-call context, load entries about buffers */
     421             : 
     422           4 :         funcctx = SRF_FIRSTCALL_INIT();
     423             : 
     424             :         /* Switch context when allocating stuff to be used in later calls */
     425           4 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     426             : 
     427             :         /* Create a user function context for cross-call persistence */
     428           4 :         fctx = palloc_object(BufferCacheOsPagesContext);
     429             : 
     430           4 :         if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
     431           0 :             elog(ERROR, "return type must be a row type");
     432             : 
     433           4 :         if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
     434           0 :             elog(ERROR, "incorrect number of output arguments");
     435             : 
     436             :         /* Construct a tuple descriptor for the result rows. */
     437           4 :         tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
     438           4 :         TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
     439             :                            INT4OID, -1, 0);
     440           4 :         TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
     441             :                            INT8OID, -1, 0);
     442           4 :         TupleDescInitEntry(tupledesc, (AttrNumber) 3, "numa_node",
     443             :                            INT4OID, -1, 0);
     444             : 
     445           4 :         fctx->tupdesc = BlessTupleDesc(tupledesc);
     446           4 :         fctx->include_numa = include_numa;
     447             : 
     448             :         /*
     449             :          * Each buffer needs at least one entry, but it might be offset in
     450             :          * some way, and use one extra entry. So we allocate space for the
     451             :          * maximum number of entries we might need, and then count the exact
     452             :          * number as we're walking buffers. That way we can do it in one pass,
     453             :          * without reallocating memory.
     454             :          */
     455           4 :         pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
     456           4 :         max_entries = NBuffers * pages_per_buffer;
     457             : 
     458             :         /* Allocate entries for BufferCacheOsPagesRec records. */
     459           4 :         fctx->record = (BufferCacheOsPagesRec *)
     460           4 :             MemoryContextAllocHuge(CurrentMemoryContext,
     461             :                                    sizeof(BufferCacheOsPagesRec) * max_entries);
     462             : 
     463             :         /* Return to original context when allocating transient memory */
     464           4 :         MemoryContextSwitchTo(oldcontext);
     465             : 
     466           4 :         if (include_numa && firstNumaTouch)
     467           0 :             elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
     468             : 
     469             :         /*
     470             :          * Scan through all the buffers, saving the relevant fields in the
     471             :          * fctx->record structure.
     472             :          *
     473             :          * We don't hold the partition locks, so we don't get a consistent
     474             :          * snapshot across all buffers, but we do grab the buffer header
     475             :          * locks, so the information of each buffer is self-consistent.
     476             :          */
     477           4 :         startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
     478           4 :         idx = 0;
     479       65540 :         for (i = 0; i < NBuffers; i++)
     480             :         {
     481       65536 :             char       *buffptr = (char *) BufferGetBlock(i + 1);
     482             :             BufferDesc *bufHdr;
     483             :             uint32      bufferid;
     484             :             int32       page_num;
     485             :             char       *startptr_buff,
     486             :                        *endptr_buff;
     487             : 
     488       65536 :             CHECK_FOR_INTERRUPTS();
     489             : 
     490       65536 :             bufHdr = GetBufferDescriptor(i);
     491             : 
     492             :             /* Lock each buffer header before inspecting. */
     493       65536 :             LockBufHdr(bufHdr);
     494       65536 :             bufferid = BufferDescriptorGetBuffer(bufHdr);
     495       65536 :             UnlockBufHdr(bufHdr);
     496             : 
     497             :             /* start of the first page of this buffer */
     498       65536 :             startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
     499             : 
     500             :             /* end of the buffer (no need to align to memory page) */
     501       65536 :             endptr_buff = buffptr + BLCKSZ;
     502             : 
     503             :             Assert(startptr_buff < endptr_buff);
     504             : 
     505             :             /* calculate ID of the first page for this buffer */
     506       65536 :             page_num = (startptr_buff - startptr) / os_page_size;
     507             : 
     508             :             /* Add an entry for each OS page overlapping with this buffer. */
     509      196608 :             for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
     510             :             {
     511      131072 :                 fctx->record[idx].bufferid = bufferid;
     512      131072 :                 fctx->record[idx].page_num = page_num;
     513      131072 :                 fctx->record[idx].numa_node = include_numa ? os_page_status[page_num] : -1;
     514             : 
     515             :                 /* advance to the next entry/page */
     516      131072 :                 ++idx;
     517      131072 :                 ++page_num;
     518             :             }
     519             :         }
     520             : 
     521             :         Assert(idx <= max_entries);
     522             : 
     523             :         if (include_numa)
     524             :             Assert(idx >= os_page_count);
     525             : 
     526             :         /* Set max calls and remember the user function context. */
     527           4 :         funcctx->max_calls = idx;
     528           4 :         funcctx->user_fctx = fctx;
     529             : 
     530             :         /* Remember this backend touched the pages (only relevant for NUMA) */
     531           4 :         if (include_numa)
     532           0 :             firstNumaTouch = false;
     533             :     }
     534             : 
     535      131076 :     funcctx = SRF_PERCALL_SETUP();
     536             : 
     537             :     /* Get the saved state */
     538      131076 :     fctx = funcctx->user_fctx;
     539             : 
     540      131076 :     if (funcctx->call_cntr < funcctx->max_calls)
     541             :     {
     542      131072 :         uint32      i = funcctx->call_cntr;
     543             :         Datum       values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
     544             :         bool        nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
     545             : 
     546      131072 :         values[0] = Int32GetDatum(fctx->record[i].bufferid);
     547      131072 :         nulls[0] = false;
     548             : 
     549      131072 :         values[1] = Int64GetDatum(fctx->record[i].page_num);
     550      131072 :         nulls[1] = false;
     551             : 
     552      131072 :         if (fctx->include_numa)
     553             :         {
     554             :             /* status is valid node number */
     555           0 :             if (fctx->record[i].numa_node >= 0)
     556             :             {
     557           0 :                 values[2] = Int32GetDatum(fctx->record[i].numa_node);
     558           0 :                 nulls[2] = false;
     559             :             }
     560             :             else
     561             :             {
     562             :                 /* some kind of error (e.g. pages moved to swap) */
     563           0 :                 values[2] = (Datum) 0;
     564           0 :                 nulls[2] = true;
     565             :             }
     566             :         }
     567             :         else
     568             :         {
     569      131072 :             values[2] = (Datum) 0;
     570      131072 :             nulls[2] = true;
     571             :         }
     572             : 
     573             :         /* Build and return the tuple. */
     574      131072 :         tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
     575      131072 :         result = HeapTupleGetDatum(tuple);
     576             : 
     577      131072 :         SRF_RETURN_NEXT(funcctx, result);
     578             :     }
     579             :     else
     580           4 :         SRF_RETURN_DONE(funcctx);
     581             : }
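
The buffer-to-OS-page arithmetic above (TYPEALIGN_DOWN of the buffer start, one
entry per overlapped OS page, and the Max(1, BLCKSZ / os_page_size) + 1
worst-case estimate) can be checked with concrete numbers. The standalone sketch
below uses made-up, hypothetical sizes and addresses (8 KB blocks on 4 KB OS
pages); it is an illustration only, not part of the extension.

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        const uint64_t blcksz = 8192;           /* assumed BLCKSZ */
        const uint64_t os_page_size = 4096;     /* assumed OS page size */
        const uint64_t shmem_start = 0x10000000;            /* made-up, page-aligned base */
        const uint64_t buffptr = shmem_start + 5 * blcksz;  /* start of the 6th buffer */

        /* TYPEALIGN_DOWN equivalent: round the buffer start down to a page. */
        uint64_t    startptr_buff = buffptr & ~(os_page_size - 1);
        uint64_t    endptr_buff = buffptr + blcksz;

        /* Worst-case entries reserved per buffer, as in the code above. */
        uint64_t    pages_per_buffer =
            (blcksz / os_page_size > 1 ? blcksz / os_page_size : 1) + 1;

        printf("entries reserved per buffer: %llu\n",
               (unsigned long long) pages_per_buffer);

        /* One entry per OS page actually overlapped by this buffer. */
        for (uint64_t ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
            printf("buffer 6 covers OS page %llu\n",
                   (unsigned long long) ((ptr - shmem_start) / os_page_size));

        return 0;
    }

With these numbers three entries are reserved per buffer, but a page-aligned
buffer overlaps only two OS pages; a buffer starting mid-page would overlap
three, which is why the extra entry is reserved.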
     582             : 
     583             : /*
     584             :  * pg_buffercache_os_pages
     585             :  *
     586             :  * Retrieve information about OS pages, with or without NUMA information.
     587             :  */
     588             : Datum
     589      131076 : pg_buffercache_os_pages(PG_FUNCTION_ARGS)
     590             : {
     591             :     bool        include_numa;
     592             : 
     593             :     /* Get the boolean parameter that controls the NUMA behavior. */
     594      131076 :     include_numa = PG_GETARG_BOOL(0);
     595             : 
     596      131076 :     return pg_buffercache_os_pages_internal(fcinfo, include_numa);
     597             : }
     598             : 
     599             : /* Backward-compatible wrapper for v1.6. */
     600             : Datum
     601           0 : pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
     602             : {
     603             :     /* Call internal function with include_numa=true */
     604           0 :     return pg_buffercache_os_pages_internal(fcinfo, true);
     605             : }
     606             : 
     607             : Datum
     608           4 : pg_buffercache_summary(PG_FUNCTION_ARGS)
     609             : {
     610             :     Datum       result;
     611             :     TupleDesc   tupledesc;
     612             :     HeapTuple   tuple;
     613             :     Datum       values[NUM_BUFFERCACHE_SUMMARY_ELEM];
     614             :     bool        nulls[NUM_BUFFERCACHE_SUMMARY_ELEM];
     615             : 
     616           4 :     int32       buffers_used = 0;
     617           4 :     int32       buffers_unused = 0;
     618           4 :     int32       buffers_dirty = 0;
     619           4 :     int32       buffers_pinned = 0;
     620           4 :     int64       usagecount_total = 0;
     621             : 
     622           4 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     623           0 :         elog(ERROR, "return type must be a row type");
     624             : 
     625       65540 :     for (int i = 0; i < NBuffers; i++)
     626             :     {
     627             :         BufferDesc *bufHdr;
     628             :         uint64      buf_state;
     629             : 
     630       65536 :         CHECK_FOR_INTERRUPTS();
     631             : 
     632             :         /*
     633             :          * This function summarizes the state of all headers. Locking the
     634             :          * buffer headers wouldn't provide an improved result as the state of
      635             :          * the buffer can still change after we release the lock, and it'd
     636             :          * noticeably increase the cost of the function.
     637             :          */
     638       65536 :         bufHdr = GetBufferDescriptor(i);
     639       65536 :         buf_state = pg_atomic_read_u64(&bufHdr->state);
     640             : 
     641       65536 :         if (buf_state & BM_VALID)
     642             :         {
     643        7992 :             buffers_used++;
     644        7992 :             usagecount_total += BUF_STATE_GET_USAGECOUNT(buf_state);
     645             : 
     646        7992 :             if (buf_state & BM_DIRTY)
     647        3800 :                 buffers_dirty++;
     648             :         }
     649             :         else
     650       57544 :             buffers_unused++;
     651             : 
     652       65536 :         if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
     653           0 :             buffers_pinned++;
     654             :     }
     655             : 
     656           4 :     memset(nulls, 0, sizeof(nulls));
     657           4 :     values[0] = Int32GetDatum(buffers_used);
     658           4 :     values[1] = Int32GetDatum(buffers_unused);
     659           4 :     values[2] = Int32GetDatum(buffers_dirty);
     660           4 :     values[3] = Int32GetDatum(buffers_pinned);
     661             : 
     662           4 :     if (buffers_used != 0)
     663           4 :         values[4] = Float8GetDatum((double) usagecount_total / buffers_used);
     664             :     else
     665           0 :         nulls[4] = true;
     666             : 
     667             :     /* Build and return the tuple. */
     668           4 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     669           4 :     result = HeapTupleGetDatum(tuple);
     670             : 
     671           4 :     PG_RETURN_DATUM(result);
     672             : }
     673             : 
     674             : Datum
     675           4 : pg_buffercache_usage_counts(PG_FUNCTION_ARGS)
     676             : {
     677           4 :     ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
     678           4 :     int         usage_counts[BM_MAX_USAGE_COUNT + 1] = {0};
     679           4 :     int         dirty[BM_MAX_USAGE_COUNT + 1] = {0};
     680           4 :     int         pinned[BM_MAX_USAGE_COUNT + 1] = {0};
     681             :     Datum       values[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM];
     682           4 :     bool        nulls[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM] = {0};
     683             : 
     684           4 :     InitMaterializedSRF(fcinfo, 0);
     685             : 
     686       65540 :     for (int i = 0; i < NBuffers; i++)
     687             :     {
     688       65536 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
     689       65536 :         uint64      buf_state = pg_atomic_read_u64(&bufHdr->state);
     690             :         int         usage_count;
     691             : 
     692       65536 :         CHECK_FOR_INTERRUPTS();
     693             : 
     694       65536 :         usage_count = BUF_STATE_GET_USAGECOUNT(buf_state);
     695       65536 :         usage_counts[usage_count]++;
     696             : 
     697       65536 :         if (buf_state & BM_DIRTY)
     698        3800 :             dirty[usage_count]++;
     699             : 
     700       65536 :         if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
     701           0 :             pinned[usage_count]++;
     702             :     }
     703             : 
     704          28 :     for (int i = 0; i < BM_MAX_USAGE_COUNT + 1; i++)
     705             :     {
     706          24 :         values[0] = Int32GetDatum(i);
     707          24 :         values[1] = Int32GetDatum(usage_counts[i]);
     708          24 :         values[2] = Int32GetDatum(dirty[i]);
     709          24 :         values[3] = Int32GetDatum(pinned[i]);
     710             : 
     711          24 :         tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
     712             :     }
     713             : 
     714           4 :     return (Datum) 0;
     715             : }
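
pg_buffercache_usage_counts() uses the materialized set-returning-function style
rather than the value-per-call style used earlier in this file:
InitMaterializedSRF() sets up a tuplestore and result descriptor, and every row
is appended in a single call. A minimal sketch of that pattern follows; the
function name materialized_srf_sketch is made up and a matching SQL declaration
with a single OUT column is assumed.

    #include "postgres.h"
    #include "funcapi.h"

    PG_FUNCTION_INFO_V1(materialized_srf_sketch);

    Datum
    materialized_srf_sketch(PG_FUNCTION_ARGS)
    {
        ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
        Datum       values[1];
        bool        nulls[1] = {0};

        /* Sets up rsinfo->setResult (a tuplestore) and rsinfo->setDesc for us. */
        InitMaterializedSRF(fcinfo, 0);

        /* Push every row into the tuplestore; no per-call state is needed. */
        for (int i = 0; i < 3; i++)
        {
            values[0] = Int32GetDatum(i);
            tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
        }

        return (Datum) 0;
    }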
     716             : 
     717             : /*
     718             :  * Helper function to check if the user has superuser privileges.
     719             :  */
     720             : static void
     721          40 : pg_buffercache_superuser_check(char *func_name)
     722             : {
     723          40 :     if (!superuser())
     724          12 :         ereport(ERROR,
     725             :                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
     726             :                  errmsg("must be superuser to use %s()",
     727             :                         func_name)));
     728          28 : }
     729             : 
     730             : /*
     731             :  * Try to evict a shared buffer.
     732             :  */
     733             : Datum
     734          10 : pg_buffercache_evict(PG_FUNCTION_ARGS)
     735             : {
     736             :     Datum       result;
     737             :     TupleDesc   tupledesc;
     738             :     HeapTuple   tuple;
     739             :     Datum       values[NUM_BUFFERCACHE_EVICT_ELEM];
     740          10 :     bool        nulls[NUM_BUFFERCACHE_EVICT_ELEM] = {0};
     741             : 
     742          10 :     Buffer      buf = PG_GETARG_INT32(0);
     743             :     bool        buffer_flushed;
     744             : 
     745          10 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     746           0 :         elog(ERROR, "return type must be a row type");
     747             : 
     748          10 :     pg_buffercache_superuser_check("pg_buffercache_evict");
     749             : 
     750           8 :     if (buf < 1 || buf > NBuffers)
     751           6 :         elog(ERROR, "bad buffer ID: %d", buf);
     752             : 
     753           2 :     values[0] = BoolGetDatum(EvictUnpinnedBuffer(buf, &buffer_flushed));
     754           2 :     values[1] = BoolGetDatum(buffer_flushed);
     755             : 
     756           2 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     757           2 :     result = HeapTupleGetDatum(tuple);
     758             : 
     759           2 :     PG_RETURN_DATUM(result);
     760             : }
     761             : 
     762             : /*
      763             :  * Try to evict the shared buffers of the specified relation.
     764             :  */
     765             : Datum
     766           6 : pg_buffercache_evict_relation(PG_FUNCTION_ARGS)
     767             : {
     768             :     Datum       result;
     769             :     TupleDesc   tupledesc;
     770             :     HeapTuple   tuple;
     771             :     Datum       values[NUM_BUFFERCACHE_EVICT_RELATION_ELEM];
     772           6 :     bool        nulls[NUM_BUFFERCACHE_EVICT_RELATION_ELEM] = {0};
     773             : 
     774             :     Oid         relOid;
     775             :     Relation    rel;
     776             : 
     777           6 :     int32       buffers_evicted = 0;
     778           6 :     int32       buffers_flushed = 0;
     779           6 :     int32       buffers_skipped = 0;
     780             : 
     781           6 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     782           0 :         elog(ERROR, "return type must be a row type");
     783             : 
     784           6 :     pg_buffercache_superuser_check("pg_buffercache_evict_relation");
     785             : 
     786           4 :     relOid = PG_GETARG_OID(0);
     787             : 
     788           4 :     rel = relation_open(relOid, AccessShareLock);
     789             : 
     790           4 :     if (RelationUsesLocalBuffers(rel))
     791           2 :         ereport(ERROR,
     792             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     793             :                  errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
     794             :                         "pg_buffercache_evict_relation")));
     795             : 
     796           2 :     EvictRelUnpinnedBuffers(rel, &buffers_evicted, &buffers_flushed,
     797             :                             &buffers_skipped);
     798             : 
     799           2 :     relation_close(rel, AccessShareLock);
     800             : 
     801           2 :     values[0] = Int32GetDatum(buffers_evicted);
     802           2 :     values[1] = Int32GetDatum(buffers_flushed);
     803           2 :     values[2] = Int32GetDatum(buffers_skipped);
     804             : 
     805           2 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     806           2 :     result = HeapTupleGetDatum(tuple);
     807             : 
     808           2 :     PG_RETURN_DATUM(result);
     809             : }
     810             : 
     811             : 
     812             : /*
     813             :  * Try to evict all shared buffers.
     814             :  */
     815             : Datum
     816           4 : pg_buffercache_evict_all(PG_FUNCTION_ARGS)
     817             : {
     818             :     Datum       result;
     819             :     TupleDesc   tupledesc;
     820             :     HeapTuple   tuple;
     821             :     Datum       values[NUM_BUFFERCACHE_EVICT_ALL_ELEM];
     822           4 :     bool        nulls[NUM_BUFFERCACHE_EVICT_ALL_ELEM] = {0};
     823             : 
     824           4 :     int32       buffers_evicted = 0;
     825           4 :     int32       buffers_flushed = 0;
     826           4 :     int32       buffers_skipped = 0;
     827             : 
     828           4 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     829           0 :         elog(ERROR, "return type must be a row type");
     830             : 
     831           4 :     pg_buffercache_superuser_check("pg_buffercache_evict_all");
     832             : 
     833           2 :     EvictAllUnpinnedBuffers(&buffers_evicted, &buffers_flushed,
     834             :                             &buffers_skipped);
     835             : 
     836           2 :     values[0] = Int32GetDatum(buffers_evicted);
     837           2 :     values[1] = Int32GetDatum(buffers_flushed);
     838           2 :     values[2] = Int32GetDatum(buffers_skipped);
     839             : 
     840           2 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     841           2 :     result = HeapTupleGetDatum(tuple);
     842             : 
     843           2 :     PG_RETURN_DATUM(result);
     844             : }
     845             : 
     846             : /*
     847             :  * Try to mark a shared buffer as dirty.
     848             :  */
     849             : Datum
     850          10 : pg_buffercache_mark_dirty(PG_FUNCTION_ARGS)
     851             : {
     852             : 
     853             :     Datum       result;
     854             :     TupleDesc   tupledesc;
     855             :     HeapTuple   tuple;
     856             :     Datum       values[NUM_BUFFERCACHE_MARK_DIRTY_ELEM];
     857          10 :     bool        nulls[NUM_BUFFERCACHE_MARK_DIRTY_ELEM] = {0};
     858             : 
     859          10 :     Buffer      buf = PG_GETARG_INT32(0);
     860             :     bool        buffer_already_dirty;
     861             : 
     862          10 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     863           0 :         elog(ERROR, "return type must be a row type");
     864             : 
     865          10 :     pg_buffercache_superuser_check("pg_buffercache_mark_dirty");
     866             : 
     867           8 :     if (buf < 1 || buf > NBuffers)
     868           6 :         elog(ERROR, "bad buffer ID: %d", buf);
     869             : 
     870           2 :     values[0] = BoolGetDatum(MarkDirtyUnpinnedBuffer(buf, &buffer_already_dirty));
     871           2 :     values[1] = BoolGetDatum(buffer_already_dirty);
     872             : 
     873           2 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     874           2 :     result = HeapTupleGetDatum(tuple);
     875             : 
     876           2 :     PG_RETURN_DATUM(result);
     877             : }
     878             : 
     879             : /*
     880             :  * Try to mark all the shared buffers of a relation as dirty.
     881             :  */
     882             : Datum
     883           6 : pg_buffercache_mark_dirty_relation(PG_FUNCTION_ARGS)
     884             : {
     885             :     Datum       result;
     886             :     TupleDesc   tupledesc;
     887             :     HeapTuple   tuple;
     888             :     Datum       values[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM];
     889           6 :     bool        nulls[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM] = {0};
     890             : 
     891             :     Oid         relOid;
     892             :     Relation    rel;
     893             : 
     894           6 :     int32       buffers_already_dirty = 0;
     895           6 :     int32       buffers_dirtied = 0;
     896           6 :     int32       buffers_skipped = 0;
     897             : 
     898           6 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     899           0 :         elog(ERROR, "return type must be a row type");
     900             : 
     901           6 :     pg_buffercache_superuser_check("pg_buffercache_mark_dirty_relation");
     902             : 
     903           4 :     relOid = PG_GETARG_OID(0);
     904             : 
     905           4 :     rel = relation_open(relOid, AccessShareLock);
     906             : 
     907           4 :     if (RelationUsesLocalBuffers(rel))
     908           2 :         ereport(ERROR,
     909             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     910             :                  errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
     911             :                         "pg_buffercache_mark_dirty_relation")));
     912             : 
     913           2 :     MarkDirtyRelUnpinnedBuffers(rel, &buffers_dirtied, &buffers_already_dirty,
     914             :                                 &buffers_skipped);
     915             : 
     916           2 :     relation_close(rel, AccessShareLock);
     917             : 
     918           2 :     values[0] = Int32GetDatum(buffers_dirtied);
     919           2 :     values[1] = Int32GetDatum(buffers_already_dirty);
     920           2 :     values[2] = Int32GetDatum(buffers_skipped);
     921             : 
     922           2 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     923           2 :     result = HeapTupleGetDatum(tuple);
     924             : 
     925           2 :     PG_RETURN_DATUM(result);
     926             : }
     927             : 
     928             : /*
     929             :  * Try to mark all the shared buffers as dirty.
     930             :  */
     931             : Datum
     932           4 : pg_buffercache_mark_dirty_all(PG_FUNCTION_ARGS)
     933             : {
     934             :     Datum       result;
     935             :     TupleDesc   tupledesc;
     936             :     HeapTuple   tuple;
     937             :     Datum       values[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM];
     938           4 :     bool        nulls[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM] = {0};
     939             : 
     940           4 :     int32       buffers_already_dirty = 0;
     941           4 :     int32       buffers_dirtied = 0;
     942           4 :     int32       buffers_skipped = 0;
     943             : 
     944           4 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     945           0 :         elog(ERROR, "return type must be a row type");
     946             : 
     947           4 :     pg_buffercache_superuser_check("pg_buffercache_mark_dirty_all");
     948             : 
     949           2 :     MarkDirtyAllUnpinnedBuffers(&buffers_dirtied, &buffers_already_dirty,
     950             :                                 &buffers_skipped);
     951             : 
     952           2 :     values[0] = Int32GetDatum(buffers_dirtied);
     953           2 :     values[1] = Int32GetDatum(buffers_already_dirty);
     954           2 :     values[2] = Int32GetDatum(buffers_skipped);
     955             : 
     956           2 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     957           2 :     result = HeapTupleGetDatum(tuple);
     958             : 
     959           2 :     PG_RETURN_DATUM(result);
     960             : }

Generated by: LCOV version 1.16