LCOV - code coverage report
Current view: top level - contrib/pg_buffercache - pg_buffercache_pages.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 88.8 % 338 300
Test Date: 2026-02-27 20:14:49 Functions: 96.0 % 25 24
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * pg_buffercache_pages.c
       4              :  *    display some contents of the buffer cache
       5              :  *
       6              :  *    contrib/pg_buffercache/pg_buffercache_pages.c
       7              :  *-------------------------------------------------------------------------
       8              :  */
       9              : #include "postgres.h"
      10              : 
      11              : #include "access/htup_details.h"
      12              : #include "access/relation.h"
      13              : #include "catalog/pg_type.h"
      14              : #include "funcapi.h"
      15              : #include "port/pg_numa.h"
      16              : #include "storage/buf_internals.h"
      17              : #include "storage/bufmgr.h"
      18              : #include "utils/rel.h"
      19              : 
      20              : 
      21              : #define NUM_BUFFERCACHE_PAGES_MIN_ELEM  8
      22              : #define NUM_BUFFERCACHE_PAGES_ELEM  9
      23              : #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
      24              : #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
      25              : #define NUM_BUFFERCACHE_EVICT_ELEM 2
      26              : #define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3
      27              : #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
      28              : #define NUM_BUFFERCACHE_MARK_DIRTY_ELEM 2
      29              : #define NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM 3
      30              : #define NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM 3
      31              : 
      32              : #define NUM_BUFFERCACHE_OS_PAGES_ELEM   3
      33              : 
      34            1 : PG_MODULE_MAGIC_EXT(
      35              :                     .name = "pg_buffercache",
      36              :                     .version = PG_VERSION
      37              : );
      38              : 
      39              : /*
      40              :  * Record structure holding the to be exposed cache data.
      41              :  */
       42              : typedef struct
       43              : {
       44              :     uint32      bufferid;       /* buffer ID, from BufferDescriptorGetBuffer() */
       45              :     RelFileNumber relfilenumber;    /* relation file number from the buffer tag */
       46              :     Oid         reltablespace;  /* tablespace OID (buffer tag spcOid) */
       47              :     Oid         reldatabase;    /* database OID (buffer tag dbOid) */
       48              :     ForkNumber  forknum;        /* relation fork stored in this buffer */
       49              :     BlockNumber blocknum;       /* block number within that fork */
       50              :     bool        isvalid;        /* were BM_VALID and BM_TAG_VALID both set? */
       51              :     bool        isdirty;        /* was BM_DIRTY set? */
       52              :     uint16      usagecount;     /* BUF_STATE_GET_USAGECOUNT() snapshot */
       53              : 
       54              :     /*
       55              :      * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
       56              :      * being pinned by too many backends and each backend will only pin once
       57              :      * because of bufmgr.c's PrivateRefCount infrastructure.
       58              :      */
       59              :     int32       pinning_backends;   /* BUF_STATE_GET_REFCOUNT() snapshot */
       60              : } BufferCachePagesRec;
      61              : 
      62              : 
      63              : /*
      64              :  * Function context for data persisting over repeated calls.
      65              :  */
       66              : typedef struct
       67              : {
       68              :     TupleDesc   tupdesc;        /* blessed descriptor for result tuples */
       69              :     BufferCachePagesRec *record;    /* one snapshot entry per shared buffer */
       70              : } BufferCachePagesContext;
      71              : 
      72              : /*
      73              :  * Record structure holding the to be exposed cache data for OS pages.  This
      74              :  * structure is used by pg_buffercache_os_pages(), where NUMA information may
      75              :  * or may not be included.
      76              :  */
       77              :  typedef struct
       78              : {
       79              :     uint32      bufferid;       /* buffer ID this OS-page entry belongs to */
       80              :     int64       page_num;       /* OS page number, counted from the first buffer's page */
       81              :     int32       numa_node;      /* NUMA node status, or -1 when NUMA not requested */
       82              : } BufferCacheOsPagesRec;
      83              : 
      84              : /*
      85              :  * Function context for data persisting over repeated calls.
      86              :  */
       87              : typedef struct
       88              : {
       89              :     TupleDesc   tupdesc;        /* blessed descriptor for result tuples */
       90              :     bool        include_numa;   /* whether numa_node values were collected */
       91              :     BufferCacheOsPagesRec *record;  /* one entry per (buffer, OS page) overlap */
       92              : } BufferCacheOsPagesContext;
      93              : 
      94              : 
      95              : /*
      96              :  * Function returning data from the shared buffer cache - buffer number,
      97              :  * relation node/tablespace/database/blocknum and dirty indicator.
      98              :  */
      99            2 : PG_FUNCTION_INFO_V1(pg_buffercache_pages);
     100            2 : PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
     101            1 : PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
     102            2 : PG_FUNCTION_INFO_V1(pg_buffercache_summary);
     103            2 : PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
     104            3 : PG_FUNCTION_INFO_V1(pg_buffercache_evict);
     105            2 : PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation);
     106            2 : PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
     107            2 : PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty);
     108            2 : PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_relation);
     109            2 : PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_all);
     110              : 
     111              : 
     112              : /* Only need to touch memory once per backend process lifetime */
     113              : static bool firstNumaTouch = true;
     114              : 
     115              : 
      116              : Datum
      117        32770 : pg_buffercache_pages(PG_FUNCTION_ARGS)
      118              : {
      119              :     FuncCallContext *funcctx;
      120              :     Datum       result;
      121              :     MemoryContext oldcontext;
      122              :     BufferCachePagesContext *fctx;  /* User function context. */
      123              :     TupleDesc   tupledesc;
      124              :     TupleDesc   expected_tupledesc;
      125              :     HeapTuple   tuple;
      126              : 
      127        32770 :     if (SRF_IS_FIRSTCALL())
      128              :     {
      129              :         int         i;
      130              : 
      131            2 :         funcctx = SRF_FIRSTCALL_INIT();
      132              : 
      133              :         /* Switch context when allocating stuff to be used in later calls */
      134            2 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
      135              : 
      136              :         /* Create a user function context for cross-call persistence */
      137            2 :         fctx = palloc_object(BufferCachePagesContext);
      138              : 
      139              :         /*
      140              :          * To smoothly support upgrades from version 1.0 of this extension
      141              :          * transparently handle the (non-)existence of the pinning_backends
      142              :          * column. We unfortunately have to get the result type for that... -
      143              :          * we can't use the result type determined by the function definition
      144              :          * without potentially crashing when somebody uses the old (or even
      145              :          * wrong) function definition though.
      146              :          */
      147            2 :         if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
      148            0 :             elog(ERROR, "return type must be a row type");
      149              : 
      150            2 :         if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
      151            2 :             expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
      152            0 :             elog(ERROR, "incorrect number of output arguments");
      153              : 
      154              :         /* Construct a tuple descriptor for the result rows. */
      155            2 :         tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
      156            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
      157              :                            INT4OID, -1, 0);
      158            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
      159              :                            OIDOID, -1, 0);
      160            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
      161              :                            OIDOID, -1, 0);
      162            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
      163              :                            OIDOID, -1, 0);
      164            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
      165              :                            INT2OID, -1, 0);
      166            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
      167              :                            INT8OID, -1, 0);
      168            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
      169              :                            BOOLOID, -1, 0);
      170            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
      171              :                            INT2OID, -1, 0);
      172              : 
      173            2 :         if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
      174            2 :             TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
      175              :                                INT4OID, -1, 0);
      176              : 
      177            2 :         fctx->tupdesc = BlessTupleDesc(tupledesc);
      178              : 
      179              :         /* Allocate NBuffers worth of BufferCachePagesRec records. */
      180            2 :         fctx->record = (BufferCachePagesRec *)
      181            2 :             MemoryContextAllocHuge(CurrentMemoryContext,
      182              :                                    sizeof(BufferCachePagesRec) * NBuffers); /* huge alloc: may exceed the normal palloc limit for very large shared_buffers */
      183              : 
      184              :         /* Set max calls and remember the user function context. */
      185            2 :         funcctx->max_calls = NBuffers;
      186            2 :         funcctx->user_fctx = fctx;
      187              : 
      188              :         /* Return to original context when allocating transient memory */
      189            2 :         MemoryContextSwitchTo(oldcontext);
      190              : 
      191              :         /*
      192              :          * Scan through all the buffers, saving the relevant fields in the
      193              :          * fctx->record structure.
      194              :          *
      195              :          * We don't hold the partition locks, so we don't get a consistent
      196              :          * snapshot across all buffers, but we do grab the buffer header
      197              :          * locks, so the information of each buffer is self-consistent.
      198              :          */
      199        32770 :         for (i = 0; i < NBuffers; i++)
      200              :         {
      201              :             BufferDesc *bufHdr;
      202              :             uint64      buf_state;
      203              : 
      204        32768 :             CHECK_FOR_INTERRUPTS();
      205              : 
      206        32768 :             bufHdr = GetBufferDescriptor(i);
      207              :             /* Lock each buffer header before inspecting. */
      208        32768 :             buf_state = LockBufHdr(bufHdr);
      209              : 
      210        32768 :             fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
      211        32768 :             fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
      212        32768 :             fctx->record[i].reltablespace = bufHdr->tag.spcOid;
      213        32768 :             fctx->record[i].reldatabase = bufHdr->tag.dbOid;
      214        32768 :             fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
      215        32768 :             fctx->record[i].blocknum = bufHdr->tag.blockNum;
      216        32768 :             fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
      217        32768 :             fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
      218              : 
      219        32768 :             if (buf_state & BM_DIRTY)
      220         1904 :                 fctx->record[i].isdirty = true;
      221              :             else
      222        30864 :                 fctx->record[i].isdirty = false;
      223              : 
      224              :             /* Note if the buffer is valid, and has storage created */
      225        32768 :             if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
      226         4006 :                 fctx->record[i].isvalid = true;
      227              :             else
      228        28762 :                 fctx->record[i].isvalid = false;
      229              : 
      230        32768 :             UnlockBufHdr(bufHdr);
      231              :         }
      232              :     }
      233              : 
      234        32770 :     funcctx = SRF_PERCALL_SETUP();
      235              : 
      236              :     /* Get the saved state */
      237        32770 :     fctx = funcctx->user_fctx;
      238              : 
      239        32770 :     if (funcctx->call_cntr < funcctx->max_calls)
      240              :     {
      241        32768 :         uint32      i = funcctx->call_cntr;    /* index of the snapshot record to emit this call */
      242              :         Datum       values[NUM_BUFFERCACHE_PAGES_ELEM];
      243              :         bool        nulls[NUM_BUFFERCACHE_PAGES_ELEM];
      244              : 
      245        32768 :         values[0] = Int32GetDatum(fctx->record[i].bufferid);
      246        32768 :         nulls[0] = false;
      247              : 
      248              :         /*
      249              :          * Set all fields except the bufferid to null if the buffer is unused
      250              :          * or not valid.
      251              :          */
      252        32768 :         if (fctx->record[i].blocknum == InvalidBlockNumber ||
      253         4006 :             fctx->record[i].isvalid == false)
      254              :         {
      255        28762 :             nulls[1] = true;
      256        28762 :             nulls[2] = true;
      257        28762 :             nulls[3] = true;
      258        28762 :             nulls[4] = true;
      259        28762 :             nulls[5] = true;
      260        28762 :             nulls[6] = true;
      261        28762 :             nulls[7] = true;
      262              :             /* unused for v1.0 callers, but the array is always long enough */
      263        28762 :             nulls[8] = true;
      264              :         }
      265              :         else
      266              :         {
      267         4006 :             values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
      268         4006 :             nulls[1] = false;
      269         4006 :             values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
      270         4006 :             nulls[2] = false;
      271         4006 :             values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
      272         4006 :             nulls[3] = false;
      273         4006 :             values[4] = Int16GetDatum(fctx->record[i].forknum);
      274         4006 :             nulls[4] = false;
      275         4006 :             values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
      276         4006 :             nulls[5] = false;
      277         4006 :             values[6] = BoolGetDatum(fctx->record[i].isdirty);
      278         4006 :             nulls[6] = false;
      279         4006 :             values[7] = UInt16GetDatum(fctx->record[i].usagecount);
      280         4006 :             nulls[7] = false;
      281              :             /* unused for v1.0 callers, but the array is always long enough */
      282         4006 :             values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
      283         4006 :             nulls[8] = false;
      284              :         }
      285              : 
      286              :         /* Build and return the tuple. */
      287        32768 :         tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
      288        32768 :         result = HeapTupleGetDatum(tuple);
      289              : 
      290        32768 :         SRF_RETURN_NEXT(funcctx, result);
      291              :     }
      292              :     else
      293            2 :         SRF_RETURN_DONE(funcctx);
      294              : }
     295              : 
     296              : /*
     297              :  * Inquire about OS pages mappings for shared buffers, with NUMA information,
     298              :  * optionally.
     299              :  *
     300              :  * When "include_numa" is false, this routines ignores everything related
     301              :  * to NUMA (returned as NULL values), returning mapping information between
     302              :  * shared buffers and OS pages.
     303              :  *
     304              :  * When "include_numa" is true, NUMA is initialized and numa_node values
     305              :  * are generated.  In order to get reliable results we also need to touch
     306              :  * memory pages, so that the inquiry about NUMA memory node does not return
     307              :  * -2, indicating unmapped/unallocated pages.
     308              :  *
     309              :  * Buffers may be smaller or larger than OS memory pages. For each buffer we
     310              :  * return one entry for each memory page used by the buffer (if the buffer is
     311              :  * smaller, it only uses a part of one memory page).
     312              :  *
     313              :  * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
     314              :  * one is always a multiple of the other.
     315              :  *
     316              :  */
     317              : static Datum
     318        65538 : pg_buffercache_os_pages_internal(FunctionCallInfo fcinfo, bool include_numa)
     319              : {
     320              :     FuncCallContext *funcctx;
     321              :     MemoryContext oldcontext;
     322              :     BufferCacheOsPagesContext *fctx;    /* User function context. */
     323              :     TupleDesc   tupledesc;
     324              :     TupleDesc   expected_tupledesc;
     325              :     HeapTuple   tuple;
     326              :     Datum       result;
     327              : 
     328        65538 :     if (SRF_IS_FIRSTCALL())
     329              :     {
     330              :         int         i,
     331              :                     idx;
     332              :         Size        os_page_size;
     333              :         int         pages_per_buffer;
     334            2 :         int        *os_page_status = NULL;
     335            2 :         uint64      os_page_count = 0;
     336              :         int         max_entries;
     337              :         char       *startptr,
     338              :                    *endptr;
     339              : 
     340              :         /* If NUMA information is requested, initialize NUMA support. */
     341            2 :         if (include_numa && pg_numa_init() == -1)
     342            0 :             elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
     343              : 
     344              :         /*
     345              :          * The database block size and OS memory page size are unlikely to be
     346              :          * the same. The block size is 1-32KB, the memory page size depends on
     347              :          * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
     348              :          * there are also features like THP etc. Moreover, we don't quite know
     349              :          * how the pages and buffers "align" in memory - the buffers may be
     350              :          * shifted in some way, using more memory pages than necessary.
     351              :          *
     352              :          * So we need to be careful about mapping buffers to memory pages. We
     353              :          * calculate the maximum number of pages a buffer might use, so that
     354              :          * we allocate enough space for the entries. And then we count the
     355              :          * actual number of entries as we scan the buffers.
     356              :          *
     357              :          * This information is needed before calling move_pages() for NUMA
     358              :          * node id inquiry.
     359              :          */
     360            2 :         os_page_size = pg_get_shmem_pagesize();
     361              : 
     362              :         /*
     363              :          * The pages and block size is expected to be 2^k, so one divides the
     364              :          * other (we don't know in which direction). This does not say
     365              :          * anything about relative alignment of pages/buffers.
     366              :          */
     367              :         Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));
     368              : 
     369            2 :         if (include_numa)
     370              :         {
     371            0 :             void      **os_page_ptrs = NULL;
     372              : 
     373              :             /*
     374              :              * How many addresses we are going to query?  Simply get the page
     375              :              * for the first buffer, and first page after the last buffer, and
     376              :              * count the pages from that.
     377              :              */
     378            0 :             startptr = (char *) TYPEALIGN_DOWN(os_page_size,
     379              :                                                BufferGetBlock(1));
     380            0 :             endptr = (char *) TYPEALIGN(os_page_size,
     381              :                                         (char *) BufferGetBlock(NBuffers) + BLCKSZ);
     382            0 :             os_page_count = (endptr - startptr) / os_page_size;
     383              : 
     384              :             /* Used to determine the NUMA node for all OS pages at once */
     385            0 :             os_page_ptrs = palloc0_array(void *, os_page_count);
     386            0 :             os_page_status = palloc_array(int, os_page_count);
     387              : 
     388              :             /*
     389              :              * Fill pointers for all the memory pages.  This loop stores and
     390              :              * touches (if needed) addresses into os_page_ptrs[] as input to
     391              :              * one big move_pages(2) inquiry system call, as done in
     392              :              * pg_numa_query_pages().
     393              :              */
     394            0 :             idx = 0;
     395            0 :             for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
     396              :             {
     397            0 :                 os_page_ptrs[idx++] = ptr;
     398              : 
     399              :                 /* Only need to touch memory once per backend process lifetime */
     400            0 :                 if (firstNumaTouch)
     401              :                     pg_numa_touch_mem_if_required(ptr);
     402              :             }
     403              : 
     404              :             Assert(idx == os_page_count);
     405              : 
     406            0 :             elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
     407              :                  "os_page_size=%zu", NBuffers, os_page_count, os_page_size);
     408              : 
     409              :             /*
     410              :              * If we ever get 0xff back from kernel inquiry, then we probably
     411              :              * have bug in our buffers to OS page mapping code here.
     412              :              */
     413            0 :             memset(os_page_status, 0xff, sizeof(int) * os_page_count);
     414              : 
     415              :             /* Query NUMA status for all the pointers */
     416            0 :             if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
     417            0 :                 elog(ERROR, "failed NUMA pages inquiry: %m");
     418              :         }
     419              : 
     420              :         /* Initialize the multi-call context, load entries about buffers */
     421              : 
     422            2 :         funcctx = SRF_FIRSTCALL_INIT();
     423              : 
     424              :         /* Switch context when allocating stuff to be used in later calls */
     425            2 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     426              : 
     427              :         /* Create a user function context for cross-call persistence */
     428            2 :         fctx = palloc_object(BufferCacheOsPagesContext);
     429              : 
     430            2 :         if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
     431            0 :             elog(ERROR, "return type must be a row type");
     432              : 
     433            2 :         if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
     434            0 :             elog(ERROR, "incorrect number of output arguments");
     435              : 
     436              :         /* Construct a tuple descriptor for the result rows. */
     437            2 :         tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
     438            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
     439              :                            INT4OID, -1, 0);
     440            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
     441              :                            INT8OID, -1, 0);
     442            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 3, "numa_node",
     443              :                            INT4OID, -1, 0);
     444              : 
     445            2 :         fctx->tupdesc = BlessTupleDesc(tupledesc);
     446            2 :         fctx->include_numa = include_numa;
     447              : 
     448              :         /*
     449              :          * Each buffer needs at least one entry, but it might be offset in
     450              :          * some way, and use one extra entry. So we allocate space for the
     451              :          * maximum number of entries we might need, and then count the exact
     452              :          * number as we're walking buffers. That way we can do it in one pass,
     453              :          * without reallocating memory.
     454              :          */
     455            2 :         pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
     456            2 :         max_entries = NBuffers * pages_per_buffer;
     457              : 
     458              :         /* Allocate entries for BufferCacheOsPagesRec records. */
     459            2 :         fctx->record = (BufferCacheOsPagesRec *)
     460            2 :             MemoryContextAllocHuge(CurrentMemoryContext,
     461              :                                    sizeof(BufferCacheOsPagesRec) * max_entries);
     462              : 
     463              :         /* Return to original context when allocating transient memory */
     464            2 :         MemoryContextSwitchTo(oldcontext);
     465              : 
     466            2 :         if (include_numa && firstNumaTouch)
     467            0 :             elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
     468              : 
     469              :         /*
     470              :          * Scan through all the buffers, saving the relevant fields in the
     471              :          * fctx->record structure.
     472              :          *
     473              :          * We don't hold the partition locks, so we don't get a consistent
     474              :          * snapshot across all buffers, but we do grab the buffer header
     475              :          * locks, so the information of each buffer is self-consistent.
     476              :          */
     477            2 :         startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
     478            2 :         idx = 0;
     479        32770 :         for (i = 0; i < NBuffers; i++)
     480              :         {
     481        32768 :             char       *buffptr = (char *) BufferGetBlock(i + 1);
     482              :             BufferDesc *bufHdr;
     483              :             uint32      bufferid;
     484              :             int32       page_num;
     485              :             char       *startptr_buff,
     486              :                        *endptr_buff;
     487              : 
     488        32768 :             CHECK_FOR_INTERRUPTS();
     489              : 
     490        32768 :             bufHdr = GetBufferDescriptor(i);
     491              : 
     492              :             /* Lock each buffer header before inspecting. */
     493        32768 :             LockBufHdr(bufHdr);
     494        32768 :             bufferid = BufferDescriptorGetBuffer(bufHdr);
     495        32768 :             UnlockBufHdr(bufHdr);
     496              : 
     497              :             /* start of the first page of this buffer */
     498        32768 :             startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
     499              : 
     500              :             /* end of the buffer (no need to align to memory page) */
     501        32768 :             endptr_buff = buffptr + BLCKSZ;
     502              : 
     503              :             Assert(startptr_buff < endptr_buff);
     504              : 
     505              :             /* calculate ID of the first page for this buffer */
     506        32768 :             page_num = (startptr_buff - startptr) / os_page_size;
     507              : 
     508              :             /* Add an entry for each OS page overlapping with this buffer. */
     509        98304 :             for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
     510              :             {
     511        65536 :                 fctx->record[idx].bufferid = bufferid;
     512        65536 :                 fctx->record[idx].page_num = page_num;
     513        65536 :                 fctx->record[idx].numa_node = include_numa ? os_page_status[page_num] : -1;
     514              : 
     515              :                 /* advance to the next entry/page */
     516        65536 :                 ++idx;
     517        65536 :                 ++page_num;
     518              :             }
     519              :         }
     520              : 
     521              :         Assert(idx <= max_entries);
     522              : 
     523              :         if (include_numa)
     524              :             Assert(idx >= os_page_count);
     525              : 
     526              :         /* Set max calls and remember the user function context. */
     527            2 :         funcctx->max_calls = idx;
     528            2 :         funcctx->user_fctx = fctx;
     529              : 
     530              :         /* Remember this backend touched the pages (only relevant for NUMA) */
     531            2 :         if (include_numa)
     532            0 :             firstNumaTouch = false;
     533              :     }
     534              : 
     535        65538 :     funcctx = SRF_PERCALL_SETUP();
     536              : 
     537              :     /* Get the saved state */
     538        65538 :     fctx = funcctx->user_fctx;
     539              : 
     540        65538 :     if (funcctx->call_cntr < funcctx->max_calls)
     541              :     {
     542        65536 :         uint32      i = funcctx->call_cntr;
     543              :         Datum       values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
     544              :         bool        nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
     545              : 
     546        65536 :         values[0] = Int32GetDatum(fctx->record[i].bufferid);
     547        65536 :         nulls[0] = false;
     548              : 
     549        65536 :         values[1] = Int64GetDatum(fctx->record[i].page_num);
     550        65536 :         nulls[1] = false;
     551              : 
     552        65536 :         if (fctx->include_numa)
     553              :         {
     554              :             /* status is valid node number */
     555            0 :             if (fctx->record[i].numa_node >= 0)
     556              :             {
     557            0 :                 values[2] = Int32GetDatum(fctx->record[i].numa_node);
     558            0 :                 nulls[2] = false;
     559              :             }
     560              :             else
     561              :             {
     562              :                 /* some kind of error (e.g. pages moved to swap) */
     563            0 :                 values[2] = (Datum) 0;
     564            0 :                 nulls[2] = true;
     565              :             }
     566              :         }
     567              :         else
     568              :         {
     569        65536 :             values[2] = (Datum) 0;
     570        65536 :             nulls[2] = true;
     571              :         }
     572              : 
     573              :         /* Build and return the tuple. */
     574        65536 :         tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
     575        65536 :         result = HeapTupleGetDatum(tuple);
     576              : 
     577        65536 :         SRF_RETURN_NEXT(funcctx, result);
     578              :     }
     579              :     else
     580            2 :         SRF_RETURN_DONE(funcctx);
     581              : }
     582              : 
     583              : /*
     584              :  * pg_buffercache_os_pages
     585              :  *
     586              :  * Retrieve information about OS pages, with or without NUMA information.
     587              :  */
     588              : Datum
     589        65538 : pg_buffercache_os_pages(PG_FUNCTION_ARGS)
     590              : {
     591              :     bool        include_numa;
     592              : 
     593              :     /* Get the boolean parameter that controls the NUMA behavior. */
     594        65538 :     include_numa = PG_GETARG_BOOL(0);
     595              : 
     596        65538 :     return pg_buffercache_os_pages_internal(fcinfo, include_numa);
     597              : }
     598              : 
     599              : /* Backward-compatible wrapper for v1.6. */
     600              : Datum
     601            0 : pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
     602              : {
     603              :     /* Call internal function with include_numa=true */
     604            0 :     return pg_buffercache_os_pages_internal(fcinfo, true);
     605              : }
     606              : 
     607              : Datum
     608            2 : pg_buffercache_summary(PG_FUNCTION_ARGS)
     609              : {
     610              :     Datum       result;
     611              :     TupleDesc   tupledesc;
     612              :     HeapTuple   tuple;
     613              :     Datum       values[NUM_BUFFERCACHE_SUMMARY_ELEM];
     614              :     bool        nulls[NUM_BUFFERCACHE_SUMMARY_ELEM];
     615              : 
     616            2 :     int32       buffers_used = 0;
     617            2 :     int32       buffers_unused = 0;
     618            2 :     int32       buffers_dirty = 0;
     619            2 :     int32       buffers_pinned = 0;
     620            2 :     int64       usagecount_total = 0;
     621              : 
     622            2 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     623            0 :         elog(ERROR, "return type must be a row type");
     624              : 
     625        32770 :     for (int i = 0; i < NBuffers; i++)
     626              :     {
     627              :         BufferDesc *bufHdr;
     628              :         uint64      buf_state;
     629              : 
     630        32768 :         CHECK_FOR_INTERRUPTS();
     631              : 
     632              :         /*
     633              :          * This function summarizes the state of all headers. Locking the
     634              :          * buffer headers wouldn't provide an improved result as the state of
     635              :          * the buffer can still change after we release the lock and it'd
     636              :          * noticeably increase the cost of the function.
     637              :          */
     638        32768 :         bufHdr = GetBufferDescriptor(i);
     639        32768 :         buf_state = pg_atomic_read_u64(&bufHdr->state);
     640              : 
     641        32768 :         if (buf_state & BM_VALID)
     642              :         {
     643         4006 :             buffers_used++;
     644         4006 :             usagecount_total += BUF_STATE_GET_USAGECOUNT(buf_state);
     645              : 
     646         4006 :             if (buf_state & BM_DIRTY)
     647         1904 :                 buffers_dirty++;
     648              :         }
     649              :         else
     650        28762 :             buffers_unused++;
     651              : 
     652        32768 :         if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
     653            0 :             buffers_pinned++;
     654              :     }
     655              : 
     656            2 :     memset(nulls, 0, sizeof(nulls));
     657            2 :     values[0] = Int32GetDatum(buffers_used);
     658            2 :     values[1] = Int32GetDatum(buffers_unused);
     659            2 :     values[2] = Int32GetDatum(buffers_dirty);
     660            2 :     values[3] = Int32GetDatum(buffers_pinned);
     661              : 
     662            2 :     if (buffers_used != 0)
     663            2 :         values[4] = Float8GetDatum((double) usagecount_total / buffers_used);
     664              :     else
     665            0 :         nulls[4] = true;
     666              : 
     667              :     /* Build and return the tuple. */
     668            2 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     669            2 :     result = HeapTupleGetDatum(tuple);
     670              : 
     671            2 :     PG_RETURN_DATUM(result);
     672              : }
     673              : 
     674              : Datum
     675            2 : pg_buffercache_usage_counts(PG_FUNCTION_ARGS)
     676              : {
     677            2 :     ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
     678            2 :     int         usage_counts[BM_MAX_USAGE_COUNT + 1] = {0};
     679            2 :     int         dirty[BM_MAX_USAGE_COUNT + 1] = {0};
     680            2 :     int         pinned[BM_MAX_USAGE_COUNT + 1] = {0};
     681              :     Datum       values[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM];
     682            2 :     bool        nulls[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM] = {0};
     683              : 
     684            2 :     InitMaterializedSRF(fcinfo, 0);
     685              : 
     686        32770 :     for (int i = 0; i < NBuffers; i++)
     687              :     {
     688        32768 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
     689        32768 :         uint64      buf_state = pg_atomic_read_u64(&bufHdr->state);
     690              :         int         usage_count;
     691              : 
     692        32768 :         CHECK_FOR_INTERRUPTS();
     693              : 
     694        32768 :         usage_count = BUF_STATE_GET_USAGECOUNT(buf_state);
     695        32768 :         usage_counts[usage_count]++;
     696              : 
     697        32768 :         if (buf_state & BM_DIRTY)
     698         1904 :             dirty[usage_count]++;
     699              : 
     700        32768 :         if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
     701            0 :             pinned[usage_count]++;
     702              :     }
     703              : 
     704           14 :     for (int i = 0; i < BM_MAX_USAGE_COUNT + 1; i++)
     705              :     {
     706           12 :         values[0] = Int32GetDatum(i);
     707           12 :         values[1] = Int32GetDatum(usage_counts[i]);
     708           12 :         values[2] = Int32GetDatum(dirty[i]);
     709           12 :         values[3] = Int32GetDatum(pinned[i]);
     710              : 
     711           12 :         tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
     712              :     }
     713              : 
     714            2 :     return (Datum) 0;
     715              : }
     716              : 
     717              : /*
     718              :  * Helper function to check if the user has superuser privileges.
     719              :  */
     720              : static void
     721           20 : pg_buffercache_superuser_check(char *func_name)
     722              : {
     723           20 :     if (!superuser())
     724            6 :         ereport(ERROR,
     725              :                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
     726              :                  errmsg("must be superuser to use %s()",
     727              :                         func_name)));
     728           14 : }
     729              : 
     730              : /*
     731              :  * Try to evict a shared buffer.
     732              :  */
     733              : Datum
     734            5 : pg_buffercache_evict(PG_FUNCTION_ARGS)
     735              : {
     736              :     Datum       result;
     737              :     TupleDesc   tupledesc;
     738              :     HeapTuple   tuple;
     739              :     Datum       values[NUM_BUFFERCACHE_EVICT_ELEM];
     740            5 :     bool        nulls[NUM_BUFFERCACHE_EVICT_ELEM] = {0};
     741              : 
     742            5 :     Buffer      buf = PG_GETARG_INT32(0);
     743              :     bool        buffer_flushed;
     744              : 
     745            5 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     746            0 :         elog(ERROR, "return type must be a row type");
     747              : 
     748            5 :     pg_buffercache_superuser_check("pg_buffercache_evict");
     749              : 
     750            4 :     if (buf < 1 || buf > NBuffers)
     751            3 :         elog(ERROR, "bad buffer ID: %d", buf);
     752              : 
     753            1 :     values[0] = BoolGetDatum(EvictUnpinnedBuffer(buf, &buffer_flushed));
     754            1 :     values[1] = BoolGetDatum(buffer_flushed);
     755              : 
     756            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     757            1 :     result = HeapTupleGetDatum(tuple);
     758              : 
     759            1 :     PG_RETURN_DATUM(result);
     760              : }
     761              : 
     762              : /*
     763              :  * Try to evict specified relation.
     764              :  */
     765              : Datum
     766            3 : pg_buffercache_evict_relation(PG_FUNCTION_ARGS)
     767              : {
     768              :     Datum       result;
     769              :     TupleDesc   tupledesc;
     770              :     HeapTuple   tuple;
     771              :     Datum       values[NUM_BUFFERCACHE_EVICT_RELATION_ELEM];
     772            3 :     bool        nulls[NUM_BUFFERCACHE_EVICT_RELATION_ELEM] = {0};
     773              : 
     774              :     Oid         relOid;
     775              :     Relation    rel;
     776              : 
     777            3 :     int32       buffers_evicted = 0;
     778            3 :     int32       buffers_flushed = 0;
     779            3 :     int32       buffers_skipped = 0;
     780              : 
     781            3 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     782            0 :         elog(ERROR, "return type must be a row type");
     783              : 
     784            3 :     pg_buffercache_superuser_check("pg_buffercache_evict_relation");
     785              : 
     786            2 :     relOid = PG_GETARG_OID(0);
     787              : 
     788            2 :     rel = relation_open(relOid, AccessShareLock);
     789              : 
     790            2 :     if (RelationUsesLocalBuffers(rel))
     791            1 :         ereport(ERROR,
     792              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     793              :                  errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
     794              :                         "pg_buffercache_evict_relation")));
     795              : 
     796            1 :     EvictRelUnpinnedBuffers(rel, &buffers_evicted, &buffers_flushed,
     797              :                             &buffers_skipped);
     798              : 
     799            1 :     relation_close(rel, AccessShareLock);
     800              : 
     801            1 :     values[0] = Int32GetDatum(buffers_evicted);
     802            1 :     values[1] = Int32GetDatum(buffers_flushed);
     803            1 :     values[2] = Int32GetDatum(buffers_skipped);
     804              : 
     805            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     806            1 :     result = HeapTupleGetDatum(tuple);
     807              : 
     808            1 :     PG_RETURN_DATUM(result);
     809              : }
     810              : 
     811              : 
     812              : /*
     813              :  * Try to evict all shared buffers.
     814              :  */
     815              : Datum
     816            2 : pg_buffercache_evict_all(PG_FUNCTION_ARGS)
     817              : {
     818              :     Datum       result;
     819              :     TupleDesc   tupledesc;
     820              :     HeapTuple   tuple;
     821              :     Datum       values[NUM_BUFFERCACHE_EVICT_ALL_ELEM];
     822            2 :     bool        nulls[NUM_BUFFERCACHE_EVICT_ALL_ELEM] = {0};
     823              : 
     824            2 :     int32       buffers_evicted = 0;
     825            2 :     int32       buffers_flushed = 0;
     826            2 :     int32       buffers_skipped = 0;
     827              : 
     828            2 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     829            0 :         elog(ERROR, "return type must be a row type");
     830              : 
     831            2 :     pg_buffercache_superuser_check("pg_buffercache_evict_all");
     832              : 
     833            1 :     EvictAllUnpinnedBuffers(&buffers_evicted, &buffers_flushed,
     834              :                             &buffers_skipped);
     835              : 
     836            1 :     values[0] = Int32GetDatum(buffers_evicted);
     837            1 :     values[1] = Int32GetDatum(buffers_flushed);
     838            1 :     values[2] = Int32GetDatum(buffers_skipped);
     839              : 
     840            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     841            1 :     result = HeapTupleGetDatum(tuple);
     842              : 
     843            1 :     PG_RETURN_DATUM(result);
     844              : }
     845              : 
     846              : /*
     847              :  * Try to mark a shared buffer as dirty.
     848              :  */
     849              : Datum
     850            5 : pg_buffercache_mark_dirty(PG_FUNCTION_ARGS)
     851              : {
     852              : 
     853              :     Datum       result;
     854              :     TupleDesc   tupledesc;
     855              :     HeapTuple   tuple;
     856              :     Datum       values[NUM_BUFFERCACHE_MARK_DIRTY_ELEM];
     857            5 :     bool        nulls[NUM_BUFFERCACHE_MARK_DIRTY_ELEM] = {0};
     858              : 
     859            5 :     Buffer      buf = PG_GETARG_INT32(0);
     860              :     bool        buffer_already_dirty;
     861              : 
     862            5 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     863            0 :         elog(ERROR, "return type must be a row type");
     864              : 
     865            5 :     pg_buffercache_superuser_check("pg_buffercache_mark_dirty");
     866              : 
     867            4 :     if (buf < 1 || buf > NBuffers)
     868            3 :         elog(ERROR, "bad buffer ID: %d", buf);
     869              : 
     870            1 :     values[0] = BoolGetDatum(MarkDirtyUnpinnedBuffer(buf, &buffer_already_dirty));
     871            1 :     values[1] = BoolGetDatum(buffer_already_dirty);
     872              : 
     873            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     874            1 :     result = HeapTupleGetDatum(tuple);
     875              : 
     876            1 :     PG_RETURN_DATUM(result);
     877              : }
     878              : 
     879              : /*
     880              :  * Try to mark all the shared buffers of a relation as dirty.
     881              :  */
     882              : Datum
     883            3 : pg_buffercache_mark_dirty_relation(PG_FUNCTION_ARGS)
     884              : {
     885              :     Datum       result;
     886              :     TupleDesc   tupledesc;
     887              :     HeapTuple   tuple;
     888              :     Datum       values[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM];
     889            3 :     bool        nulls[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM] = {0};
     890              : 
     891              :     Oid         relOid;
     892              :     Relation    rel;
     893              : 
     894            3 :     int32       buffers_already_dirty = 0;
     895            3 :     int32       buffers_dirtied = 0;
     896            3 :     int32       buffers_skipped = 0;
     897              : 
     898            3 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     899            0 :         elog(ERROR, "return type must be a row type");
     900              : 
     901            3 :     pg_buffercache_superuser_check("pg_buffercache_mark_dirty_relation");
     902              : 
     903            2 :     relOid = PG_GETARG_OID(0);
     904              : 
     905            2 :     rel = relation_open(relOid, AccessShareLock);
     906              : 
     907            2 :     if (RelationUsesLocalBuffers(rel))
     908            1 :         ereport(ERROR,
     909              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     910              :                  errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
     911              :                         "pg_buffercache_mark_dirty_relation")));
     912              : 
     913            1 :     MarkDirtyRelUnpinnedBuffers(rel, &buffers_dirtied, &buffers_already_dirty,
     914              :                                 &buffers_skipped);
     915              : 
     916            1 :     relation_close(rel, AccessShareLock);
     917              : 
     918            1 :     values[0] = Int32GetDatum(buffers_dirtied);
     919            1 :     values[1] = Int32GetDatum(buffers_already_dirty);
     920            1 :     values[2] = Int32GetDatum(buffers_skipped);
     921              : 
     922            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     923            1 :     result = HeapTupleGetDatum(tuple);
     924              : 
     925            1 :     PG_RETURN_DATUM(result);
     926              : }
     927              : 
     928              : /*
     929              :  * Try to mark all the shared buffers as dirty.
     930              :  */
     931              : Datum
     932            2 : pg_buffercache_mark_dirty_all(PG_FUNCTION_ARGS)
     933              : {
     934              :     Datum       result;
     935              :     TupleDesc   tupledesc;
     936              :     HeapTuple   tuple;
     937              :     Datum       values[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM];
     938            2 :     bool        nulls[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM] = {0};
     939              : 
     940            2 :     int32       buffers_already_dirty = 0;
     941            2 :     int32       buffers_dirtied = 0;
     942            2 :     int32       buffers_skipped = 0;
     943              : 
     944            2 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     945            0 :         elog(ERROR, "return type must be a row type");
     946              : 
     947            2 :     pg_buffercache_superuser_check("pg_buffercache_mark_dirty_all");
     948              : 
     949            1 :     MarkDirtyAllUnpinnedBuffers(&buffers_dirtied, &buffers_already_dirty,
     950              :                                 &buffers_skipped);
     951              : 
     952            1 :     values[0] = Int32GetDatum(buffers_dirtied);
     953            1 :     values[1] = Int32GetDatum(buffers_already_dirty);
     954            1 :     values[2] = Int32GetDatum(buffers_skipped);
     955              : 
     956            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     957            1 :     result = HeapTupleGetDatum(tuple);
     958              : 
     959            1 :     PG_RETURN_DATUM(result);
     960              : }
        

Generated by: LCOV version 2.0-1