LCOV - code coverage report
Current view: top level - contrib/pg_buffercache - pg_buffercache_pages.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 88.8 % 340 302
Test Date: 2026-04-06 21:16:29 Functions: 96.0 % 25 24
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * pg_buffercache_pages.c
       4              :  *    display some contents of the buffer cache
       5              :  *
       6              :  *    contrib/pg_buffercache/pg_buffercache_pages.c
       7              :  *-------------------------------------------------------------------------
       8              :  */
       9              : #include "postgres.h"
      10              : 
      11              : #include "access/htup_details.h"
      12              : #include "access/relation.h"
      13              : #include "catalog/pg_type.h"
      14              : #include "funcapi.h"
      15              : #include "port/pg_numa.h"
      16              : #include "storage/buf_internals.h"
      17              : #include "storage/bufmgr.h"
      18              : #include "utils/rel.h"
      19              : #include "utils/tuplestore.h"
      20              : 
      21              : 
      22              : #define NUM_BUFFERCACHE_PAGES_MIN_ELEM  8
      23              : #define NUM_BUFFERCACHE_PAGES_ELEM  9
      24              : #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
      25              : #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
      26              : #define NUM_BUFFERCACHE_EVICT_ELEM 2
      27              : #define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3
      28              : #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
      29              : #define NUM_BUFFERCACHE_MARK_DIRTY_ELEM 2
      30              : #define NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM 3
      31              : #define NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM 3
      32              : 
      33              : #define NUM_BUFFERCACHE_OS_PAGES_ELEM   3
      34              : 
/* Module magic block: identifies this loadable module's name and version. */
PG_MODULE_MAGIC_EXT(
                    .name = "pg_buffercache",
                    .version = PG_VERSION
);
      39              : 
/*
 * Record structure holding the to be exposed cache data.
 *
 * One record is captured per shared buffer, with all fields copied while
 * holding the buffer header spinlock (see pg_buffercache_pages()), so each
 * record is self-consistent even without a global snapshot.
 */
typedef struct
{
    uint32      bufferid;       /* buffer number, from BufferDescriptorGetBuffer() */
    RelFileNumber relfilenumber;    /* relation file number from the buffer tag */
    Oid         reltablespace;  /* tablespace OID from the buffer tag */
    Oid         reldatabase;    /* database OID from the buffer tag */
    ForkNumber  forknum;        /* relation fork from the buffer tag */
    BlockNumber blocknum;       /* block number within the fork */
    bool        isvalid;        /* both BM_VALID and BM_TAG_VALID were set */
    bool        isdirty;        /* BM_DIRTY was set */
    uint16      usagecount;     /* usage count from the buffer state word */

    /*
     * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
     * being pinned by too many backends and each backend will only pin once
     * because of bufmgr.c's PrivateRefCount infrastructure.
     */
    int32       pinning_backends;   /* refcount from the buffer state word */
} BufferCachePagesRec;
      62              : 
      63              : 
/*
 * Function context for data persisting over repeated calls.
 *
 * Allocated in the SRF's multi_call_memory_ctx on the first call to
 * pg_buffercache_pages() and read back on every per-row call.
 */
typedef struct
{
    TupleDesc   tupdesc;        /* blessed descriptor for result tuples */
    BufferCachePagesRec *record;    /* array of NBuffers captured records */
} BufferCachePagesContext;
      72              : 
/*
 * Record structure holding the to be exposed cache data for OS pages.  This
 * structure is used by pg_buffercache_os_pages(), where NUMA information may
 * or may not be included.
 */
typedef struct
{
    uint32      bufferid;       /* buffer this OS page (at least partly) backs */
    int64       page_num;       /* OS page number, counted from the first
                                 * page backing shared buffers */
    int32       numa_node;      /* NUMA node id, or -1 when NUMA information
                                 * was not requested */
} BufferCacheOsPagesRec;
      84              : 
/*
 * Function context for data persisting over repeated calls.
 *
 * Multi-call state for pg_buffercache_os_pages_internal(); lives in the
 * SRF's multi_call_memory_ctx.
 */
typedef struct
{
    TupleDesc   tupdesc;        /* blessed descriptor for result tuples */
    bool        include_numa;   /* whether numa_node values were collected */
    BufferCacheOsPagesRec *record;  /* array of per-OS-page entries */
} BufferCacheOsPagesContext;
      94              : 
      95              : 
/*
 * Function returning data from the shared buffer cache - buffer number,
 * relation node/tablespace/database/blocknum and dirty indicator.
 *
 * The macros below register every SQL-callable entry point of this module
 * with the version-1 C calling convention.
 */
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_os_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_summary);
PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
PG_FUNCTION_INFO_V1(pg_buffercache_evict);
PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation);
PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty);
PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_relation);
PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_all);
     111              : 
     112              : 
/*
 * Only need to touch memory once per backend process lifetime; cleared after
 * the first NUMA-enabled inquiry has page-faulted the buffer pool.
 */
static bool firstNumaTouch = true;
     115              : 
     116              : 
     117              : Datum
     118        32770 : pg_buffercache_pages(PG_FUNCTION_ARGS)
     119              : {
     120              :     FuncCallContext *funcctx;
     121              :     Datum       result;
     122              :     MemoryContext oldcontext;
     123              :     BufferCachePagesContext *fctx;  /* User function context. */
     124              :     TupleDesc   tupledesc;
     125              :     TupleDesc   expected_tupledesc;
     126              :     HeapTuple   tuple;
     127              : 
     128        32770 :     if (SRF_IS_FIRSTCALL())
     129              :     {
     130              :         int         i;
     131              : 
     132            2 :         funcctx = SRF_FIRSTCALL_INIT();
     133              : 
     134              :         /* Switch context when allocating stuff to be used in later calls */
     135            2 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     136              : 
     137              :         /* Create a user function context for cross-call persistence */
     138            2 :         fctx = palloc_object(BufferCachePagesContext);
     139              : 
     140              :         /*
     141              :          * To smoothly support upgrades from version 1.0 of this extension
     142              :          * transparently handle the (non-)existence of the pinning_backends
     143              :          * column. We unfortunately have to get the result type for that... -
     144              :          * we can't use the result type determined by the function definition
     145              :          * without potentially crashing when somebody uses the old (or even
     146              :          * wrong) function definition though.
     147              :          */
     148            2 :         if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
     149            0 :             elog(ERROR, "return type must be a row type");
     150              : 
     151            2 :         if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
     152            2 :             expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
     153            0 :             elog(ERROR, "incorrect number of output arguments");
     154              : 
     155              :         /* Construct a tuple descriptor for the result rows. */
     156            2 :         tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
     157            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
     158              :                            INT4OID, -1, 0);
     159            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
     160              :                            OIDOID, -1, 0);
     161            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
     162              :                            OIDOID, -1, 0);
     163            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
     164              :                            OIDOID, -1, 0);
     165            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
     166              :                            INT2OID, -1, 0);
     167            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
     168              :                            INT8OID, -1, 0);
     169            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
     170              :                            BOOLOID, -1, 0);
     171            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
     172              :                            INT2OID, -1, 0);
     173              : 
     174            2 :         if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
     175            2 :             TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
     176              :                                INT4OID, -1, 0);
     177              : 
     178            2 :         TupleDescFinalize(tupledesc);
     179            2 :         fctx->tupdesc = BlessTupleDesc(tupledesc);
     180              : 
     181              :         /* Allocate NBuffers worth of BufferCachePagesRec records. */
     182            2 :         fctx->record = (BufferCachePagesRec *)
     183            2 :             MemoryContextAllocHuge(CurrentMemoryContext,
     184              :                                    sizeof(BufferCachePagesRec) * NBuffers);
     185              : 
     186              :         /* Set max calls and remember the user function context. */
     187            2 :         funcctx->max_calls = NBuffers;
     188            2 :         funcctx->user_fctx = fctx;
     189              : 
     190              :         /* Return to original context when allocating transient memory */
     191            2 :         MemoryContextSwitchTo(oldcontext);
     192              : 
     193              :         /*
     194              :          * Scan through all the buffers, saving the relevant fields in the
     195              :          * fctx->record structure.
     196              :          *
     197              :          * We don't hold the partition locks, so we don't get a consistent
     198              :          * snapshot across all buffers, but we do grab the buffer header
     199              :          * locks, so the information of each buffer is self-consistent.
     200              :          */
     201        32770 :         for (i = 0; i < NBuffers; i++)
     202              :         {
     203              :             BufferDesc *bufHdr;
     204              :             uint64      buf_state;
     205              : 
     206        32768 :             CHECK_FOR_INTERRUPTS();
     207              : 
     208        32768 :             bufHdr = GetBufferDescriptor(i);
     209              :             /* Lock each buffer header before inspecting. */
     210        32768 :             buf_state = LockBufHdr(bufHdr);
     211              : 
     212        32768 :             fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
     213        32768 :             fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
     214        32768 :             fctx->record[i].reltablespace = bufHdr->tag.spcOid;
     215        32768 :             fctx->record[i].reldatabase = bufHdr->tag.dbOid;
     216        32768 :             fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
     217        32768 :             fctx->record[i].blocknum = bufHdr->tag.blockNum;
     218        32768 :             fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
     219        32768 :             fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
     220              : 
     221        32768 :             if (buf_state & BM_DIRTY)
     222         1954 :                 fctx->record[i].isdirty = true;
     223              :             else
     224        30814 :                 fctx->record[i].isdirty = false;
     225              : 
     226              :             /* Note if the buffer is valid, and has storage created */
     227        32768 :             if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
     228         4106 :                 fctx->record[i].isvalid = true;
     229              :             else
     230        28662 :                 fctx->record[i].isvalid = false;
     231              : 
     232        32768 :             UnlockBufHdr(bufHdr);
     233              :         }
     234              :     }
     235              : 
     236        32770 :     funcctx = SRF_PERCALL_SETUP();
     237              : 
     238              :     /* Get the saved state */
     239        32770 :     fctx = funcctx->user_fctx;
     240              : 
     241        32770 :     if (funcctx->call_cntr < funcctx->max_calls)
     242              :     {
     243        32768 :         uint32      i = funcctx->call_cntr;
     244              :         Datum       values[NUM_BUFFERCACHE_PAGES_ELEM];
     245              :         bool        nulls[NUM_BUFFERCACHE_PAGES_ELEM];
     246              : 
     247        32768 :         values[0] = Int32GetDatum(fctx->record[i].bufferid);
     248        32768 :         nulls[0] = false;
     249              : 
     250              :         /*
     251              :          * Set all fields except the bufferid to null if the buffer is unused
     252              :          * or not valid.
     253              :          */
     254        32768 :         if (fctx->record[i].blocknum == InvalidBlockNumber ||
     255         4106 :             fctx->record[i].isvalid == false)
     256              :         {
     257        28662 :             nulls[1] = true;
     258        28662 :             nulls[2] = true;
     259        28662 :             nulls[3] = true;
     260        28662 :             nulls[4] = true;
     261        28662 :             nulls[5] = true;
     262        28662 :             nulls[6] = true;
     263        28662 :             nulls[7] = true;
     264              :             /* unused for v1.0 callers, but the array is always long enough */
     265        28662 :             nulls[8] = true;
     266              :         }
     267              :         else
     268              :         {
     269         4106 :             values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
     270         4106 :             nulls[1] = false;
     271         4106 :             values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
     272         4106 :             nulls[2] = false;
     273         4106 :             values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
     274         4106 :             nulls[3] = false;
     275         4106 :             values[4] = Int16GetDatum(fctx->record[i].forknum);
     276         4106 :             nulls[4] = false;
     277         4106 :             values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
     278         4106 :             nulls[5] = false;
     279         4106 :             values[6] = BoolGetDatum(fctx->record[i].isdirty);
     280         4106 :             nulls[6] = false;
     281         4106 :             values[7] = UInt16GetDatum(fctx->record[i].usagecount);
     282         4106 :             nulls[7] = false;
     283              :             /* unused for v1.0 callers, but the array is always long enough */
     284         4106 :             values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
     285         4106 :             nulls[8] = false;
     286              :         }
     287              : 
     288              :         /* Build and return the tuple. */
     289        32768 :         tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
     290        32768 :         result = HeapTupleGetDatum(tuple);
     291              : 
     292        32768 :         SRF_RETURN_NEXT(funcctx, result);
     293              :     }
     294              :     else
     295            2 :         SRF_RETURN_DONE(funcctx);
     296              : }
     297              : 
     298              : /*
     299              :  * Inquire about OS pages mappings for shared buffers, with NUMA information,
     300              :  * optionally.
     301              :  *
      302              :  * When "include_numa" is false, this routine ignores everything related
     303              :  * to NUMA (returned as NULL values), returning mapping information between
     304              :  * shared buffers and OS pages.
     305              :  *
     306              :  * When "include_numa" is true, NUMA is initialized and numa_node values
     307              :  * are generated.  In order to get reliable results we also need to touch
     308              :  * memory pages, so that the inquiry about NUMA memory node does not return
     309              :  * -2, indicating unmapped/unallocated pages.
     310              :  *
     311              :  * Buffers may be smaller or larger than OS memory pages. For each buffer we
     312              :  * return one entry for each memory page used by the buffer (if the buffer is
     313              :  * smaller, it only uses a part of one memory page).
     314              :  *
     315              :  * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
     316              :  * one is always a multiple of the other.
     317              :  *
     318              :  */
     319              : static Datum
     320        65538 : pg_buffercache_os_pages_internal(FunctionCallInfo fcinfo, bool include_numa)
     321              : {
     322              :     FuncCallContext *funcctx;
     323              :     MemoryContext oldcontext;
     324              :     BufferCacheOsPagesContext *fctx;    /* User function context. */
     325              :     TupleDesc   tupledesc;
     326              :     TupleDesc   expected_tupledesc;
     327              :     HeapTuple   tuple;
     328              :     Datum       result;
     329              : 
     330        65538 :     if (SRF_IS_FIRSTCALL())
     331              :     {
     332              :         int         i,
     333              :                     idx;
     334              :         Size        os_page_size;
     335              :         int         pages_per_buffer;
     336            2 :         int        *os_page_status = NULL;
     337            2 :         uint64      os_page_count = 0;
     338              :         int         max_entries;
     339              :         char       *startptr,
     340              :                    *endptr;
     341              : 
     342              :         /* If NUMA information is requested, initialize NUMA support. */
     343            2 :         if (include_numa && pg_numa_init() == -1)
     344            0 :             elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
     345              : 
     346              :         /*
     347              :          * The database block size and OS memory page size are unlikely to be
     348              :          * the same. The block size is 1-32KB, the memory page size depends on
     349              :          * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
     350              :          * there are also features like THP etc. Moreover, we don't quite know
     351              :          * how the pages and buffers "align" in memory - the buffers may be
     352              :          * shifted in some way, using more memory pages than necessary.
     353              :          *
     354              :          * So we need to be careful about mapping buffers to memory pages. We
     355              :          * calculate the maximum number of pages a buffer might use, so that
     356              :          * we allocate enough space for the entries. And then we count the
     357              :          * actual number of entries as we scan the buffers.
     358              :          *
     359              :          * This information is needed before calling move_pages() for NUMA
     360              :          * node id inquiry.
     361              :          */
     362            2 :         os_page_size = pg_get_shmem_pagesize();
     363              : 
     364              :         /*
     365              :          * The pages and block size is expected to be 2^k, so one divides the
     366              :          * other (we don't know in which direction). This does not say
     367              :          * anything about relative alignment of pages/buffers.
     368              :          */
     369              :         Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));
     370              : 
     371            2 :         if (include_numa)
     372              :         {
     373            0 :             void      **os_page_ptrs = NULL;
     374              : 
     375              :             /*
     376              :              * How many addresses we are going to query?  Simply get the page
     377              :              * for the first buffer, and first page after the last buffer, and
     378              :              * count the pages from that.
     379              :              */
     380            0 :             startptr = (char *) TYPEALIGN_DOWN(os_page_size,
     381              :                                                BufferGetBlock(1));
     382            0 :             endptr = (char *) TYPEALIGN(os_page_size,
     383              :                                         (char *) BufferGetBlock(NBuffers) + BLCKSZ);
     384            0 :             os_page_count = (endptr - startptr) / os_page_size;
     385              : 
     386              :             /* Used to determine the NUMA node for all OS pages at once */
     387            0 :             os_page_ptrs = palloc0_array(void *, os_page_count);
     388            0 :             os_page_status = palloc_array(int, os_page_count);
     389              : 
     390              :             /*
     391              :              * Fill pointers for all the memory pages.  This loop stores and
     392              :              * touches (if needed) addresses into os_page_ptrs[] as input to
     393              :              * one big move_pages(2) inquiry system call, as done in
     394              :              * pg_numa_query_pages().
     395              :              */
     396            0 :             idx = 0;
     397            0 :             for (char *ptr = startptr; ptr < endptr; ptr += os_page_size)
     398              :             {
     399            0 :                 os_page_ptrs[idx++] = ptr;
     400              : 
     401              :                 /* Only need to touch memory once per backend process lifetime */
     402            0 :                 if (firstNumaTouch)
     403              :                     pg_numa_touch_mem_if_required(ptr);
     404              :             }
     405              : 
     406              :             Assert(idx == os_page_count);
     407              : 
     408            0 :             elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " "
     409              :                  "os_page_size=%zu", NBuffers, os_page_count, os_page_size);
     410              : 
     411              :             /*
     412              :              * If we ever get 0xff back from kernel inquiry, then we probably
     413              :              * have bug in our buffers to OS page mapping code here.
     414              :              */
     415            0 :             memset(os_page_status, 0xff, sizeof(int) * os_page_count);
     416              : 
     417              :             /* Query NUMA status for all the pointers */
     418            0 :             if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1)
     419            0 :                 elog(ERROR, "failed NUMA pages inquiry: %m");
     420              :         }
     421              : 
     422              :         /* Initialize the multi-call context, load entries about buffers */
     423              : 
     424            2 :         funcctx = SRF_FIRSTCALL_INIT();
     425              : 
     426              :         /* Switch context when allocating stuff to be used in later calls */
     427            2 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     428              : 
     429              :         /* Create a user function context for cross-call persistence */
     430            2 :         fctx = palloc_object(BufferCacheOsPagesContext);
     431              : 
     432            2 :         if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
     433            0 :             elog(ERROR, "return type must be a row type");
     434              : 
     435            2 :         if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM)
     436            0 :             elog(ERROR, "incorrect number of output arguments");
     437              : 
     438              :         /* Construct a tuple descriptor for the result rows. */
     439            2 :         tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
     440            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
     441              :                            INT4OID, -1, 0);
     442            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 2, "os_page_num",
     443              :                            INT8OID, -1, 0);
     444            2 :         TupleDescInitEntry(tupledesc, (AttrNumber) 3, "numa_node",
     445              :                            INT4OID, -1, 0);
     446              : 
     447            2 :         TupleDescFinalize(tupledesc);
     448            2 :         fctx->tupdesc = BlessTupleDesc(tupledesc);
     449            2 :         fctx->include_numa = include_numa;
     450              : 
     451              :         /*
     452              :          * Each buffer needs at least one entry, but it might be offset in
     453              :          * some way, and use one extra entry. So we allocate space for the
     454              :          * maximum number of entries we might need, and then count the exact
     455              :          * number as we're walking buffers. That way we can do it in one pass,
     456              :          * without reallocating memory.
     457              :          */
     458            2 :         pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
     459            2 :         max_entries = NBuffers * pages_per_buffer;
     460              : 
     461              :         /* Allocate entries for BufferCacheOsPagesRec records. */
     462            2 :         fctx->record = (BufferCacheOsPagesRec *)
     463            2 :             MemoryContextAllocHuge(CurrentMemoryContext,
     464              :                                    sizeof(BufferCacheOsPagesRec) * max_entries);
     465              : 
     466              :         /* Return to original context when allocating transient memory */
     467            2 :         MemoryContextSwitchTo(oldcontext);
     468              : 
     469            2 :         if (include_numa && firstNumaTouch)
     470            0 :             elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
     471              : 
     472              :         /*
     473              :          * Scan through all the buffers, saving the relevant fields in the
     474              :          * fctx->record structure.
     475              :          *
     476              :          * We don't hold the partition locks, so we don't get a consistent
     477              :          * snapshot across all buffers, but we do grab the buffer header
     478              :          * locks, so the information of each buffer is self-consistent.
     479              :          */
     480            2 :         startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
     481            2 :         idx = 0;
     482        32770 :         for (i = 0; i < NBuffers; i++)
     483              :         {
     484        32768 :             char       *buffptr = (char *) BufferGetBlock(i + 1);
     485              :             BufferDesc *bufHdr;
     486              :             uint32      bufferid;
     487              :             int32       page_num;
     488              :             char       *startptr_buff,
     489              :                        *endptr_buff;
     490              : 
     491        32768 :             CHECK_FOR_INTERRUPTS();
     492              : 
     493        32768 :             bufHdr = GetBufferDescriptor(i);
     494              : 
     495              :             /* Lock each buffer header before inspecting. */
     496        32768 :             LockBufHdr(bufHdr);
     497        32768 :             bufferid = BufferDescriptorGetBuffer(bufHdr);
     498        32768 :             UnlockBufHdr(bufHdr);
     499              : 
     500              :             /* start of the first page of this buffer */
     501        32768 :             startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
     502              : 
     503              :             /* end of the buffer (no need to align to memory page) */
     504        32768 :             endptr_buff = buffptr + BLCKSZ;
     505              : 
     506              :             Assert(startptr_buff < endptr_buff);
     507              : 
     508              :             /* calculate ID of the first page for this buffer */
     509        32768 :             page_num = (startptr_buff - startptr) / os_page_size;
     510              : 
     511              :             /* Add an entry for each OS page overlapping with this buffer. */
     512        98304 :             for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
     513              :             {
     514        65536 :                 fctx->record[idx].bufferid = bufferid;
     515        65536 :                 fctx->record[idx].page_num = page_num;
     516        65536 :                 fctx->record[idx].numa_node = include_numa ? os_page_status[page_num] : -1;
     517              : 
     518              :                 /* advance to the next entry/page */
     519        65536 :                 ++idx;
     520        65536 :                 ++page_num;
     521              :             }
     522              :         }
     523              : 
     524              :         Assert(idx <= max_entries);
     525              : 
     526              :         if (include_numa)
     527              :             Assert(idx >= os_page_count);
     528              : 
     529              :         /* Set max calls and remember the user function context. */
     530            2 :         funcctx->max_calls = idx;
     531            2 :         funcctx->user_fctx = fctx;
     532              : 
     533              :         /* Remember this backend touched the pages (only relevant for NUMA) */
     534            2 :         if (include_numa)
     535            0 :             firstNumaTouch = false;
     536              :     }
     537              : 
     538        65538 :     funcctx = SRF_PERCALL_SETUP();
     539              : 
     540              :     /* Get the saved state */
     541        65538 :     fctx = funcctx->user_fctx;
     542              : 
     543        65538 :     if (funcctx->call_cntr < funcctx->max_calls)
     544              :     {
     545        65536 :         uint32      i = funcctx->call_cntr;
     546              :         Datum       values[NUM_BUFFERCACHE_OS_PAGES_ELEM];
     547              :         bool        nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM];
     548              : 
     549        65536 :         values[0] = Int32GetDatum(fctx->record[i].bufferid);
     550        65536 :         nulls[0] = false;
     551              : 
     552        65536 :         values[1] = Int64GetDatum(fctx->record[i].page_num);
     553        65536 :         nulls[1] = false;
     554              : 
     555        65536 :         if (fctx->include_numa)
     556              :         {
     557              :             /* status is valid node number */
     558            0 :             if (fctx->record[i].numa_node >= 0)
     559              :             {
     560            0 :                 values[2] = Int32GetDatum(fctx->record[i].numa_node);
     561            0 :                 nulls[2] = false;
     562              :             }
     563              :             else
     564              :             {
     565              :                 /* some kind of error (e.g. pages moved to swap) */
     566            0 :                 values[2] = (Datum) 0;
     567            0 :                 nulls[2] = true;
     568              :             }
     569              :         }
     570              :         else
     571              :         {
     572        65536 :             values[2] = (Datum) 0;
     573        65536 :             nulls[2] = true;
     574              :         }
     575              : 
     576              :         /* Build and return the tuple. */
     577        65536 :         tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
     578        65536 :         result = HeapTupleGetDatum(tuple);
     579              : 
     580        65536 :         SRF_RETURN_NEXT(funcctx, result);
     581              :     }
     582              :     else
     583            2 :         SRF_RETURN_DONE(funcctx);
     584              : }
     585              : 
     586              : /*
     587              :  * pg_buffercache_os_pages
     588              :  *
     589              :  * Retrieve information about OS pages, with or without NUMA information.
     590              :  */
     591              : Datum
     592        65538 : pg_buffercache_os_pages(PG_FUNCTION_ARGS)
     593              : {
     594              :     bool        include_numa;
     595              : 
     596              :     /* Get the boolean parameter that controls the NUMA behavior. */
     597        65538 :     include_numa = PG_GETARG_BOOL(0);
     598              : 
     599        65538 :     return pg_buffercache_os_pages_internal(fcinfo, include_numa);
     600              : }
     601              : 
     602              : /* Backward-compatible wrapper for v1.6. */
     603              : Datum
     604            0 : pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
     605              : {
     606              :     /* Call internal function with include_numa=true */
     607            0 :     return pg_buffercache_os_pages_internal(fcinfo, true);
     608              : }
     609              : 
     610              : Datum
     611            2 : pg_buffercache_summary(PG_FUNCTION_ARGS)
     612              : {
     613              :     Datum       result;
     614              :     TupleDesc   tupledesc;
     615              :     HeapTuple   tuple;
     616              :     Datum       values[NUM_BUFFERCACHE_SUMMARY_ELEM];
     617              :     bool        nulls[NUM_BUFFERCACHE_SUMMARY_ELEM];
     618              : 
     619            2 :     int32       buffers_used = 0;
     620            2 :     int32       buffers_unused = 0;
     621            2 :     int32       buffers_dirty = 0;
     622            2 :     int32       buffers_pinned = 0;
     623            2 :     int64       usagecount_total = 0;
     624              : 
     625            2 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     626            0 :         elog(ERROR, "return type must be a row type");
     627              : 
     628        32770 :     for (int i = 0; i < NBuffers; i++)
     629              :     {
     630              :         BufferDesc *bufHdr;
     631              :         uint64      buf_state;
     632              : 
     633        32768 :         CHECK_FOR_INTERRUPTS();
     634              : 
     635              :         /*
     636              :          * This function summarizes the state of all headers. Locking the
     637              :          * buffer headers wouldn't provide an improved result as the state of
     638              :          * the buffer can still change after we release the lock and it'd
     639              :          * noticeably increase the cost of the function.
     640              :          */
     641        32768 :         bufHdr = GetBufferDescriptor(i);
     642        32768 :         buf_state = pg_atomic_read_u64(&bufHdr->state);
     643              : 
     644        32768 :         if (buf_state & BM_VALID)
     645              :         {
     646         4106 :             buffers_used++;
     647         4106 :             usagecount_total += BUF_STATE_GET_USAGECOUNT(buf_state);
     648              : 
     649         4106 :             if (buf_state & BM_DIRTY)
     650         1954 :                 buffers_dirty++;
     651              :         }
     652              :         else
     653        28662 :             buffers_unused++;
     654              : 
     655        32768 :         if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
     656            0 :             buffers_pinned++;
     657              :     }
     658              : 
     659            2 :     memset(nulls, 0, sizeof(nulls));
     660            2 :     values[0] = Int32GetDatum(buffers_used);
     661            2 :     values[1] = Int32GetDatum(buffers_unused);
     662            2 :     values[2] = Int32GetDatum(buffers_dirty);
     663            2 :     values[3] = Int32GetDatum(buffers_pinned);
     664              : 
     665            2 :     if (buffers_used != 0)
     666            2 :         values[4] = Float8GetDatum((double) usagecount_total / buffers_used);
     667              :     else
     668            0 :         nulls[4] = true;
     669              : 
     670              :     /* Build and return the tuple. */
     671            2 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     672            2 :     result = HeapTupleGetDatum(tuple);
     673              : 
     674            2 :     PG_RETURN_DATUM(result);
     675              : }
     676              : 
     677              : Datum
     678            2 : pg_buffercache_usage_counts(PG_FUNCTION_ARGS)
     679              : {
     680            2 :     ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
     681            2 :     int         usage_counts[BM_MAX_USAGE_COUNT + 1] = {0};
     682            2 :     int         dirty[BM_MAX_USAGE_COUNT + 1] = {0};
     683            2 :     int         pinned[BM_MAX_USAGE_COUNT + 1] = {0};
     684              :     Datum       values[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM];
     685            2 :     bool        nulls[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM] = {0};
     686              : 
     687            2 :     InitMaterializedSRF(fcinfo, 0);
     688              : 
     689        32770 :     for (int i = 0; i < NBuffers; i++)
     690              :     {
     691        32768 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
     692        32768 :         uint64      buf_state = pg_atomic_read_u64(&bufHdr->state);
     693              :         int         usage_count;
     694              : 
     695        32768 :         CHECK_FOR_INTERRUPTS();
     696              : 
     697        32768 :         usage_count = BUF_STATE_GET_USAGECOUNT(buf_state);
     698        32768 :         usage_counts[usage_count]++;
     699              : 
     700        32768 :         if (buf_state & BM_DIRTY)
     701         1954 :             dirty[usage_count]++;
     702              : 
     703        32768 :         if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
     704            0 :             pinned[usage_count]++;
     705              :     }
     706              : 
     707           14 :     for (int i = 0; i < BM_MAX_USAGE_COUNT + 1; i++)
     708              :     {
     709           12 :         values[0] = Int32GetDatum(i);
     710           12 :         values[1] = Int32GetDatum(usage_counts[i]);
     711           12 :         values[2] = Int32GetDatum(dirty[i]);
     712           12 :         values[3] = Int32GetDatum(pinned[i]);
     713              : 
     714           12 :         tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
     715              :     }
     716              : 
     717            2 :     return (Datum) 0;
     718              : }
     719              : 
     720              : /*
     721              :  * Helper function to check if the user has superuser privileges.
     722              :  */
     723              : static void
     724           20 : pg_buffercache_superuser_check(char *func_name)
     725              : {
     726           20 :     if (!superuser())
     727            6 :         ereport(ERROR,
     728              :                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
     729              :                  errmsg("must be superuser to use %s()",
     730              :                         func_name)));
     731           14 : }
     732              : 
     733              : /*
     734              :  * Try to evict a shared buffer.
     735              :  */
     736              : Datum
     737            5 : pg_buffercache_evict(PG_FUNCTION_ARGS)
     738              : {
     739              :     Datum       result;
     740              :     TupleDesc   tupledesc;
     741              :     HeapTuple   tuple;
     742              :     Datum       values[NUM_BUFFERCACHE_EVICT_ELEM];
     743            5 :     bool        nulls[NUM_BUFFERCACHE_EVICT_ELEM] = {0};
     744              : 
     745            5 :     Buffer      buf = PG_GETARG_INT32(0);
     746              :     bool        buffer_flushed;
     747              : 
     748            5 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     749            0 :         elog(ERROR, "return type must be a row type");
     750              : 
     751            5 :     pg_buffercache_superuser_check("pg_buffercache_evict");
     752              : 
     753            4 :     if (buf < 1 || buf > NBuffers)
     754            3 :         elog(ERROR, "bad buffer ID: %d", buf);
     755              : 
     756            1 :     values[0] = BoolGetDatum(EvictUnpinnedBuffer(buf, &buffer_flushed));
     757            1 :     values[1] = BoolGetDatum(buffer_flushed);
     758              : 
     759            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     760            1 :     result = HeapTupleGetDatum(tuple);
     761              : 
     762            1 :     PG_RETURN_DATUM(result);
     763              : }
     764              : 
     765              : /*
     766              :  * Try to evict specified relation.
     767              :  */
     768              : Datum
     769            3 : pg_buffercache_evict_relation(PG_FUNCTION_ARGS)
     770              : {
     771              :     Datum       result;
     772              :     TupleDesc   tupledesc;
     773              :     HeapTuple   tuple;
     774              :     Datum       values[NUM_BUFFERCACHE_EVICT_RELATION_ELEM];
     775            3 :     bool        nulls[NUM_BUFFERCACHE_EVICT_RELATION_ELEM] = {0};
     776              : 
     777              :     Oid         relOid;
     778              :     Relation    rel;
     779              : 
     780            3 :     int32       buffers_evicted = 0;
     781            3 :     int32       buffers_flushed = 0;
     782            3 :     int32       buffers_skipped = 0;
     783              : 
     784            3 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     785            0 :         elog(ERROR, "return type must be a row type");
     786              : 
     787            3 :     pg_buffercache_superuser_check("pg_buffercache_evict_relation");
     788              : 
     789            2 :     relOid = PG_GETARG_OID(0);
     790              : 
     791            2 :     rel = relation_open(relOid, AccessShareLock);
     792              : 
     793            2 :     if (RelationUsesLocalBuffers(rel))
     794            1 :         ereport(ERROR,
     795              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     796              :                  errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
     797              :                         "pg_buffercache_evict_relation")));
     798              : 
     799            1 :     EvictRelUnpinnedBuffers(rel, &buffers_evicted, &buffers_flushed,
     800              :                             &buffers_skipped);
     801              : 
     802            1 :     relation_close(rel, AccessShareLock);
     803              : 
     804            1 :     values[0] = Int32GetDatum(buffers_evicted);
     805            1 :     values[1] = Int32GetDatum(buffers_flushed);
     806            1 :     values[2] = Int32GetDatum(buffers_skipped);
     807              : 
     808            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     809            1 :     result = HeapTupleGetDatum(tuple);
     810              : 
     811            1 :     PG_RETURN_DATUM(result);
     812              : }
     813              : 
     814              : 
     815              : /*
     816              :  * Try to evict all shared buffers.
     817              :  */
     818              : Datum
     819            2 : pg_buffercache_evict_all(PG_FUNCTION_ARGS)
     820              : {
     821              :     Datum       result;
     822              :     TupleDesc   tupledesc;
     823              :     HeapTuple   tuple;
     824              :     Datum       values[NUM_BUFFERCACHE_EVICT_ALL_ELEM];
     825            2 :     bool        nulls[NUM_BUFFERCACHE_EVICT_ALL_ELEM] = {0};
     826              : 
     827            2 :     int32       buffers_evicted = 0;
     828            2 :     int32       buffers_flushed = 0;
     829            2 :     int32       buffers_skipped = 0;
     830              : 
     831            2 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     832            0 :         elog(ERROR, "return type must be a row type");
     833              : 
     834            2 :     pg_buffercache_superuser_check("pg_buffercache_evict_all");
     835              : 
     836            1 :     EvictAllUnpinnedBuffers(&buffers_evicted, &buffers_flushed,
     837              :                             &buffers_skipped);
     838              : 
     839            1 :     values[0] = Int32GetDatum(buffers_evicted);
     840            1 :     values[1] = Int32GetDatum(buffers_flushed);
     841            1 :     values[2] = Int32GetDatum(buffers_skipped);
     842              : 
     843            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     844            1 :     result = HeapTupleGetDatum(tuple);
     845              : 
     846            1 :     PG_RETURN_DATUM(result);
     847              : }
     848              : 
     849              : /*
     850              :  * Try to mark a shared buffer as dirty.
     851              :  */
     852              : Datum
     853            5 : pg_buffercache_mark_dirty(PG_FUNCTION_ARGS)
     854              : {
     855              : 
     856              :     Datum       result;
     857              :     TupleDesc   tupledesc;
     858              :     HeapTuple   tuple;
     859              :     Datum       values[NUM_BUFFERCACHE_MARK_DIRTY_ELEM];
     860            5 :     bool        nulls[NUM_BUFFERCACHE_MARK_DIRTY_ELEM] = {0};
     861              : 
     862            5 :     Buffer      buf = PG_GETARG_INT32(0);
     863              :     bool        buffer_already_dirty;
     864              : 
     865            5 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     866            0 :         elog(ERROR, "return type must be a row type");
     867              : 
     868            5 :     pg_buffercache_superuser_check("pg_buffercache_mark_dirty");
     869              : 
     870            4 :     if (buf < 1 || buf > NBuffers)
     871            3 :         elog(ERROR, "bad buffer ID: %d", buf);
     872              : 
     873            1 :     values[0] = BoolGetDatum(MarkDirtyUnpinnedBuffer(buf, &buffer_already_dirty));
     874            1 :     values[1] = BoolGetDatum(buffer_already_dirty);
     875              : 
     876            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     877            1 :     result = HeapTupleGetDatum(tuple);
     878              : 
     879            1 :     PG_RETURN_DATUM(result);
     880              : }
     881              : 
     882              : /*
     883              :  * Try to mark all the shared buffers of a relation as dirty.
     884              :  */
     885              : Datum
     886            3 : pg_buffercache_mark_dirty_relation(PG_FUNCTION_ARGS)
     887              : {
     888              :     Datum       result;
     889              :     TupleDesc   tupledesc;
     890              :     HeapTuple   tuple;
     891              :     Datum       values[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM];
     892            3 :     bool        nulls[NUM_BUFFERCACHE_MARK_DIRTY_RELATION_ELEM] = {0};
     893              : 
     894              :     Oid         relOid;
     895              :     Relation    rel;
     896              : 
     897            3 :     int32       buffers_already_dirty = 0;
     898            3 :     int32       buffers_dirtied = 0;
     899            3 :     int32       buffers_skipped = 0;
     900              : 
     901            3 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     902            0 :         elog(ERROR, "return type must be a row type");
     903              : 
     904            3 :     pg_buffercache_superuser_check("pg_buffercache_mark_dirty_relation");
     905              : 
     906            2 :     relOid = PG_GETARG_OID(0);
     907              : 
     908            2 :     rel = relation_open(relOid, AccessShareLock);
     909              : 
     910            2 :     if (RelationUsesLocalBuffers(rel))
     911            1 :         ereport(ERROR,
     912              :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     913              :                  errmsg("relation uses local buffers, %s() is intended to be used for shared buffers only",
     914              :                         "pg_buffercache_mark_dirty_relation")));
     915              : 
     916            1 :     MarkDirtyRelUnpinnedBuffers(rel, &buffers_dirtied, &buffers_already_dirty,
     917              :                                 &buffers_skipped);
     918              : 
     919            1 :     relation_close(rel, AccessShareLock);
     920              : 
     921            1 :     values[0] = Int32GetDatum(buffers_dirtied);
     922            1 :     values[1] = Int32GetDatum(buffers_already_dirty);
     923            1 :     values[2] = Int32GetDatum(buffers_skipped);
     924              : 
     925            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     926            1 :     result = HeapTupleGetDatum(tuple);
     927              : 
     928            1 :     PG_RETURN_DATUM(result);
     929              : }
     930              : 
     931              : /*
     932              :  * Try to mark all the shared buffers as dirty.
     933              :  */
     934              : Datum
     935            2 : pg_buffercache_mark_dirty_all(PG_FUNCTION_ARGS)
     936              : {
     937              :     Datum       result;
     938              :     TupleDesc   tupledesc;
     939              :     HeapTuple   tuple;
     940              :     Datum       values[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM];
     941            2 :     bool        nulls[NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM] = {0};
     942              : 
     943            2 :     int32       buffers_already_dirty = 0;
     944            2 :     int32       buffers_dirtied = 0;
     945            2 :     int32       buffers_skipped = 0;
     946              : 
     947            2 :     if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
     948            0 :         elog(ERROR, "return type must be a row type");
     949              : 
     950            2 :     pg_buffercache_superuser_check("pg_buffercache_mark_dirty_all");
     951              : 
     952            1 :     MarkDirtyAllUnpinnedBuffers(&buffers_dirtied, &buffers_already_dirty,
     953              :                                 &buffers_skipped);
     954              : 
     955            1 :     values[0] = Int32GetDatum(buffers_dirtied);
     956            1 :     values[1] = Int32GetDatum(buffers_already_dirty);
     957            1 :     values[2] = Int32GetDatum(buffers_skipped);
     958              : 
     959            1 :     tuple = heap_form_tuple(tupledesc, values, nulls);
     960            1 :     result = HeapTupleGetDatum(tuple);
     961              : 
     962            1 :     PG_RETURN_DATUM(result);
     963              : }
        

Generated by: LCOV version 2.0-1