LCOV - code coverage report
Current view: top level - src/backend/storage/freespace - freespace.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 91.3 % 230 210
Test Date: 2026-03-03 18:14:56 Functions: 100.0 % 22 22
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * freespace.c
       4              :  *    POSTGRES free space map for quickly finding free space in relations
       5              :  *
       6              :  *
       7              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       8              :  * Portions Copyright (c) 1994, Regents of the University of California
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/storage/freespace/freespace.c
      12              :  *
      13              :  *
      14              :  * NOTES:
      15              :  *
      16              :  *  Free Space Map keeps track of the amount of free space on pages, and
      17              :  *  allows quickly searching for a page with enough free space. The FSM is
      18              :  *  stored in a dedicated relation fork of all heap relations, and those
      19              :  *  index access methods that need it (see also indexfsm.c). See README for
      20              :  *  more information.
      21              :  *
      22              :  *-------------------------------------------------------------------------
      23              :  */
      24              : #include "postgres.h"
      25              : 
      26              : #include "access/htup_details.h"
      27              : #include "access/xloginsert.h"
      28              : #include "access/xlogutils.h"
      29              : #include "miscadmin.h"
      30              : #include "storage/freespace.h"
      31              : #include "storage/fsm_internals.h"
      32              : #include "storage/smgr.h"
      33              : #include "utils/rel.h"
      34              : 
      35              : 
      36              : /*
      37              :  * We use just one byte to store the amount of free space on a page, so we
      38              :  * divide the amount of free space a page can have into 256 different
      39              :  * categories. The highest category, 255, represents a page with at least
      40              :  * MaxFSMRequestSize bytes of free space, and the second highest category
      41              :  * represents the range from 254 * FSM_CAT_STEP, inclusive, to
      42              :  * MaxFSMRequestSize, exclusive.
      43              :  *
      44              :  * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
      45              :  * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
      46              :  * categories look like this:
      47              :  *
      48              :  *
      49              :  * Range     Category
      50              :  * 0    - 31   0
      51              :  * 32   - 63   1
      52              :  * ...    ...  ...
      53              :  * 8096 - 8127 253
      54              :  * 8128 - 8163 254
      55              :  * 8164 - 8192 255
      56              :  *
      57              :  * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
      58              :  * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
      59              :  * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
      60              :  * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
      61              :  * completely empty page, that would mean that we could never satisfy a
      62              :  * request of exactly MaxFSMRequestSize bytes.
      63              :  */
      64              : #define FSM_CATEGORIES  256
      65              : #define FSM_CAT_STEP    (BLCKSZ / FSM_CATEGORIES)
      66              : #define MaxFSMRequestSize   MaxHeapTupleSize
      67              : 
      68              : /*
      69              :  * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
      70              :  * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
      71              :  * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
      72              :  * this means that 4096 bytes is the smallest BLCKSZ that we can get away
      73              :  * with a 3-level tree, and 512 is the smallest we support.
      74              :  */
      75              : #define FSM_TREE_DEPTH  ((SlotsPerFSMPage >= 1626) ? 3 : 4)
      76              : 
      77              : #define FSM_ROOT_LEVEL  (FSM_TREE_DEPTH - 1)
      78              : #define FSM_BOTTOM_LEVEL 0
      79              : 
      80              : /*
      81              :  * The internal FSM routines work on a logical addressing scheme. Each
      82              :  * level of the tree can be thought of as a separately addressable file.
      83              :  */
      84              : typedef struct
      85              : {
      86              :     int         level;          /* level */
      87              :     int         logpageno;      /* page number within the level */
      88              : } FSMAddress;
      89              : 
      90              : /* Address of the root page. */
      91              : static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
      92              : 
      93              : /* functions to navigate the tree */
      94              : static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
      95              : static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
      96              : static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
      97              : static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
      98              : static BlockNumber fsm_logical_to_physical(FSMAddress addr);
      99              : 
     100              : static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
     101              : static Buffer fsm_extend(Relation rel, BlockNumber fsm_nblocks);
     102              : 
     103              : /* functions to convert amount of free space to a FSM category */
     104              : static uint8 fsm_space_avail_to_cat(Size avail);
     105              : static uint8 fsm_space_needed_to_cat(Size needed);
     106              : static Size fsm_space_cat_to_avail(uint8 cat);
     107              : 
     108              : /* workhorse functions for various operations */
     109              : static int  fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
     110              :                                uint8 newValue, uint8 minValue);
     111              : static BlockNumber fsm_search(Relation rel, uint8 min_cat);
     112              : static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
     113              :                              BlockNumber start, BlockNumber end,
     114              :                              bool *eof_p);
     115              : static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber);
     116              : 
     117              : 
     118              : /******** Public API ********/
     119              : 
     120              : /*
     121              :  * GetPageWithFreeSpace - try to find a page in the given relation with
     122              :  *      at least the specified amount of free space.
     123              :  *
     124              :  * If successful, return the block number; if not, return InvalidBlockNumber.
     125              :  *
     126              :  * The caller must be prepared for the possibility that the returned page
     127              :  * will turn out to have too little space available by the time the caller
     128              :  * gets a lock on it.  In that case, the caller should report the actual
     129              :  * amount of free space available on that page and then try again (see
     130              :  * RecordAndGetPageWithFreeSpace).  If InvalidBlockNumber is returned,
     131              :  * extend the relation.
     132              :  *
     133              :  * This can trigger FSM updates if any FSM entry is found to point to a block
     134              :  * past the end of the relation.
     135              :  */
     136              : BlockNumber
     137        89776 : GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
     138              : {
     139        89776 :     uint8       min_cat = fsm_space_needed_to_cat(spaceNeeded);
     140              : 
     141        89776 :     return fsm_search(rel, min_cat);
     142              : }
     143              : 
     144              : /*
     145              :  * RecordAndGetPageWithFreeSpace - update info about a page and try again.
     146              :  *
     147              :  * We provide this combo form to save some locking overhead, compared to
     148              :  * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
     149              :  * also some effort to return a page close to the old page; if there's a
     150              :  * page with enough free space on the same FSM page where the old one page
     151              :  * is located, it is preferred.
     152              :  */
     153              : BlockNumber
     154       111892 : RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
     155              :                               Size oldSpaceAvail, Size spaceNeeded)
     156              : {
     157       111892 :     int         old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
     158       111892 :     int         search_cat = fsm_space_needed_to_cat(spaceNeeded);
     159              :     FSMAddress  addr;
     160              :     uint16      slot;
     161              :     int         search_slot;
     162              : 
     163              :     /* Get the location of the FSM byte representing the heap block */
     164       111892 :     addr = fsm_get_location(oldPage, &slot);
     165              : 
     166       111892 :     search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
     167              : 
     168              :     /*
     169              :      * If fsm_set_and_search found a suitable new block, return that.
     170              :      * Otherwise, search as usual.
     171              :      */
     172       111892 :     if (search_slot != -1)
     173              :     {
     174        14817 :         BlockNumber blknum = fsm_get_heap_blk(addr, search_slot);
     175              : 
     176              :         /*
     177              :          * Check that the blknum is actually in the relation. Don't try to
     178              :          * update the FSM in that case, just fall back to the other case
     179              :          */
     180        14817 :         if (fsm_does_block_exist(rel, blknum))
     181        14817 :             return blknum;
     182              :     }
     183        97075 :     return fsm_search(rel, search_cat);
     184              : }
     185              : 
     186              : /*
     187              :  * RecordPageWithFreeSpace - update info about a page.
     188              :  *
     189              :  * Note that if the new spaceAvail value is higher than the old value stored
     190              :  * in the FSM, the space might not become visible to searchers until the next
     191              :  * FreeSpaceMapVacuum call, which updates the upper level pages.
     192              :  */
     193              : void
     194       537143 : RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
     195              : {
     196       537143 :     int         new_cat = fsm_space_avail_to_cat(spaceAvail);
     197              :     FSMAddress  addr;
     198              :     uint16      slot;
     199              : 
     200              :     /* Get the location of the FSM byte representing the heap block */
     201       537143 :     addr = fsm_get_location(heapBlk, &slot);
     202              : 
     203       537143 :     fsm_set_and_search(rel, addr, slot, new_cat, 0);
     204       537143 : }
     205              : 
     206              : /*
     207              :  * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
     208              :  *      WAL replay
     209              :  */
     210              : void
     211       303360 : XLogRecordPageWithFreeSpace(RelFileLocator rlocator, BlockNumber heapBlk,
     212              :                             Size spaceAvail)
     213              : {
     214       303360 :     int         new_cat = fsm_space_avail_to_cat(spaceAvail);
     215              :     FSMAddress  addr;
     216              :     uint16      slot;
     217              :     BlockNumber blkno;
     218              :     Buffer      buf;
     219              :     Page        page;
     220              : 
     221              :     /* Get the location of the FSM byte representing the heap block */
     222       303360 :     addr = fsm_get_location(heapBlk, &slot);
     223       303360 :     blkno = fsm_logical_to_physical(addr);
     224              : 
     225              :     /* If the page doesn't exist already, extend */
     226       303360 :     buf = XLogReadBufferExtended(rlocator, FSM_FORKNUM, blkno,
     227              :                                  RBM_ZERO_ON_ERROR, InvalidBuffer);
     228       303360 :     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     229              : 
     230       303360 :     page = BufferGetPage(buf);
     231       303360 :     if (PageIsNew(page))
     232          539 :         PageInit(page, BLCKSZ, 0);
     233              : 
     234       303360 :     if (fsm_set_avail(page, slot, new_cat))
     235       297705 :         MarkBufferDirtyHint(buf, false);
     236       303360 :     UnlockReleaseBuffer(buf);
     237       303360 : }
     238              : 
     239              : /*
     240              :  * GetRecordedFreeSpace - return the amount of free space on a particular page,
     241              :  *      according to the FSM.
     242              :  */
     243              : Size
     244          630 : GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
     245              : {
     246              :     FSMAddress  addr;
     247              :     uint16      slot;
     248              :     Buffer      buf;
     249              :     uint8       cat;
     250              : 
     251              :     /* Get the location of the FSM byte representing the heap block */
     252          630 :     addr = fsm_get_location(heapBlk, &slot);
     253              : 
     254          630 :     buf = fsm_readbuf(rel, addr, false);
     255          630 :     if (!BufferIsValid(buf))
     256           36 :         return 0;
     257          594 :     cat = fsm_get_avail(BufferGetPage(buf), slot);
     258          594 :     ReleaseBuffer(buf);
     259              : 
     260          594 :     return fsm_space_cat_to_avail(cat);
     261              : }
     262              : 
     263              : /*
     264              :  * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation.
     265              :  *
     266              :  * nblocks is the new size of the heap.
     267              :  *
     268              :  * Return the number of blocks of new FSM.
     269              :  * If it's InvalidBlockNumber, there is nothing to truncate;
     270              :  * otherwise the caller is responsible for calling smgrtruncate()
     271              :  * to truncate the FSM pages, and FreeSpaceMapVacuumRange()
     272              :  * to update upper-level pages in the FSM.
     273              :  */
     274              : BlockNumber
     275          184 : FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
     276              : {
     277              :     BlockNumber new_nfsmblocks;
     278              :     FSMAddress  first_removed_address;
     279              :     uint16      first_removed_slot;
     280              :     Buffer      buf;
     281              : 
     282              :     /*
     283              :      * If no FSM has been created yet for this relation, there's nothing to
     284              :      * truncate.
     285              :      */
     286          184 :     if (!smgrexists(RelationGetSmgr(rel), FSM_FORKNUM))
     287            0 :         return InvalidBlockNumber;
     288              : 
     289              :     /* Get the location in the FSM of the first removed heap block */
     290          184 :     first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
     291              : 
     292              :     /*
     293              :      * Zero out the tail of the last remaining FSM page. If the slot
     294              :      * representing the first removed heap block is at a page boundary, as the
     295              :      * first slot on the FSM page that first_removed_address points to, we can
     296              :      * just truncate that page altogether.
     297              :      */
     298          184 :     if (first_removed_slot > 0)
     299              :     {
     300           96 :         buf = fsm_readbuf(rel, first_removed_address, false);
     301           96 :         if (!BufferIsValid(buf))
     302            0 :             return InvalidBlockNumber;  /* nothing to do; the FSM was already
     303              :                                          * smaller */
     304           96 :         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     305              : 
     306              :         /* NO EREPORT(ERROR) from here till changes are logged */
     307           96 :         START_CRIT_SECTION();
     308              : 
     309           96 :         fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
     310              : 
     311              :         /*
     312              :          * This change is non-critical, because fsm_does_block_exist() would
     313              :          * stop us from returning a truncated-away block.  However, since this
     314              :          * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost
     315              :          * of that many fsm_does_block_exist() rejections.  Use a full
     316              :          * MarkBufferDirty(), not MarkBufferDirtyHint().
     317              :          */
     318           96 :         MarkBufferDirty(buf);
     319              : 
     320              :         /*
     321              :          * WAL-log like MarkBufferDirtyHint() might have done, just to avoid
     322              :          * differing from the rest of the file in this respect.  This is
     323              :          * optional; see README mention of full page images.  XXX consider
     324              :          * XLogSaveBufferForHint() for even closer similarity.
     325              :          *
     326              :          * A higher-level operation calls us at WAL replay.  If we crash
     327              :          * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has
     328              :          * not changed, and our fork remains valid.  If we crash after that
     329              :          * flush, redo will return here.
     330              :          */
     331           96 :         if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
     332           73 :             log_newpage_buffer(buf, false);
     333              : 
     334           96 :         END_CRIT_SECTION();
     335              : 
     336           96 :         UnlockReleaseBuffer(buf);
     337              : 
     338           96 :         new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
     339              :     }
     340              :     else
     341              :     {
     342           88 :         new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
     343           88 :         if (smgrnblocks(RelationGetSmgr(rel), FSM_FORKNUM) <= new_nfsmblocks)
     344            0 :             return InvalidBlockNumber;  /* nothing to do; the FSM was already
     345              :                                          * smaller */
     346              :     }
     347              : 
     348          184 :     return new_nfsmblocks;
     349              : }
     350              : 
     351              : /*
     352              :  * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM
     353              :  *
     354              :  * We assume that the bottom-level pages have already been updated with
     355              :  * new free-space information.
     356              :  */
     357              : void
     358          152 : FreeSpaceMapVacuum(Relation rel)
     359              : {
     360              :     bool        dummy;
     361              : 
     362              :     /* Recursively scan the tree, starting at the root */
     363          152 :     (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS,
     364              :                            (BlockNumber) 0, InvalidBlockNumber,
     365              :                            &dummy);
     366          152 : }
     367              : 
     368              : /*
     369              :  * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM
     370              :  *
     371              :  * As above, but assume that only heap pages between start and end-1 inclusive
     372              :  * have new free-space information, so update only the upper-level slots
     373              :  * covering that block range.  end == InvalidBlockNumber is equivalent to
     374              :  * "all the rest of the relation".
     375              :  */
     376              : void
     377        51007 : FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
     378              : {
     379              :     bool        dummy;
     380              : 
     381              :     /* Recursively scan the tree, starting at the root */
     382        51007 :     if (end > start)
     383        50816 :         (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
     384        51007 : }
     385              : 
     386              : /******** Internal routines ********/
     387              : 
     388              : /*
     389              :  * Return category corresponding x bytes of free space
     390              :  */
     391              : static uint8
     392       952395 : fsm_space_avail_to_cat(Size avail)
     393              : {
     394              :     int         cat;
     395              : 
     396              :     Assert(avail < BLCKSZ);
     397              : 
     398       952395 :     if (avail >= MaxFSMRequestSize)
     399        18539 :         return 255;
     400              : 
     401       933856 :     cat = avail / FSM_CAT_STEP;
     402              : 
     403              :     /*
     404              :      * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
     405              :      * more.
     406              :      */
     407       933856 :     if (cat > 254)
     408            0 :         cat = 254;
     409              : 
     410       933856 :     return (uint8) cat;
     411              : }
     412              : 
     413              : /*
     414              :  * Return the lower bound of the range of free space represented by given
     415              :  * category.
     416              :  */
     417              : static Size
     418          594 : fsm_space_cat_to_avail(uint8 cat)
     419              : {
     420              :     /* The highest category represents exactly MaxFSMRequestSize bytes. */
     421          594 :     if (cat == 255)
     422          156 :         return MaxFSMRequestSize;
     423              :     else
     424          438 :         return cat * FSM_CAT_STEP;
     425              : }
     426              : 
     427              : /*
     428              :  * Which category does a page need to have, to accommodate x bytes of data?
     429              :  * While fsm_space_avail_to_cat() rounds down, this needs to round up.
     430              :  */
     431              : static uint8
     432       201668 : fsm_space_needed_to_cat(Size needed)
     433              : {
     434              :     int         cat;
     435              : 
     436              :     /* Can't ask for more space than the highest category represents */
     437       201668 :     if (needed > MaxFSMRequestSize)
     438            0 :         elog(ERROR, "invalid FSM request size %zu", needed);
     439              : 
     440       201668 :     if (needed == 0)
     441            0 :         return 1;
     442              : 
     443       201668 :     cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
     444              : 
     445       201668 :     if (cat > 255)
     446            0 :         cat = 255;
     447              : 
     448       201668 :     return (uint8) cat;
     449              : }
     450              : 
     451              : /*
     452              :  * Returns the physical block number of a FSM page
     453              :  */
     454              : static BlockNumber
     455      1324523 : fsm_logical_to_physical(FSMAddress addr)
     456              : {
     457              :     BlockNumber pages;
     458              :     int         leafno;
     459              :     int         l;
     460              : 
     461              :     /*
     462              :      * Calculate the logical page number of the first leaf page below the
     463              :      * given page.
     464              :      */
     465      1324523 :     leafno = addr.logpageno;
     466      1872569 :     for (l = 0; l < addr.level; l++)
     467       548046 :         leafno *= SlotsPerFSMPage;
     468              : 
     469              :     /* Count upper level nodes required to address the leaf page */
     470      1324523 :     pages = 0;
     471      5298092 :     for (l = 0; l < FSM_TREE_DEPTH; l++)
     472              :     {
     473      3973569 :         pages += leafno + 1;
     474      3973569 :         leafno /= SlotsPerFSMPage;
     475              :     }
     476              : 
     477              :     /*
     478              :      * If the page we were asked for wasn't at the bottom level, subtract the
     479              :      * additional lower level pages we counted above.
     480              :      */
     481      1324523 :     pages -= addr.level;
     482              : 
     483              :     /* Turn the page count into 0-based block number */
     484      1324523 :     return pages - 1;
     485              : }
     486              : 
     487              : /*
     488              :  * Return the FSM location corresponding to given heap block.
     489              :  */
     490              : static FSMAddress
     491      1157005 : fsm_get_location(BlockNumber heapblk, uint16 *slot)
     492              : {
     493              :     FSMAddress  addr;
     494              : 
     495      1157005 :     addr.level = FSM_BOTTOM_LEVEL;
     496      1157005 :     addr.logpageno = heapblk / SlotsPerFSMPage;
     497      1157005 :     *slot = heapblk % SlotsPerFSMPage;
     498              : 
     499      1157005 :     return addr;
     500              : }
     501              : 
     502              : /*
     503              :  * Return the heap block number corresponding to given location in the FSM.
     504              :  */
     505              : static BlockNumber
     506        26692 : fsm_get_heap_blk(FSMAddress addr, uint16 slot)
     507              : {
     508              :     Assert(addr.level == FSM_BOTTOM_LEVEL);
     509        26692 :     return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
     510              : }
     511              : 
     512              : /*
     513              :  * Given a logical address of a child page, get the logical page number of
     514              :  * the parent, and the slot within the parent corresponding to the child.
     515              :  */
     516              : static FSMAddress
     517       307753 : fsm_get_parent(FSMAddress child, uint16 *slot)
     518              : {
     519              :     FSMAddress  parent;
     520              : 
     521              :     Assert(child.level < FSM_ROOT_LEVEL);
     522              : 
     523       307753 :     parent.level = child.level + 1;
     524       307753 :     parent.logpageno = child.logpageno / SlotsPerFSMPage;
     525       307753 :     *slot = child.logpageno % SlotsPerFSMPage;
     526              : 
     527       307753 :     return parent;
     528              : }
     529              : 
     530              : /*
     531              :  * Given a logical address of a parent page and a slot number, get the
     532              :  * logical address of the corresponding child page.
     533              :  */
     534              : static FSMAddress
     535       129281 : fsm_get_child(FSMAddress parent, uint16 slot)
     536              : {
     537              :     FSMAddress  child;
     538              : 
     539              :     Assert(parent.level > FSM_BOTTOM_LEVEL);
     540              : 
     541       129281 :     child.level = parent.level - 1;
     542       129281 :     child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
     543              : 
     544       129281 :     return child;
     545              : }
     546              : 
     547              : /*
     548              :  * Read a FSM page.
     549              :  *
     550              :  * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
     551              :  * true, the FSM file is extended.
     552              :  */
     553              : static Buffer
     554      1020979 : fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
     555              : {
     556      1020979 :     BlockNumber blkno = fsm_logical_to_physical(addr);
     557              :     Buffer      buf;
     558      1020979 :     SMgrRelation reln = RelationGetSmgr(rel);
     559              : 
     560              :     /*
     561              :      * If we haven't cached the size of the FSM yet, check it first.  Also
     562              :      * recheck if the requested block seems to be past end, since our cached
     563              :      * value might be stale.  (We send smgr inval messages on truncation, but
     564              :      * not on extension.)
     565              :      */
     566      1020979 :     if (reln->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber ||
     567       911202 :         blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
     568              :     {
     569              :         /* Invalidate the cache so smgrnblocks asks the kernel. */
     570       142593 :         reln->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
     571       142593 :         if (smgrexists(reln, FSM_FORKNUM))
     572        79852 :             smgrnblocks(reln, FSM_FORKNUM);
     573              :         else
     574        62741 :             reln->smgr_cached_nblocks[FSM_FORKNUM] = 0;
     575              :     }
     576              : 
     577              :     /*
     578              :      * For reading we use ZERO_ON_ERROR mode, and initialize the page if
     579              :      * necessary.  The FSM information is not accurate anyway, so it's better
     580              :      * to clear corrupt pages than error out. Since the FSM changes are not
     581              :      * WAL-logged, the so-called torn page problem on crash can lead to pages
     582              :      * with corrupt headers, for example.
     583              :      *
     584              :      * We use the same path below to initialize pages when extending the
     585              :      * relation, as a concurrent extension can end up with vm_extend()
     586              :      * returning an already-initialized page.
     587              :      */
     588      1020979 :     if (blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
     589              :     {
     590        63398 :         if (extend)
     591         4121 :             buf = fsm_extend(rel, blkno + 1);
     592              :         else
     593        59277 :             return InvalidBuffer;
     594              :     }
     595              :     else
     596       957581 :         buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
     597              : 
     598              :     /*
     599              :      * Initializing the page when needed is trickier than it looks, because of
     600              :      * the possibility of multiple backends doing this concurrently, and our
     601              :      * desire to not uselessly take the buffer lock in the normal path where
     602              :      * the page is OK.  We must take the lock to initialize the page, so
     603              :      * recheck page newness after we have the lock, in case someone else
     604              :      * already did it.  Also, because we initially check PageIsNew with no
     605              :      * lock, it's possible to fall through and return the buffer while someone
     606              :      * else is still initializing the page (i.e., we might see pd_upper as set
     607              :      * but other page header fields are still zeroes).  This is harmless for
     608              :      * callers that will take a buffer lock themselves, but some callers
     609              :      * inspect the page without any lock at all.  The latter is OK only so
     610              :      * long as it doesn't depend on the page header having correct contents.
     611              :      * Current usage is safe because PageGetContents() does not require that.
     612              :      */
     613       961702 :     if (PageIsNew(BufferGetPage(buf)))
     614              :     {
     615        14063 :         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     616        14063 :         if (PageIsNew(BufferGetPage(buf)))
     617        14063 :             PageInit(BufferGetPage(buf), BLCKSZ, 0);
     618        14063 :         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     619              :     }
     620       961702 :     return buf;
     621              : }
     622              : 
     623              : /*
     624              :  * Ensure that the FSM fork is at least fsm_nblocks long, extending
     625              :  * it if necessary with empty pages. And by empty, I mean pages filled
     626              :  * with zeros, meaning there's no free space.
     627              :  */
     628              : static Buffer
     629         4121 : fsm_extend(Relation rel, BlockNumber fsm_nblocks)
     630              : {
     631         4121 :     return ExtendBufferedRelTo(BMR_REL(rel), FSM_FORKNUM, NULL,
     632              :                                EB_CREATE_FORK_IF_NEEDED |
     633              :                                EB_CLEAR_SIZE_CACHE,
     634              :                                fsm_nblocks,
     635              :                                RBM_ZERO_ON_ERROR);
     636              : }
     637              : 
     638              : /*
     639              :  * Set value in given FSM page and slot.
     640              :  *
     641              :  * If minValue > 0, the updated page is also searched for a page with at
     642              :  * least minValue of free space. If one is found, its slot number is
     643              :  * returned, -1 otherwise.
     644              :  */
     645              : static int
     646       651094 : fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
     647              :                    uint8 newValue, uint8 minValue)
     648              : {
     649              :     Buffer      buf;
     650              :     Page        page;
     651       651094 :     int         newslot = -1;
     652              : 
     653       651094 :     buf = fsm_readbuf(rel, addr, true);
     654       651094 :     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     655              : 
     656       651094 :     page = BufferGetPage(buf);
     657              : 
     658       651094 :     if (fsm_set_avail(page, slot, newValue))
     659       109707 :         MarkBufferDirtyHint(buf, false);
     660              : 
     661       651094 :     if (minValue != 0)
     662              :     {
     663              :         /* Search while we still hold the lock */
     664       111892 :         newslot = fsm_search_avail(buf, minValue,
     665       111892 :                                    addr.level == FSM_BOTTOM_LEVEL,
     666              :                                    true);
     667              :     }
     668              : 
     669       651094 :     UnlockReleaseBuffer(buf);
     670              : 
     671       651094 :     return newslot;
     672              : }
     673              : 
     674              : /*
     675              :  * Search the tree for a heap page with at least min_cat of free space
     676              :  */
     677              : static BlockNumber
     678       186851 : fsm_search(Relation rel, uint8 min_cat)
     679              : {
     680       186851 :     int         restarts = 0;
     681       186851 :     FSMAddress  addr = FSM_ROOT_ADDRESS;
     682              : 
     683              :     for (;;)
     684        28896 :     {
     685              :         int         slot;
     686              :         Buffer      buf;
     687       215747 :         uint8       max_avail = 0;
     688              : 
     689              :         /* Read the FSM page. */
     690       215747 :         buf = fsm_readbuf(rel, addr, false);
     691              : 
     692              :         /* Search within the page */
     693       215747 :         if (BufferIsValid(buf))
     694              :         {
     695       157159 :             LockBuffer(buf, BUFFER_LOCK_SHARE);
     696       157159 :             slot = fsm_search_avail(buf, min_cat,
     697       157159 :                                     (addr.level == FSM_BOTTOM_LEVEL),
     698              :                                     false);
     699       157159 :             if (slot == -1)
     700              :             {
     701       118447 :                 max_avail = fsm_get_max_avail(BufferGetPage(buf));
     702       118447 :                 UnlockReleaseBuffer(buf);
     703              :             }
     704              :             else
     705              :             {
     706              :                 /* Keep the pin for possible update below */
     707        38712 :                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     708              :             }
     709              :         }
     710              :         else
     711        58588 :             slot = -1;
     712              : 
     713       215747 :         if (slot != -1)
     714              :         {
     715              :             /*
     716              :              * Descend the tree, or return the found block if we're at the
     717              :              * bottom.
     718              :              */
     719        38712 :             if (addr.level == FSM_BOTTOM_LEVEL)
     720              :             {
     721        11875 :                 BlockNumber blkno = fsm_get_heap_blk(addr, slot);
     722              :                 Page        page;
     723              : 
     724        11875 :                 if (fsm_does_block_exist(rel, blkno))
     725              :                 {
     726        11875 :                     ReleaseBuffer(buf);
     727        11875 :                     return blkno;
     728              :                 }
     729              : 
     730              :                 /*
     731              :                  * Block is past the end of the relation.  Update FSM, and
     732              :                  * restart from root.  The usual "advancenext" behavior is
     733              :                  * pessimal for this rare scenario, since every later slot is
     734              :                  * unusable in the same way.  We could zero all affected slots
     735              :                  * on the same FSM page, but don't bet on the benefits of that
     736              :                  * optimization justifying its compiled code bulk.
     737              :                  */
     738            0 :                 page = BufferGetPage(buf);
     739            0 :                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     740            0 :                 fsm_set_avail(page, slot, 0);
     741            0 :                 MarkBufferDirtyHint(buf, false);
     742            0 :                 UnlockReleaseBuffer(buf);
     743            0 :                 if (restarts++ > 10000) /* same rationale as below */
     744            0 :                     return InvalidBlockNumber;
     745            0 :                 addr = FSM_ROOT_ADDRESS;
     746              :             }
     747              :             else
     748              :             {
     749        26837 :                 ReleaseBuffer(buf);
     750              :             }
     751        26837 :             addr = fsm_get_child(addr, slot);
     752              :         }
     753       177035 :         else if (addr.level == FSM_ROOT_LEVEL)
     754              :         {
     755              :             /*
     756              :              * At the root, failure means there's no page with enough free
     757              :              * space in the FSM. Give up.
     758              :              */
     759       174976 :             return InvalidBlockNumber;
     760              :         }
     761              :         else
     762              :         {
     763              :             uint16      parentslot;
     764              :             FSMAddress  parent;
     765              : 
     766              :             /*
     767              :              * At lower level, failure can happen if the value in the upper-
     768              :              * level node didn't reflect the value on the lower page. Update
     769              :              * the upper node, to avoid falling into the same trap again, and
     770              :              * start over.
     771              :              *
     772              :              * There's a race condition here, if another backend updates this
     773              :              * page right after we release it, and gets the lock on the parent
     774              :              * page before us. We'll then update the parent page with the now
     775              :              * stale information we had. It's OK, because it should happen
     776              :              * rarely, and will be fixed by the next vacuum.
     777              :              */
     778         2059 :             parent = fsm_get_parent(addr, &parentslot);
     779         2059 :             fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
     780              : 
     781              :             /*
     782              :              * If the upper pages are badly out of date, we might need to loop
     783              :              * quite a few times, updating them as we go. Any inconsistencies
     784              :              * should eventually be corrected and the loop should end. Looping
     785              :              * indefinitely is nevertheless scary, so provide an emergency
     786              :              * valve.
     787              :              */
     788         2059 :             if (restarts++ > 10000)
     789            0 :                 return InvalidBlockNumber;
     790              : 
     791              :             /* Start search all over from the root */
     792         2059 :             addr = FSM_ROOT_ADDRESS;
     793              :         }
     794              :     }
     795              : }
     796              : 
     797              : 
     798              : /*
     799              :  * Recursive guts of FreeSpaceMapVacuum
     800              :  *
     801              :  * Examine the FSM page indicated by addr, as well as its children, updating
     802              :  * upper-level nodes that cover the heap block range from start to end-1.
     803              :  * (It's okay if end is beyond the actual end of the map.)
     804              :  * Return the maximum freespace value on this page.
     805              :  *
     806              :  * If addr is past the end of the FSM, set *eof_p to true and return 0.
     807              :  *
     808              :  * This traverses the tree in depth-first order.  The tree is stored
     809              :  * physically in depth-first order, so this should be pretty I/O efficient.
     810              :  */
     811              : static uint8
     812       153412 : fsm_vacuum_page(Relation rel, FSMAddress addr,
     813              :                 BlockNumber start, BlockNumber end,
     814              :                 bool *eof_p)
     815              : {
     816              :     Buffer      buf;
     817              :     Page        page;
     818              :     uint8       max_avail;
     819              : 
     820              :     /* Read the page if it exists, or return EOF */
     821       153412 :     buf = fsm_readbuf(rel, addr, false);
     822       153412 :     if (!BufferIsValid(buf))
     823              :     {
     824          653 :         *eof_p = true;
     825          653 :         return 0;
     826              :     }
     827              :     else
     828       152759 :         *eof_p = false;
     829              : 
     830       152759 :     page = BufferGetPage(buf);
     831              : 
     832              :     /*
     833              :      * If we're above the bottom level, recurse into children, and fix the
     834              :      * information stored about them at this level.
     835              :      */
     836       152759 :     if (addr.level > FSM_BOTTOM_LEVEL)
     837              :     {
     838              :         FSMAddress  fsm_start,
     839              :                     fsm_end;
     840              :         uint16      fsm_start_slot,
     841              :                     fsm_end_slot;
     842              :         int         slot,
     843              :                     start_slot,
     844              :                     end_slot;
     845       101898 :         bool        eof = false;
     846              : 
     847              :         /*
     848              :          * Compute the range of slots we need to update on this page, given
     849              :          * the requested range of heap blocks to consider.  The first slot to
     850              :          * update is the one covering the "start" block, and the last slot is
     851              :          * the one covering "end - 1".  (Some of this work will be duplicated
     852              :          * in each recursive call, but it's cheap enough to not worry about.)
     853              :          */
     854       101898 :         fsm_start = fsm_get_location(start, &fsm_start_slot);
     855       101898 :         fsm_end = fsm_get_location(end - 1, &fsm_end_slot);
     856              : 
     857       254745 :         while (fsm_start.level < addr.level)
     858              :         {
     859       152847 :             fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot);
     860       152847 :             fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot);
     861              :         }
     862              :         Assert(fsm_start.level == addr.level);
     863              : 
     864       101898 :         if (fsm_start.logpageno == addr.logpageno)
     865       101898 :             start_slot = fsm_start_slot;
     866            0 :         else if (fsm_start.logpageno > addr.logpageno)
     867            0 :             start_slot = SlotsPerFSMPage;   /* shouldn't get here... */
     868              :         else
     869            0 :             start_slot = 0;
     870              : 
     871       101898 :         if (fsm_end.logpageno == addr.logpageno)
     872       101581 :             end_slot = fsm_end_slot;
     873          317 :         else if (fsm_end.logpageno > addr.logpageno)
     874          317 :             end_slot = SlotsPerFSMPage - 1;
     875              :         else
     876            0 :             end_slot = -1;      /* shouldn't get here... */
     877              : 
     878      1575455 :         for (slot = start_slot; slot <= end_slot; slot++)
     879              :         {
     880              :             int         child_avail;
     881              : 
     882      1473557 :             CHECK_FOR_INTERRUPTS();
     883              : 
     884              :             /* After we hit end-of-file, just clear the rest of the slots */
     885      1473557 :             if (!eof)
     886       102444 :                 child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
     887              :                                               start, end,
     888              :                                               &eof);
     889              :             else
     890      1371113 :                 child_avail = 0;
     891              : 
     892              :             /* Update information about the child */
     893      1473557 :             if (fsm_get_avail(page, slot) != child_avail)
     894              :             {
     895         7936 :                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     896         7936 :                 fsm_set_avail(page, slot, child_avail);
     897         7936 :                 MarkBufferDirtyHint(buf, false);
     898         7936 :                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     899              :             }
     900              :         }
     901              :     }
     902              : 
     903              :     /* Now get the maximum value on the page, to return to caller */
     904       152759 :     max_avail = fsm_get_max_avail(page);
     905              : 
     906              :     /*
     907              :      * Reset the next slot pointer. This encourages the use of low-numbered
     908              :      * pages, increasing the chances that a later vacuum can truncate the
     909              :      * relation. We don't bother with marking the page dirty if it wasn't
     910              :      * already, since this is just a hint.
     911              :      */
     912       152759 :     LockBuffer(buf, BUFFER_LOCK_SHARE);
     913       152759 :     ((FSMPage) PageGetContents(page))->fp_next_slot = 0;
     914       152759 :     LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     915              : 
     916       152759 :     ReleaseBuffer(buf);
     917              : 
     918       152759 :     return max_avail;
     919              : }
     920              : 
     921              : 
     922              : /*
     923              :  * Check whether a block number is past the end of the relation.  This can
     924              :  * happen after WAL replay, if the FSM reached disk but newly-extended pages
     925              :  * it refers to did not.
     926              :  */
     927              : static bool
     928        26692 : fsm_does_block_exist(Relation rel, BlockNumber blknumber)
     929              : {
     930        26692 :     SMgrRelation smgr = RelationGetSmgr(rel);
     931              : 
     932              :     /*
     933              :      * If below the cached nblocks, the block surely exists.  Otherwise, we
     934              :      * face a trade-off.  We opt to compare to a fresh nblocks, incurring
     935              :      * lseek() overhead.  The alternative would be to assume the block does
     936              :      * not exist, but that would cause FSM to set zero space available for
     937              :      * blocks that main fork extension just recorded.
     938              :      */
     939        26692 :     return ((BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) &&
     940        37257 :              blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) ||
     941        10565 :             blknumber < RelationGetNumberOfBlocks(rel));
     942              : }
        

Generated by: LCOV version 2.0-1