LCOV - code coverage report
Current view: top level - src/backend/storage/freespace - freespace.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 208 228 91.2 %
Date: 2025-01-18 04:15:08 Functions: 22 22 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * freespace.c
       4             :  *    POSTGRES free space map for quickly finding free space in relations
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/storage/freespace/freespace.c
      12             :  *
      13             :  *
      14             :  * NOTES:
      15             :  *
      16             :  *  Free Space Map keeps track of the amount of free space on pages, and
      17             :  *  allows quickly searching for a page with enough free space. The FSM is
      18             :  *  stored in a dedicated relation fork of all heap relations, and those
      19             :  *  index access methods that need it (see also indexfsm.c). See README for
      20             :  *  more information.
      21             :  *
      22             :  *-------------------------------------------------------------------------
      23             :  */
      24             : #include "postgres.h"
      25             : 
      26             : #include "access/htup_details.h"
      27             : #include "access/xloginsert.h"
      28             : #include "access/xlogutils.h"
      29             : #include "miscadmin.h"
      30             : #include "storage/freespace.h"
      31             : #include "storage/fsm_internals.h"
      32             : #include "storage/smgr.h"
      33             : #include "utils/rel.h"
      34             : 
      35             : 
      36             : /*
      37             :  * We use just one byte to store the amount of free space on a page, so we
      38             :  * divide the amount of free space a page can have into 256 different
      39             :  * categories. The highest category, 255, represents a page with at least
      40             :  * MaxFSMRequestSize bytes of free space, and the second highest category
      41             :  * represents the range from 254 * FSM_CAT_STEP, inclusive, to
      42             :  * MaxFSMRequestSize, exclusive.
      43             :  *
      44             :  * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
      45             :  * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
      46             :  * categories look like this:
      47             :  *
      48             :  *
      49             :  * Range     Category
      50             :  * 0    - 31   0
      51             :  * 32   - 63   1
      52             :  * ...    ...  ...
      53             :  * 8096 - 8127 253
      54             :  * 8128 - 8163 254
      55             :  * 8164 - 8192 255
      56             :  *
      57             :  * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
      58             :  * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
      59             :  * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
      60             :  * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
      61             :  * completely empty page, that would mean that we could never satisfy a
      62             :  * request of exactly MaxFSMRequestSize bytes.
      63             :  */
      64             : #define FSM_CATEGORIES  256
      65             : #define FSM_CAT_STEP    (BLCKSZ / FSM_CATEGORIES)
      66             : #define MaxFSMRequestSize   MaxHeapTupleSize
      67             : 
      68             : /*
      69             :  * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
      70             :  * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
      71             :  * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
      72             :  * this means that 4096 bytes is the smallest BLCKSZ that we can get away
      73             :  * with a 3-level tree, and 512 is the smallest we support.
      74             :  */
      75             : #define FSM_TREE_DEPTH  ((SlotsPerFSMPage >= 1626) ? 3 : 4)
      76             : 
      77             : #define FSM_ROOT_LEVEL  (FSM_TREE_DEPTH - 1)
      78             : #define FSM_BOTTOM_LEVEL 0
      79             : 
      80             : /*
      81             :  * The internal FSM routines work on a logical addressing scheme. Each
      82             :  * level of the tree can be thought of as a separately addressable file.
      83             :  */
      84             : typedef struct
      85             : {
      86             :     int         level;          /* level */
      87             :     int         logpageno;      /* page number within the level */
      88             : } FSMAddress;
      89             : 
      90             : /* Address of the root page. */
      91             : static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
      92             : 
      93             : /* functions to navigate the tree */
      94             : static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
      95             : static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
      96             : static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
      97             : static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
      98             : static BlockNumber fsm_logical_to_physical(FSMAddress addr);
      99             : 
     100             : static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
     101             : static Buffer fsm_extend(Relation rel, BlockNumber fsm_nblocks);
     102             : 
     103             : /* functions to convert amount of free space to a FSM category */
     104             : static uint8 fsm_space_avail_to_cat(Size avail);
     105             : static uint8 fsm_space_needed_to_cat(Size needed);
     106             : static Size fsm_space_cat_to_avail(uint8 cat);
     107             : 
     108             : /* workhorse functions for various operations */
     109             : static int  fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
     110             :                                uint8 newValue, uint8 minValue);
     111             : static BlockNumber fsm_search(Relation rel, uint8 min_cat);
     112             : static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
     113             :                              BlockNumber start, BlockNumber end,
     114             :                              bool *eof_p);
     115             : static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber);
     116             : 
     117             : 
     118             : /******** Public API ********/
     119             : 
     120             : /*
     121             :  * GetPageWithFreeSpace - try to find a page in the given relation with
     122             :  *      at least the specified amount of free space.
     123             :  *
     124             :  * If successful, return the block number; if not, return InvalidBlockNumber.
     125             :  *
     126             :  * The caller must be prepared for the possibility that the returned page
     127             :  * will turn out to have too little space available by the time the caller
     128             :  * gets a lock on it.  In that case, the caller should report the actual
     129             :  * amount of free space available on that page and then try again (see
     130             :  * RecordAndGetPageWithFreeSpace).  If InvalidBlockNumber is returned,
     131             :  * extend the relation.
     132             :  *
     133             :  * This can trigger FSM updates if any FSM entry is found to point to a block
     134             :  * past the end of the relation.
     135             :  */
     136             : BlockNumber
     137      165250 : GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
     138             : {
     139      165250 :     uint8       min_cat = fsm_space_needed_to_cat(spaceNeeded);
     140             : 
     141      165250 :     return fsm_search(rel, min_cat);
     142             : }
     143             : 
     144             : /*
     145             :  * RecordAndGetPageWithFreeSpace - update info about a page and try again.
     146             :  *
     147             :  * We provide this combo form to save some locking overhead, compared to
     148             :  * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
     149             :  * also some effort to return a page close to the old page; if there's a
     150             :  * page with enough free space on the same FSM page where the old one page
     151             :  * is located, it is preferred.
     152             :  */
     153             : BlockNumber
     154      193460 : RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
     155             :                               Size oldSpaceAvail, Size spaceNeeded)
     156             : {
     157      193460 :     int         old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
     158      193460 :     int         search_cat = fsm_space_needed_to_cat(spaceNeeded);
     159             :     FSMAddress  addr;
     160             :     uint16      slot;
     161             :     int         search_slot;
     162             : 
     163             :     /* Get the location of the FSM byte representing the heap block */
     164      193460 :     addr = fsm_get_location(oldPage, &slot);
     165             : 
     166      193460 :     search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
     167             : 
     168             :     /*
     169             :      * If fsm_set_and_search found a suitable new block, return that.
     170             :      * Otherwise, search as usual.
     171             :      */
     172      193460 :     if (search_slot != -1)
     173             :     {
     174       27332 :         BlockNumber blknum = fsm_get_heap_blk(addr, search_slot);
     175             : 
     176             :         /*
     177             :          * Check that the blknum is actually in the relation. Don't try to
     178             :          * update the FSM in that case, just fall back to the other case
     179             :          */
     180       27332 :         if (fsm_does_block_exist(rel, blknum))
     181       27332 :             return blknum;
     182             :     }
     183      166128 :     return fsm_search(rel, search_cat);
     184             : }
     185             : 
     186             : /*
     187             :  * RecordPageWithFreeSpace - update info about a page.
     188             :  *
     189             :  * Note that if the new spaceAvail value is higher than the old value stored
     190             :  * in the FSM, the space might not become visible to searchers until the next
     191             :  * FreeSpaceMapVacuum call, which updates the upper level pages.
     192             :  */
     193             : void
     194      772072 : RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
     195             : {
     196      772072 :     int         new_cat = fsm_space_avail_to_cat(spaceAvail);
     197             :     FSMAddress  addr;
     198             :     uint16      slot;
     199             : 
     200             :     /* Get the location of the FSM byte representing the heap block */
     201      772072 :     addr = fsm_get_location(heapBlk, &slot);
     202             : 
     203      772072 :     fsm_set_and_search(rel, addr, slot, new_cat, 0);
     204      772072 : }
     205             : 
     206             : /*
     207             :  * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
     208             :  *      WAL replay
     209             :  */
     210             : void
     211      571192 : XLogRecordPageWithFreeSpace(RelFileLocator rlocator, BlockNumber heapBlk,
     212             :                             Size spaceAvail)
     213             : {
     214      571192 :     int         new_cat = fsm_space_avail_to_cat(spaceAvail);
     215             :     FSMAddress  addr;
     216             :     uint16      slot;
     217             :     BlockNumber blkno;
     218             :     Buffer      buf;
     219             :     Page        page;
     220             : 
     221             :     /* Get the location of the FSM byte representing the heap block */
     222      571192 :     addr = fsm_get_location(heapBlk, &slot);
     223      571192 :     blkno = fsm_logical_to_physical(addr);
     224             : 
     225             :     /* If the page doesn't exist already, extend */
     226      571192 :     buf = XLogReadBufferExtended(rlocator, FSM_FORKNUM, blkno,
     227             :                                  RBM_ZERO_ON_ERROR, InvalidBuffer);
     228      571192 :     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     229             : 
     230      571192 :     page = BufferGetPage(buf);
     231      571192 :     if (PageIsNew(page))
     232         928 :         PageInit(page, BLCKSZ, 0);
     233             : 
     234      571192 :     if (fsm_set_avail(page, slot, new_cat))
     235      561556 :         MarkBufferDirtyHint(buf, false);
     236      571192 :     UnlockReleaseBuffer(buf);
     237      571192 : }
     238             : 
     239             : /*
     240             :  * GetRecordedFreeSpace - return the amount of free space on a particular page,
     241             :  *      according to the FSM.
     242             :  */
     243             : Size
     244        1996 : GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
     245             : {
     246             :     FSMAddress  addr;
     247             :     uint16      slot;
     248             :     Buffer      buf;
     249             :     uint8       cat;
     250             : 
     251             :     /* Get the location of the FSM byte representing the heap block */
     252        1996 :     addr = fsm_get_location(heapBlk, &slot);
     253             : 
     254        1996 :     buf = fsm_readbuf(rel, addr, false);
     255        1996 :     if (!BufferIsValid(buf))
     256          72 :         return 0;
     257        1924 :     cat = fsm_get_avail(BufferGetPage(buf), slot);
     258        1924 :     ReleaseBuffer(buf);
     259             : 
     260        1924 :     return fsm_space_cat_to_avail(cat);
     261             : }
     262             : 
     263             : /*
     264             :  * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation.
     265             :  *
     266             :  * nblocks is the new size of the heap.
     267             :  *
     268             :  * Return the number of blocks of new FSM.
     269             :  * If it's InvalidBlockNumber, there is nothing to truncate;
     270             :  * otherwise the caller is responsible for calling smgrtruncate()
     271             :  * to truncate the FSM pages, and FreeSpaceMapVacuumRange()
     272             :  * to update upper-level pages in the FSM.
     273             :  */
     274             : BlockNumber
     275         366 : FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
     276             : {
     277             :     BlockNumber new_nfsmblocks;
     278             :     FSMAddress  first_removed_address;
     279             :     uint16      first_removed_slot;
     280             :     Buffer      buf;
     281             : 
     282             :     /*
     283             :      * If no FSM has been created yet for this relation, there's nothing to
     284             :      * truncate.
     285             :      */
     286         366 :     if (!smgrexists(RelationGetSmgr(rel), FSM_FORKNUM))
     287           0 :         return InvalidBlockNumber;
     288             : 
     289             :     /* Get the location in the FSM of the first removed heap block */
     290         366 :     first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
     291             : 
     292             :     /*
     293             :      * Zero out the tail of the last remaining FSM page. If the slot
     294             :      * representing the first removed heap block is at a page boundary, as the
     295             :      * first slot on the FSM page that first_removed_address points to, we can
     296             :      * just truncate that page altogether.
     297             :      */
     298         366 :     if (first_removed_slot > 0)
     299             :     {
     300         210 :         buf = fsm_readbuf(rel, first_removed_address, false);
     301         210 :         if (!BufferIsValid(buf))
     302           0 :             return InvalidBlockNumber;  /* nothing to do; the FSM was already
     303             :                                          * smaller */
     304         210 :         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     305             : 
     306             :         /* NO EREPORT(ERROR) from here till changes are logged */
     307         210 :         START_CRIT_SECTION();
     308             : 
     309         210 :         fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
     310             : 
     311             :         /*
     312             :          * This change is non-critical, because fsm_does_block_exist() would
     313             :          * stop us from returning a truncated-away block.  However, since this
     314             :          * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost
     315             :          * of that many fsm_does_block_exist() rejections.  Use a full
     316             :          * MarkBufferDirty(), not MarkBufferDirtyHint().
     317             :          */
     318         210 :         MarkBufferDirty(buf);
     319             : 
     320             :         /*
     321             :          * WAL-log like MarkBufferDirtyHint() might have done, just to avoid
     322             :          * differing from the rest of the file in this respect.  This is
     323             :          * optional; see README mention of full page images.  XXX consider
     324             :          * XLogSaveBufferForHint() for even closer similarity.
     325             :          *
     326             :          * A higher-level operation calls us at WAL replay.  If we crash
     327             :          * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has
     328             :          * not changed, and our fork remains valid.  If we crash after that
     329             :          * flush, redo will return here.
     330             :          */
     331         210 :         if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
     332         176 :             log_newpage_buffer(buf, false);
     333             : 
     334         210 :         END_CRIT_SECTION();
     335             : 
     336         210 :         UnlockReleaseBuffer(buf);
     337             : 
     338         210 :         new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
     339             :     }
     340             :     else
     341             :     {
     342         156 :         new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
     343         156 :         if (smgrnblocks(RelationGetSmgr(rel), FSM_FORKNUM) <= new_nfsmblocks)
     344           0 :             return InvalidBlockNumber;  /* nothing to do; the FSM was already
     345             :                                          * smaller */
     346             :     }
     347             : 
     348         366 :     return new_nfsmblocks;
     349             : }
     350             : 
     351             : /*
     352             :  * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM
     353             :  *
     354             :  * We assume that the bottom-level pages have already been updated with
     355             :  * new free-space information.
     356             :  */
     357             : void
     358         270 : FreeSpaceMapVacuum(Relation rel)
     359             : {
     360             :     bool        dummy;
     361             : 
     362             :     /* Recursively scan the tree, starting at the root */
     363         270 :     (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS,
     364             :                            (BlockNumber) 0, InvalidBlockNumber,
     365             :                            &dummy);
     366         270 : }
     367             : 
     368             : /*
     369             :  * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM
     370             :  *
     371             :  * As above, but assume that only heap pages between start and end-1 inclusive
     372             :  * have new free-space information, so update only the upper-level slots
     373             :  * covering that block range.  end == InvalidBlockNumber is equivalent to
     374             :  * "all the rest of the relation".
     375             :  */
     376             : void
     377       48094 : FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
     378             : {
     379             :     bool        dummy;
     380             : 
     381             :     /* Recursively scan the tree, starting at the root */
     382       48094 :     if (end > start)
     383       47752 :         (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
     384       48094 : }
     385             : 
     386             : /******** Internal routines ********/
     387             : 
     388             : /*
     389             :  * Return category corresponding x bytes of free space
     390             :  */
     391             : static uint8
     392     1536724 : fsm_space_avail_to_cat(Size avail)
     393             : {
     394             :     int         cat;
     395             : 
     396             :     Assert(avail < BLCKSZ);
     397             : 
     398     1536724 :     if (avail >= MaxFSMRequestSize)
     399       39934 :         return 255;
     400             : 
     401     1496790 :     cat = avail / FSM_CAT_STEP;
     402             : 
     403             :     /*
     404             :      * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
     405             :      * more.
     406             :      */
     407     1496790 :     if (cat > 254)
     408           0 :         cat = 254;
     409             : 
     410     1496790 :     return (uint8) cat;
     411             : }
     412             : 
     413             : /*
     414             :  * Return the lower bound of the range of free space represented by given
     415             :  * category.
     416             :  */
     417             : static Size
     418        1924 : fsm_space_cat_to_avail(uint8 cat)
     419             : {
     420             :     /* The highest category represents exactly MaxFSMRequestSize bytes. */
     421        1924 :     if (cat == 255)
     422         956 :         return MaxFSMRequestSize;
     423             :     else
     424         968 :         return cat * FSM_CAT_STEP;
     425             : }
     426             : 
     427             : /*
     428             :  * Which category does a page need to have, to accommodate x bytes of data?
     429             :  * While fsm_space_avail_to_cat() rounds down, this needs to round up.
     430             :  */
     431             : static uint8
     432      358710 : fsm_space_needed_to_cat(Size needed)
     433             : {
     434             :     int         cat;
     435             : 
     436             :     /* Can't ask for more space than the highest category represents */
     437      358710 :     if (needed > MaxFSMRequestSize)
     438           0 :         elog(ERROR, "invalid FSM request size %zu", needed);
     439             : 
     440      358710 :     if (needed == 0)
     441           0 :         return 1;
     442             : 
     443      358710 :     cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
     444             : 
     445      358710 :     if (cat > 255)
     446           0 :         cat = 255;
     447             : 
     448      358710 :     return (uint8) cat;
     449             : }
     450             : 
     451             : /*
     452             :  * Returns the physical block number of a FSM page
     453             :  */
     454             : static BlockNumber
     455     2067756 : fsm_logical_to_physical(FSMAddress addr)
     456             : {
     457             :     BlockNumber pages;
     458             :     int         leafno;
     459             :     int         l;
     460             : 
     461             :     /*
     462             :      * Calculate the logical page number of the first leaf page below the
     463             :      * given page.
     464             :      */
     465     2067756 :     leafno = addr.logpageno;
     466     2910888 :     for (l = 0; l < addr.level; l++)
     467      843132 :         leafno *= SlotsPerFSMPage;
     468             : 
     469             :     /* Count upper level nodes required to address the leaf page */
     470     2067756 :     pages = 0;
     471     8271024 :     for (l = 0; l < FSM_TREE_DEPTH; l++)
     472             :     {
     473     6203268 :         pages += leafno + 1;
     474     6203268 :         leafno /= SlotsPerFSMPage;
     475             :     }
     476             : 
     477             :     /*
     478             :      * If the page we were asked for wasn't at the bottom level, subtract the
     479             :      * additional lower level pages we counted above.
     480             :      */
     481     2067756 :     pages -= addr.level;
     482             : 
     483             :     /* Turn the page count into 0-based block number */
     484     2067756 :     return pages - 1;
     485             : }
     486             : 
     487             : /*
     488             :  * Return the FSM location corresponding to given heap block.
     489             :  */
     490             : static FSMAddress
     491     1731022 : fsm_get_location(BlockNumber heapblk, uint16 *slot)
     492             : {
     493             :     FSMAddress  addr;
     494             : 
     495     1731022 :     addr.level = FSM_BOTTOM_LEVEL;
     496     1731022 :     addr.logpageno = heapblk / SlotsPerFSMPage;
     497     1731022 :     *slot = heapblk % SlotsPerFSMPage;
     498             : 
     499     1731022 :     return addr;
     500             : }
     501             : 
     502             : /*
     503             :  * Return the heap block number corresponding to given location in the FSM.
     504             :  */
     505             : static BlockNumber
     506       47170 : fsm_get_heap_blk(FSMAddress addr, uint16 slot)
     507             : {
     508             :     Assert(addr.level == FSM_BOTTOM_LEVEL);
     509       47170 :     return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
     510             : }
     511             : 
     512             : /*
     513             :  * Given a logical address of a child page, get the logical page number of
     514             :  * the parent, and the slot within the parent corresponding to the child.
     515             :  */
     516             : static FSMAddress
     517      291440 : fsm_get_parent(FSMAddress child, uint16 *slot)
     518             : {
     519             :     FSMAddress  parent;
     520             : 
     521             :     Assert(child.level < FSM_ROOT_LEVEL);
     522             : 
     523      291440 :     parent.level = child.level + 1;
     524      291440 :     parent.logpageno = child.logpageno / SlotsPerFSMPage;
     525      291440 :     *slot = child.logpageno % SlotsPerFSMPage;
     526             : 
     527      291440 :     return parent;
     528             : }
     529             : 
     530             : /*
     531             :  * Given a logical address of a parent page and a slot number, get the
     532             :  * logical address of the corresponding child page.
     533             :  */
     534             : static FSMAddress
     535      141988 : fsm_get_child(FSMAddress parent, uint16 slot)
     536             : {
     537             :     FSMAddress  child;
     538             : 
     539             :     Assert(parent.level > FSM_BOTTOM_LEVEL);
     540             : 
     541      141988 :     child.level = parent.level - 1;
     542      141988 :     child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
     543             : 
     544      141988 :     return child;
     545             : }
     546             : 
     547             : /*
     548             :  * Read a FSM page.
     549             :  *
     550             :  * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
     551             :  * true, the FSM file is extended.
     552             :  */
     553             : static Buffer
     554     1496198 : fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
     555             : {
     556     1496198 :     BlockNumber blkno = fsm_logical_to_physical(addr);
     557             :     Buffer      buf;
     558     1496198 :     SMgrRelation reln = RelationGetSmgr(rel);
     559             : 
     560             :     /*
     561             :      * If we haven't cached the size of the FSM yet, check it first.  Also
     562             :      * recheck if the requested block seems to be past end, since our cached
     563             :      * value might be stale.  (We send smgr inval messages on truncation, but
     564             :      * not on extension.)
     565             :      */
     566     1496198 :     if (reln->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber ||
     567     1342548 :         blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
     568             :     {
     569             :         /* Invalidate the cache so smgrnblocks asks the kernel. */
     570      216816 :         reln->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
     571      216816 :         if (smgrexists(reln, FSM_FORKNUM))
     572       98452 :             smgrnblocks(reln, FSM_FORKNUM);
     573             :         else
     574      118364 :             reln->smgr_cached_nblocks[FSM_FORKNUM] = 0;
     575             :     }
     576             : 
     577             :     /*
     578             :      * For reading we use ZERO_ON_ERROR mode, and initialize the page if
     579             :      * necessary.  The FSM information is not accurate anyway, so it's better
     580             :      * to clear corrupt pages than error out. Since the FSM changes are not
     581             :      * WAL-logged, the so-called torn page problem on crash can lead to pages
     582             :      * with corrupt headers, for example.
     583             :      *
     584             :      * We use the same path below to initialize pages when extending the
     585             :      * relation, as a concurrent extension can end up with vm_extend()
     586             :      * returning an already-initialized page.
     587             :      */
     588     1496198 :     if (blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
     589             :     {
     590      119602 :         if (extend)
     591        7366 :             buf = fsm_extend(rel, blkno + 1);
     592             :         else
     593      112236 :             return InvalidBuffer;
     594             :     }
     595             :     else
     596     1376596 :         buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
     597             : 
     598             :     /*
     599             :      * Initializing the page when needed is trickier than it looks, because of
     600             :      * the possibility of multiple backends doing this concurrently, and our
     601             :      * desire to not uselessly take the buffer lock in the normal path where
     602             :      * the page is OK.  We must take the lock to initialize the page, so
     603             :      * recheck page newness after we have the lock, in case someone else
     604             :      * already did it.  Also, because we initially check PageIsNew with no
     605             :      * lock, it's possible to fall through and return the buffer while someone
     606             :      * else is still initializing the page (i.e., we might see pd_upper as set
     607             :      * but other page header fields are still zeroes).  This is harmless for
     608             :      * callers that will take a buffer lock themselves, but some callers
     609             :      * inspect the page without any lock at all.  The latter is OK only so
     610             :      * long as it doesn't depend on the page header having correct contents.
     611             :      * Current usage is safe because PageGetContents() does not require that.
     612             :      */
     613     1383962 :     if (PageIsNew(BufferGetPage(buf)))
     614             :     {
     615       24986 :         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     616       24986 :         if (PageIsNew(BufferGetPage(buf)))
     617       24986 :             PageInit(BufferGetPage(buf), BLCKSZ, 0);
     618       24986 :         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     619             :     }
     620     1383962 :     return buf;
     621             : }
     622             : 
     623             : /*
     624             :  * Ensure that the FSM fork is at least fsm_nblocks long, extending
     625             :  * it if necessary with empty pages. And by empty, I mean pages filled
     626             :  * with zeros, meaning there's no free space.
     627             :  */
     628             : static Buffer
     629        7366 : fsm_extend(Relation rel, BlockNumber fsm_nblocks)
     630             : {
     631        7366 :     return ExtendBufferedRelTo(BMR_REL(rel), FSM_FORKNUM, NULL,
     632             :                                EB_CREATE_FORK_IF_NEEDED |
     633             :                                EB_CLEAR_SIZE_CACHE,
     634             :                                fsm_nblocks,
     635             :                                RBM_ZERO_ON_ERROR);
     636             : }
     637             : 
     638             : /*
     639             :  * Set value in given FSM page and slot.
     640             :  *
     641             :  * If minValue > 0, the updated page is also searched for a page with at
     642             :  * least minValue of free space. If one is found, its slot number is
     643             :  * returned, -1 otherwise.
     644             :  */
     645             : static int
     646      969068 : fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
     647             :                    uint8 newValue, uint8 minValue)
     648             : {
     649             :     Buffer      buf;
     650             :     Page        page;
     651      969068 :     int         newslot = -1;
     652             : 
     653      969068 :     buf = fsm_readbuf(rel, addr, true);
     654      969068 :     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     655             : 
     656      969068 :     page = BufferGetPage(buf);
     657             : 
     658      969068 :     if (fsm_set_avail(page, slot, newValue))
     659      194152 :         MarkBufferDirtyHint(buf, false);
     660             : 
     661      969068 :     if (minValue != 0)
     662             :     {
     663             :         /* Search while we still hold the lock */
     664      193460 :         newslot = fsm_search_avail(buf, minValue,
     665      193460 :                                    addr.level == FSM_BOTTOM_LEVEL,
     666             :                                    true);
     667             :     }
     668             : 
     669      969068 :     UnlockReleaseBuffer(buf);
     670             : 
     671      969068 :     return newslot;
     672             : }
     673             : 
     674             : /*
     675             :  * Search the tree for a heap page with at least min_cat of free space
     676             :  */
     677             : static BlockNumber
     678      331378 : fsm_search(Relation rel, uint8 min_cat)
     679             : {
     680      331378 :     int         restarts = 0;
     681      331378 :     FSMAddress  addr = FSM_ROOT_ADDRESS;
     682             : 
     683             :     for (;;)
     684       48516 :     {
     685             :         int         slot;
     686             :         Buffer      buf;
     687      379894 :         uint8       max_avail = 0;
     688             : 
     689             :         /* Read the FSM page. */
     690      379894 :         buf = fsm_readbuf(rel, addr, false);
     691             : 
     692             :         /* Search within the page */
     693      379894 :         if (BufferIsValid(buf))
     694             :         {
     695      268964 :             LockBuffer(buf, BUFFER_LOCK_SHARE);
     696      268964 :             slot = fsm_search_avail(buf, min_cat,
     697      268964 :                                     (addr.level == FSM_BOTTOM_LEVEL),
     698             :                                     false);
     699      268964 :             if (slot == -1)
     700             :             {
     701      204146 :                 max_avail = fsm_get_max_avail(BufferGetPage(buf));
     702      204146 :                 UnlockReleaseBuffer(buf);
     703             :             }
     704             :             else
     705             :             {
     706             :                 /* Keep the pin for possible update below */
     707       64818 :                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     708             :             }
     709             :         }
     710             :         else
     711      110930 :             slot = -1;
     712             : 
     713      379894 :         if (slot != -1)
     714             :         {
     715             :             /*
     716             :              * Descend the tree, or return the found block if we're at the
     717             :              * bottom.
     718             :              */
     719       64818 :             if (addr.level == FSM_BOTTOM_LEVEL)
     720             :             {
     721       19838 :                 BlockNumber blkno = fsm_get_heap_blk(addr, slot);
     722             :                 Page        page;
     723             : 
     724       19838 :                 if (fsm_does_block_exist(rel, blkno))
     725             :                 {
     726       19838 :                     ReleaseBuffer(buf);
     727       19838 :                     return blkno;
     728             :                 }
     729             : 
     730             :                 /*
     731             :                  * Block is past the end of the relation.  Update FSM, and
     732             :                  * restart from root.  The usual "advancenext" behavior is
     733             :                  * pessimal for this rare scenario, since every later slot is
     734             :                  * unusable in the same way.  We could zero all affected slots
     735             :                  * on the same FSM page, but don't bet on the benefits of that
     736             :                  * optimization justifying its compiled code bulk.
     737             :                  */
     738           0 :                 page = BufferGetPage(buf);
     739           0 :                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     740           0 :                 fsm_set_avail(page, slot, 0);
     741           0 :                 MarkBufferDirtyHint(buf, false);
     742           0 :                 UnlockReleaseBuffer(buf);
     743           0 :                 if (restarts++ > 10000) /* same rationale as below */
     744           0 :                     return InvalidBlockNumber;
     745           0 :                 addr = FSM_ROOT_ADDRESS;
     746             :             }
     747             :             else
     748             :             {
     749       44980 :                 ReleaseBuffer(buf);
     750             :             }
     751       44980 :             addr = fsm_get_child(addr, slot);
     752             :         }
     753      315076 :         else if (addr.level == FSM_ROOT_LEVEL)
     754             :         {
     755             :             /*
     756             :              * At the root, failure means there's no page with enough free
     757             :              * space in the FSM. Give up.
     758             :              */
     759      311540 :             return InvalidBlockNumber;
     760             :         }
     761             :         else
     762             :         {
     763             :             uint16      parentslot;
     764             :             FSMAddress  parent;
     765             : 
     766             :             /*
     767             :              * At lower level, failure can happen if the value in the upper-
     768             :              * level node didn't reflect the value on the lower page. Update
     769             :              * the upper node, to avoid falling into the same trap again, and
     770             :              * start over.
     771             :              *
     772             :              * There's a race condition here, if another backend updates this
     773             :              * page right after we release it, and gets the lock on the parent
     774             :              * page before us. We'll then update the parent page with the now
     775             :              * stale information we had. It's OK, because it should happen
     776             :              * rarely, and will be fixed by the next vacuum.
     777             :              */
     778        3536 :             parent = fsm_get_parent(addr, &parentslot);
     779        3536 :             fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
     780             : 
     781             :             /*
     782             :              * If the upper pages are badly out of date, we might need to loop
     783             :              * quite a few times, updating them as we go. Any inconsistencies
     784             :              * should eventually be corrected and the loop should end. Looping
     785             :              * indefinitely is nevertheless scary, so provide an emergency
     786             :              * valve.
     787             :              */
     788        3536 :             if (restarts++ > 10000)
     789           0 :                 return InvalidBlockNumber;
     790             : 
     791             :             /* Start search all over from the root */
     792        3536 :             addr = FSM_ROOT_ADDRESS;
     793             :         }
     794             :     }
     795             : }
     796             : 
     797             : 
     798             : /*
     799             :  * Recursive guts of FreeSpaceMapVacuum
     800             :  *
     801             :  * Examine the FSM page indicated by addr, as well as its children, updating
     802             :  * upper-level nodes that cover the heap block range from start to end-1.
     803             :  * (It's okay if end is beyond the actual end of the map.)
     804             :  * Return the maximum freespace value on this page.
     805             :  *
     806             :  * If addr is past the end of the FSM, set *eof_p to true and return 0.
     807             :  *
     808             :  * This traverses the tree in depth-first order.  The tree is stored
     809             :  * physically in depth-first order, so this should be pretty I/O efficient.
     810             :  */
     811             : static uint8
     812      145030 : fsm_vacuum_page(Relation rel, FSMAddress addr,
     813             :                 BlockNumber start, BlockNumber end,
     814             :                 bool *eof_p)
     815             : {
     816             :     Buffer      buf;
     817             :     Page        page;
     818             :     uint8       max_avail;
     819             : 
     820             :     /* Read the page if it exists, or return EOF */
     821      145030 :     buf = fsm_readbuf(rel, addr, false);
     822      145030 :     if (!BufferIsValid(buf))
     823             :     {
     824        1234 :         *eof_p = true;
     825        1234 :         return 0;
     826             :     }
     827             :     else
     828      143796 :         *eof_p = false;
     829             : 
     830      143796 :     page = BufferGetPage(buf);
     831             : 
     832             :     /*
     833             :      * If we're above the bottom level, recurse into children, and fix the
     834             :      * information stored about them at this level.
     835             :      */
     836      143796 :     if (addr.level > FSM_BOTTOM_LEVEL)
     837             :     {
     838             :         FSMAddress  fsm_start,
     839             :                     fsm_end;
     840             :         uint16      fsm_start_slot,
     841             :                     fsm_end_slot;
     842             :         int         slot,
     843             :                     start_slot,
     844             :                     end_slot;
     845       95968 :         bool        eof = false;
     846             : 
     847             :         /*
     848             :          * Compute the range of slots we need to update on this page, given
     849             :          * the requested range of heap blocks to consider.  The first slot to
     850             :          * update is the one covering the "start" block, and the last slot is
     851             :          * the one covering "end - 1".  (Some of this work will be duplicated
     852             :          * in each recursive call, but it's cheap enough to not worry about.)
     853             :          */
     854       95968 :         fsm_start = fsm_get_location(start, &fsm_start_slot);
     855       95968 :         fsm_end = fsm_get_location(end - 1, &fsm_end_slot);
     856             : 
     857      239920 :         while (fsm_start.level < addr.level)
     858             :         {
     859      143952 :             fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot);
     860      143952 :             fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot);
     861             :         }
     862             :         Assert(fsm_start.level == addr.level);
     863             : 
     864       95968 :         if (fsm_start.logpageno == addr.logpageno)
     865       95968 :             start_slot = fsm_start_slot;
     866           0 :         else if (fsm_start.logpageno > addr.logpageno)
     867           0 :             start_slot = SlotsPerFSMPage;   /* shouldn't get here... */
     868             :         else
     869           0 :             start_slot = 0;
     870             : 
     871       95968 :         if (fsm_end.logpageno == addr.logpageno)
     872       95370 :             end_slot = fsm_end_slot;
     873         598 :         else if (fsm_end.logpageno > addr.logpageno)
     874         598 :             end_slot = SlotsPerFSMPage - 1;
     875             :         else
     876           0 :             end_slot = -1;      /* shouldn't get here... */
     877             : 
     878     2779482 :         for (slot = start_slot; slot <= end_slot; slot++)
     879             :         {
     880             :             int         child_avail;
     881             : 
     882     2683514 :             CHECK_FOR_INTERRUPTS();
     883             : 
     884             :             /* After we hit end-of-file, just clear the rest of the slots */
     885     2683514 :             if (!eof)
     886       97008 :                 child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
     887             :                                               start, end,
     888             :                                               &eof);
     889             :             else
     890     2586506 :                 child_avail = 0;
     891             : 
     892             :             /* Update information about the child */
     893     2683514 :             if (fsm_get_avail(page, slot) != child_avail)
     894             :             {
     895       14320 :                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     896       14320 :                 fsm_set_avail(page, slot, child_avail);
     897       14320 :                 MarkBufferDirtyHint(buf, false);
     898       14320 :                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     899             :             }
     900             :         }
     901             :     }
     902             : 
     903             :     /* Now get the maximum value on the page, to return to caller */
     904      143796 :     max_avail = fsm_get_max_avail(page);
     905             : 
     906             :     /*
     907             :      * Reset the next slot pointer. This encourages the use of low-numbered
     908             :      * pages, increasing the chances that a later vacuum can truncate the
     909             :      * relation.  We don't bother with a lock here, nor with marking the page
     910             :      * dirty if it wasn't already, since this is just a hint.
     911             :      */
     912      143796 :     ((FSMPage) PageGetContents(page))->fp_next_slot = 0;
     913             : 
     914      143796 :     ReleaseBuffer(buf);
     915             : 
     916      143796 :     return max_avail;
     917             : }
     918             : 
     919             : 
     920             : /*
     921             :  * Check whether a block number is past the end of the relation.  This can
     922             :  * happen after WAL replay, if the FSM reached disk but newly-extended pages
     923             :  * it refers to did not.
     924             :  */
     925             : static bool
     926       47170 : fsm_does_block_exist(Relation rel, BlockNumber blknumber)
     927             : {
     928       47170 :     SMgrRelation smgr = RelationGetSmgr(rel);
     929             : 
     930             :     /*
     931             :      * If below the cached nblocks, the block surely exists.  Otherwise, we
     932             :      * face a trade-off.  We opt to compare to a fresh nblocks, incurring
     933             :      * lseek() overhead.  The alternative would be to assume the block does
     934             :      * not exist, but that would cause FSM to set zero space available for
     935             :      * blocks that main fork extension just recorded.
     936             :      */
     937       47170 :     return ((BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) &&
     938       64972 :              blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) ||
     939       17802 :             blknumber < RelationGetNumberOfBlocks(rel));
     940             : }

Generated by: LCOV version 1.14