LCOV - code coverage report
Current view: top level - src/backend/storage/freespace - freespace.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 91.3 % 231 211
Test Date: 2026-05-07 03:16:33 Functions: 100.0 % 22 22
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * freespace.c
       4              :  *    POSTGRES free space map for quickly finding free space in relations
       5              :  *
       6              :  *
       7              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       8              :  * Portions Copyright (c) 1994, Regents of the University of California
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/backend/storage/freespace/freespace.c
      12              :  *
      13              :  *
      14              :  * NOTES:
      15              :  *
      16              :  *  Free Space Map keeps track of the amount of free space on pages, and
      17              :  *  allows quickly searching for a page with enough free space. The FSM is
      18              :  *  stored in a dedicated relation fork of all heap relations, and those
      19              :  *  index access methods that need it (see also indexfsm.c). See README for
      20              :  *  more information.
      21              :  *
      22              :  *-------------------------------------------------------------------------
      23              :  */
      24              : #include "postgres.h"
      25              : 
      26              : #include "access/htup_details.h"
      27              : #include "access/xloginsert.h"
      28              : #include "access/xlogutils.h"
      29              : #include "miscadmin.h"
      30              : #include "storage/freespace.h"
      31              : #include "storage/fsm_internals.h"
      32              : #include "storage/smgr.h"
      33              : #include "utils/rel.h"
      34              : 
      35              : 
      36              : /*
      37              :  * We use just one byte to store the amount of free space on a page, so we
      38              :  * divide the amount of free space a page can have into 256 different
      39              :  * categories. The highest category, 255, represents a page with at least
      40              :  * MaxFSMRequestSize bytes of free space, and the second highest category
      41              :  * represents the range from 254 * FSM_CAT_STEP, inclusive, to
      42              :  * MaxFSMRequestSize, exclusive.
      43              :  *
      44              :  * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
      45              :  * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
      46              :  * categories look like this:
      47              :  *
      48              :  *
      49              :  * Range     Category
      50              :  * 0    - 31   0
      51              :  * 32   - 63   1
      52              :  * ...    ...  ...
      53              :  * 8096 - 8127 253
      54              :  * 8128 - 8163 254
      55              :  * 8164 - 8192 255
      56              :  *
      57              :  * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
      58              :  * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
      59              :  * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
      60              :  * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
      61              :  * completely empty page, that would mean that we could never satisfy a
      62              :  * request of exactly MaxFSMRequestSize bytes.
      63              :  */
      64              : #define FSM_CATEGORIES  256
      65              : #define FSM_CAT_STEP    (BLCKSZ / FSM_CATEGORIES)
      66              : #define MaxFSMRequestSize   MaxHeapTupleSize
      67              : 
      68              : /*
      69              :  * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
      70              :  * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
      71              :  * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
      72              :  * this means that 4096 bytes is the smallest BLCKSZ that we can get away
      73              :  * with a 3-level tree, and 512 is the smallest we support.
      74              :  */
      75              : #define FSM_TREE_DEPTH  ((SlotsPerFSMPage >= 1626) ? 3 : 4)
      76              : 
      77              : #define FSM_ROOT_LEVEL  (FSM_TREE_DEPTH - 1)
      78              : #define FSM_BOTTOM_LEVEL 0
      79              : 
      80              : /*
      81              :  * The internal FSM routines work on a logical addressing scheme. Each
      82              :  * level of the tree can be thought of as a separately addressable file.
      83              :  */
      84              : typedef struct
      85              : {
      86              :     int         level;          /* level */
      87              :     int         logpageno;      /* page number within the level */
      88              : } FSMAddress;
      89              : 
      90              : /* Address of the root page. */
      91              : static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
      92              : 
      93              : /* functions to navigate the tree */
      94              : static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
      95              : static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
      96              : static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
      97              : static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
      98              : static BlockNumber fsm_logical_to_physical(FSMAddress addr);
      99              : 
     100              : static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
     101              : static Buffer fsm_extend(Relation rel, BlockNumber fsm_nblocks);
     102              : 
     103              : /* functions to convert amount of free space to a FSM category */
     104              : static uint8 fsm_space_avail_to_cat(Size avail);
     105              : static uint8 fsm_space_needed_to_cat(Size needed);
     106              : static Size fsm_space_cat_to_avail(uint8 cat);
     107              : 
     108              : /* workhorse functions for various operations */
     109              : static int  fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
     110              :                                uint8 newValue, uint8 minValue);
     111              : static BlockNumber fsm_search(Relation rel, uint8 min_cat);
     112              : static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
     113              :                              BlockNumber start, BlockNumber end,
     114              :                              bool *eof_p);
     115              : static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber);
     116              : 
     117              : 
     118              : /******** Public API ********/
     119              : 
     120              : /*
     121              :  * GetPageWithFreeSpace - try to find a page in the given relation with
     122              :  *      at least the specified amount of free space.
     123              :  *
     124              :  * If successful, return the block number; if not, return InvalidBlockNumber.
     125              :  *
     126              :  * The caller must be prepared for the possibility that the returned page
     127              :  * will turn out to have too little space available by the time the caller
     128              :  * gets a lock on it.  In that case, the caller should report the actual
     129              :  * amount of free space available on that page and then try again (see
     130              :  * RecordAndGetPageWithFreeSpace).  If InvalidBlockNumber is returned,
     131              :  * extend the relation.
     132              :  *
     133              :  * This can trigger FSM updates if any FSM entry is found to point to a block
     134              :  * past the end of the relation.
     135              :  */
     136              : BlockNumber
     137       110166 : GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
     138              : {
     139       110166 :     uint8       min_cat = fsm_space_needed_to_cat(spaceNeeded);
     140              : 
     141       110166 :     return fsm_search(rel, min_cat);
     142              : }
     143              : 
     144              : /*
     145              :  * RecordAndGetPageWithFreeSpace - update info about a page and try again.
     146              :  *
     147              :  * We provide this combo form to save some locking overhead, compared to
     148              :  * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
     149              :  * also some effort to return a page close to the old page; if there's a
     150              :  * page with enough free space on the same FSM page where the old one page
     151              :  * is located, it is preferred.
     152              :  */
     153              : BlockNumber
     154       142332 : RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
     155              :                               Size oldSpaceAvail, Size spaceNeeded)
     156              : {
     157       142332 :     int         old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
     158       142332 :     int         search_cat = fsm_space_needed_to_cat(spaceNeeded);
     159              :     FSMAddress  addr;
     160              :     uint16      slot;
     161              :     int         search_slot;
     162              : 
     163              :     /* Get the location of the FSM byte representing the heap block */
     164       142332 :     addr = fsm_get_location(oldPage, &slot);
     165              : 
     166       142332 :     search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
     167              : 
     168              :     /*
     169              :      * If fsm_set_and_search found a suitable new block, return that.
     170              :      * Otherwise, search as usual.
     171              :      */
     172       142332 :     if (search_slot != -1)
     173              :     {
     174        15505 :         BlockNumber blknum = fsm_get_heap_blk(addr, search_slot);
     175              : 
     176              :         /*
     177              :          * Check that the blknum is actually in the relation. Don't try to
     178              :          * update the FSM in that case, just fall back to the other case
     179              :          */
     180        15505 :         if (fsm_does_block_exist(rel, blknum))
     181        15505 :             return blknum;
     182              :     }
     183       126827 :     return fsm_search(rel, search_cat);
     184              : }
     185              : 
     186              : /*
     187              :  * RecordPageWithFreeSpace - update info about a page.
     188              :  *
     189              :  * Note that if the new spaceAvail value is higher than the old value stored
     190              :  * in the FSM, the space might not become visible to searchers until the next
     191              :  * FreeSpaceMapVacuum call, which updates the upper level pages.
     192              :  */
     193              : void
     194       468706 : RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
     195              : {
     196       468706 :     int         new_cat = fsm_space_avail_to_cat(spaceAvail);
     197              :     FSMAddress  addr;
     198              :     uint16      slot;
     199              : 
     200              :     /* Get the location of the FSM byte representing the heap block */
     201       468706 :     addr = fsm_get_location(heapBlk, &slot);
     202              : 
     203       468706 :     fsm_set_and_search(rel, addr, slot, new_cat, 0);
     204       468706 : }
     205              : 
     206              : /*
     207              :  * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
     208              :  *      WAL replay
     209              :  */
     210              : void
     211       309552 : XLogRecordPageWithFreeSpace(RelFileLocator rlocator, BlockNumber heapBlk,
     212              :                             Size spaceAvail)
     213              : {
     214       309552 :     int         new_cat = fsm_space_avail_to_cat(spaceAvail);
     215              :     FSMAddress  addr;
     216              :     uint16      slot;
     217              :     BlockNumber blkno;
     218              :     Buffer      buf;
     219              :     Page        page;
     220              : 
     221              :     /* Get the location of the FSM byte representing the heap block */
     222       309552 :     addr = fsm_get_location(heapBlk, &slot);
     223       309552 :     blkno = fsm_logical_to_physical(addr);
     224              : 
     225              :     /* If the page doesn't exist already, extend */
     226       309552 :     buf = XLogReadBufferExtended(rlocator, FSM_FORKNUM, blkno,
     227              :                                  RBM_ZERO_ON_ERROR, InvalidBuffer);
     228       309552 :     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     229              : 
     230       309552 :     page = BufferGetPage(buf);
     231       309552 :     if (PageIsNew(page))
     232          406 :         PageInit(page, BLCKSZ, 0);
     233              : 
     234              :     /*
     235              :      * Changes to FSM are usually marked as changed using MarkBufferDirtyHint;
     236              :      * however, during recovery, it does nothing if checksums are enabled. It
     237              :      * is assumed that the page should not be dirtied during recovery while
     238              :      * modifying hints to prevent torn pages, since no new WAL data can be
     239              :      * generated at this point to store FPI. This is not relevant to the FSM
     240              :      * case, as its blocks are zeroed when a checksum mismatch occurs. So, we
     241              :      * need to use regular MarkBufferDirty here to mark the FSM block as
     242              :      * modified during recovery, otherwise changes to the FSM may be lost.
     243              :      */
     244       309552 :     if (fsm_set_avail(page, slot, new_cat))
     245       303096 :         MarkBufferDirty(buf);
     246       309552 :     UnlockReleaseBuffer(buf);
     247       309552 : }
     248              : 
     249              : /*
     250              :  * GetRecordedFreeSpace - return the amount of free space on a particular page,
     251              :  *      according to the FSM.
     252              :  */
     253              : Size
     254          899 : GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
     255              : {
     256              :     FSMAddress  addr;
     257              :     uint16      slot;
     258              :     Buffer      buf;
     259              :     uint8       cat;
     260              : 
     261              :     /* Get the location of the FSM byte representing the heap block */
     262          899 :     addr = fsm_get_location(heapBlk, &slot);
     263              : 
     264          899 :     buf = fsm_readbuf(rel, addr, false);
     265          899 :     if (!BufferIsValid(buf))
     266           36 :         return 0;
     267          863 :     cat = fsm_get_avail(BufferGetPage(buf), slot);
     268          863 :     ReleaseBuffer(buf);
     269              : 
     270          863 :     return fsm_space_cat_to_avail(cat);
     271              : }
     272              : 
     273              : /*
     274              :  * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation.
     275              :  *
     276              :  * nblocks is the new size of the heap.
     277              :  *
     278              :  * Return the number of blocks of new FSM.
     279              :  * If it's InvalidBlockNumber, there is nothing to truncate;
     280              :  * otherwise the caller is responsible for calling smgrtruncate()
     281              :  * to truncate the FSM pages, and FreeSpaceMapVacuumRange()
     282              :  * to update upper-level pages in the FSM.
     283              :  */
     284              : BlockNumber
     285          218 : FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
     286              : {
     287              :     BlockNumber new_nfsmblocks;
     288              :     FSMAddress  first_removed_address;
     289              :     uint16      first_removed_slot;
     290              :     Buffer      buf;
     291              : 
     292              :     /*
     293              :      * If no FSM has been created yet for this relation, there's nothing to
     294              :      * truncate.
     295              :      */
     296          218 :     if (!smgrexists(RelationGetSmgr(rel), FSM_FORKNUM))
     297            0 :         return InvalidBlockNumber;
     298              : 
     299              :     /* Get the location in the FSM of the first removed heap block */
     300          218 :     first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
     301              : 
     302              :     /*
     303              :      * Zero out the tail of the last remaining FSM page. If the slot
     304              :      * representing the first removed heap block is at a page boundary, as the
     305              :      * first slot on the FSM page that first_removed_address points to, we can
     306              :      * just truncate that page altogether.
     307              :      */
     308          218 :     if (first_removed_slot > 0)
     309              :     {
     310          111 :         buf = fsm_readbuf(rel, first_removed_address, false);
     311          111 :         if (!BufferIsValid(buf))
     312            0 :             return InvalidBlockNumber;  /* nothing to do; the FSM was already
     313              :                                          * smaller */
     314          111 :         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     315              : 
     316              :         /* NO EREPORT(ERROR) from here till changes are logged */
     317          111 :         START_CRIT_SECTION();
     318              : 
     319          111 :         fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
     320              : 
     321              :         /*
     322              :          * This change is non-critical, because fsm_does_block_exist() would
     323              :          * stop us from returning a truncated-away block.  However, since this
     324              :          * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost
     325              :          * of that many fsm_does_block_exist() rejections.  Use a full
     326              :          * MarkBufferDirty(), not MarkBufferDirtyHint().
     327              :          */
     328          111 :         MarkBufferDirty(buf);
     329              : 
     330              :         /*
     331              :          * WAL-log like MarkBufferDirtyHint() might have done, just to avoid
     332              :          * differing from the rest of the file in this respect.  This is
     333              :          * optional; see README mention of full page images.  XXX consider
     334              :          * XLogSaveBufferForHint() for even closer similarity.
     335              :          *
     336              :          * A higher-level operation calls us at WAL replay.  If we crash
     337              :          * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has
     338              :          * not changed, and our fork remains valid.  If we crash after that
     339              :          * flush, redo will return here.
     340              :          */
     341          111 :         if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
     342           91 :             log_newpage_buffer(buf, false);
     343              : 
     344          111 :         END_CRIT_SECTION();
     345              : 
     346          111 :         UnlockReleaseBuffer(buf);
     347              : 
     348          111 :         new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
     349              :     }
     350              :     else
     351              :     {
     352          107 :         new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
     353          107 :         if (smgrnblocks(RelationGetSmgr(rel), FSM_FORKNUM) <= new_nfsmblocks)
     354            0 :             return InvalidBlockNumber;  /* nothing to do; the FSM was already
     355              :                                          * smaller */
     356              :     }
     357              : 
     358          218 :     return new_nfsmblocks;
     359              : }
     360              : 
     361              : /*
     362              :  * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM
     363              :  *
     364              :  * We assume that the bottom-level pages have already been updated with
     365              :  * new free-space information.
     366              :  */
     367              : void
     368          183 : FreeSpaceMapVacuum(Relation rel)
     369              : {
     370              :     bool        dummy;
     371              : 
     372              :     /* Recursively scan the tree, starting at the root */
     373          183 :     (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS,
     374              :                            (BlockNumber) 0, InvalidBlockNumber,
     375              :                            &dummy);
     376          183 : }
     377              : 
     378              : /*
     379              :  * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM
     380              :  *
     381              :  * As above, but assume that only heap pages between start and end-1 inclusive
     382              :  * have new free-space information, so update only the upper-level slots
     383              :  * covering that block range.  end == InvalidBlockNumber is equivalent to
     384              :  * "all the rest of the relation".
     385              :  */
     386              : void
     387        48246 : FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
     388              : {
     389              :     bool        dummy;
     390              : 
     391              :     /* Recursively scan the tree, starting at the root */
     392        48246 :     if (end > start)
     393        48103 :         (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
     394        48246 : }
     395              : 
     396              : /******** Internal routines ********/
     397              : 
     398              : /*
     399              :  * Return category corresponding x bytes of free space
     400              :  */
     401              : static uint8
     402       920590 : fsm_space_avail_to_cat(Size avail)
     403              : {
     404              :     int         cat;
     405              : 
     406              :     Assert(avail < BLCKSZ);
     407              : 
     408       920590 :     if (avail >= MaxFSMRequestSize)
     409        22328 :         return 255;
     410              : 
     411       898262 :     cat = avail / FSM_CAT_STEP;
     412              : 
     413              :     /*
     414              :      * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
     415              :      * more.
     416              :      */
     417       898262 :     if (cat > 254)
     418            0 :         cat = 254;
     419              : 
     420       898262 :     return (uint8) cat;
     421              : }
     422              : 
     423              : /*
     424              :  * Return the lower bound of the range of free space represented by given
     425              :  * category.
     426              :  */
     427              : static Size
     428          863 : fsm_space_cat_to_avail(uint8 cat)
     429              : {
     430              :     /* The highest category represents exactly MaxFSMRequestSize bytes. */
     431          863 :     if (cat == 255)
     432          234 :         return MaxFSMRequestSize;
     433              :     else
     434          629 :         return cat * FSM_CAT_STEP;
     435              : }
     436              : 
     437              : /*
     438              :  * Which category does a page need to have, to accommodate x bytes of data?
     439              :  * While fsm_space_avail_to_cat() rounds down, this needs to round up.
     440              :  */
     441              : static uint8
     442       252498 : fsm_space_needed_to_cat(Size needed)
     443              : {
     444              :     int         cat;
     445              : 
     446              :     /* Can't ask for more space than the highest category represents */
     447       252498 :     if (needed > MaxFSMRequestSize)
     448            0 :         elog(ERROR, "invalid FSM request size %zu", needed);
     449              : 
     450       252498 :     if (needed == 0)
     451            0 :         return 1;
     452              : 
     453       252498 :     cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
     454              : 
     455       252498 :     if (cat > 255)
     456            0 :         cat = 255;
     457              : 
     458       252498 :     return (uint8) cat;
     459              : }
     460              : 
     461              : /*
     462              :  * Returns the physical block number of a FSM page
     463              :  */
     464              : static BlockNumber
     465      1339245 : fsm_logical_to_physical(FSMAddress addr)
     466              : {
     467              :     BlockNumber pages;
     468              :     int         leafno;
     469              :     int         l;
     470              : 
     471              :     /*
     472              :      * Calculate the logical page number of the first leaf page below the
     473              :      * given page.
     474              :      */
     475      1339245 :     leafno = addr.logpageno;
     476      1982906 :     for (l = 0; l < addr.level; l++)
     477       643661 :         leafno *= SlotsPerFSMPage;
     478              : 
     479              :     /* Count upper level nodes required to address the leaf page */
     480      1339245 :     pages = 0;
     481      5356980 :     for (l = 0; l < FSM_TREE_DEPTH; l++)
     482              :     {
     483      4017735 :         pages += leafno + 1;
     484      4017735 :         leafno /= SlotsPerFSMPage;
     485              :     }
     486              : 
     487              :     /*
     488              :      * If the page we were asked for wasn't at the bottom level, subtract the
     489              :      * additional lower level pages we counted above.
     490              :      */
     491      1339245 :     pages -= addr.level;
     492              : 
     493              :     /* Turn the page count into 0-based block number */
     494      1339245 :     return pages - 1;
     495              : }
     496              : 
     497              : /*
     498              :  * Return the FSM location corresponding to given heap block.
     499              :  */
     500              : static FSMAddress
     501      1114763 : fsm_get_location(BlockNumber heapblk, uint16 *slot)
     502              : {
     503              :     FSMAddress  addr;
     504              : 
     505      1114763 :     addr.level = FSM_BOTTOM_LEVEL;
     506      1114763 :     addr.logpageno = heapblk / SlotsPerFSMPage;
     507      1114763 :     *slot = heapblk % SlotsPerFSMPage;
     508              : 
     509      1114763 :     return addr;
     510              : }
     511              : 
     512              : /*
     513              :  * Return the heap block number corresponding to given location in the FSM.
     514              :  */
     515              : static BlockNumber
     516        28552 : fsm_get_heap_blk(FSMAddress addr, uint16 slot)
     517              : {
     518              :     Assert(addr.level == FSM_BOTTOM_LEVEL);
     519        28552 :     return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
     520              : }
     521              : 
     522              : /*
     523              :  * Given a logical address of a child page, get the logical page number of
     524              :  * the parent, and the slot within the parent corresponding to the child.
     525              :  */
     526              : static FSMAddress
     527       292120 : fsm_get_parent(FSMAddress child, uint16 *slot)
     528              : {
     529              :     FSMAddress  parent;
     530              : 
     531              :     Assert(child.level < FSM_ROOT_LEVEL);
     532              : 
     533       292120 :     parent.level = child.level + 1;
     534       292120 :     parent.logpageno = child.logpageno / SlotsPerFSMPage;
     535       292120 :     *slot = child.logpageno % SlotsPerFSMPage;
     536              : 
     537       292120 :     return parent;
     538              : }
     539              : 
     540              : /*
     541              :  * Given a logical address of a parent page and a slot number, get the
     542              :  * logical address of the corresponding child page.
     543              :  */
     544              : static FSMAddress
     545       127076 : fsm_get_child(FSMAddress parent, uint16 slot)
     546              : {
     547              :     FSMAddress  child;
     548              : 
     549              :     Assert(parent.level > FSM_BOTTOM_LEVEL);
     550              : 
     551       127076 :     child.level = parent.level - 1;
     552       127076 :     child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
     553              : 
     554       127076 :     return child;
     555              : }
     556              : 
     557              : /*
     558              :  * Read a FSM page.
     559              :  *
     560              :  * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
     561              :  * true, the FSM file is extended.
     562              :  */
     563              : static Buffer
     564      1029475 : fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
     565              : {
     566      1029475 :     BlockNumber blkno = fsm_logical_to_physical(addr);
     567              :     Buffer      buf;
     568      1029475 :     SMgrRelation reln = RelationGetSmgr(rel);
     569              : 
     570              :     /*
     571              :      * If we haven't cached the size of the FSM yet, check it first.  Also
     572              :      * recheck if the requested block seems to be past end, since our cached
     573              :      * value might be stale.  (We send smgr inval messages on truncation, but
     574              :      * not on extension.)
     575              :      */
     576      1029475 :     if (reln->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber ||
     577       909446 :         blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
     578              :     {
     579              :         /* Invalidate the cache so smgrnblocks asks the kernel. */
     580       160472 :         reln->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
     581       160472 :         if (smgrexists(reln, FSM_FORKNUM))
     582        83531 :             smgrnblocks(reln, FSM_FORKNUM);
     583              :         else
     584        76941 :             reln->smgr_cached_nblocks[FSM_FORKNUM] = 0;
     585              :     }
     586              : 
     587              :     /*
     588              :      * For reading we use ZERO_ON_ERROR mode, and initialize the page if
     589              :      * necessary.  The FSM information is not accurate anyway, so it's better
     590              :      * to clear corrupt pages than error out. Since the FSM changes are not
     591              :      * WAL-logged, the so-called torn page problem on crash can lead to pages
     592              :      * with corrupt headers, for example.
     593              :      *
     594              :      * We use the same path below to initialize pages when extending the
     595              :      * relation, as a concurrent extension can end up with vm_extend()
     596              :      * returning an already-initialized page.
     597              :      */
     598      1029475 :     if (blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
     599              :     {
     600        77736 :         if (extend)
     601         4862 :             buf = fsm_extend(rel, blkno + 1);
     602              :         else
     603        72874 :             return InvalidBuffer;
     604              :     }
     605              :     else
     606       951739 :         buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
     607              : 
     608              :     /*
     609              :      * Initializing the page when needed is trickier than it looks, because of
     610              :      * the possibility of multiple backends doing this concurrently, and our
     611              :      * desire to not uselessly take the buffer lock in the normal path where
     612              :      * the page is OK.  We must take the lock to initialize the page, so
     613              :      * recheck page newness after we have the lock, in case someone else
     614              :      * already did it.  Also, because we initially check PageIsNew with no
     615              :      * lock, it's possible to fall through and return the buffer while someone
     616              :      * else is still initializing the page (i.e., we might see pd_upper as set
     617              :      * but other page header fields are still zeroes).  This is harmless for
     618              :      * callers that will take a buffer lock themselves, but some callers
     619              :      * inspect the page without any lock at all.  The latter is OK only so
     620              :      * long as it doesn't depend on the page header having correct contents.
     621              :      * Current usage is safe because PageGetContents() does not require that.
     622              :      */
     623       956601 :     if (PageIsNew(BufferGetPage(buf)))
     624              :     {
     625        16169 :         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     626        16169 :         if (PageIsNew(BufferGetPage(buf)))
     627        16169 :             PageInit(BufferGetPage(buf), BLCKSZ, 0);
     628        16169 :         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     629              :     }
     630       956601 :     return buf;
     631              : }
     632              : 
     633              : /*
     634              :  * Ensure that the FSM fork is at least fsm_nblocks long, extending
     635              :  * it if necessary with empty pages. And by empty, I mean pages filled
     636              :  * with zeros, meaning there's no free space.
     637              :  */
     638              : static Buffer
     639         4862 : fsm_extend(Relation rel, BlockNumber fsm_nblocks)
     640              : {
     641         4862 :     return ExtendBufferedRelTo(BMR_REL(rel), FSM_FORKNUM, NULL,
     642              :                                EB_CREATE_FORK_IF_NEEDED |
     643              :                                EB_CLEAR_SIZE_CACHE,
     644              :                                fsm_nblocks,
     645              :                                RBM_ZERO_ON_ERROR);
     646              : }
     647              : 
     648              : /*
     649              :  * Set value in given FSM page and slot.
     650              :  *
     651              :  * If minValue > 0, the updated page is also searched for a page with at
     652              :  * least minValue of free space. If one is found, its slot number is
     653              :  * returned, -1 otherwise.
     654              :  */
     655              : static int
     656       613574 : fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
     657              :                    uint8 newValue, uint8 minValue)
     658              : {
     659              :     Buffer      buf;
     660              :     Page        page;
     661       613574 :     int         newslot = -1;
     662              : 
     663       613574 :     buf = fsm_readbuf(rel, addr, true);
     664       613574 :     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     665              : 
     666       613574 :     page = BufferGetPage(buf);
     667              : 
     668       613574 :     if (fsm_set_avail(page, slot, newValue))
     669       127012 :         MarkBufferDirtyHint(buf, false);
     670              : 
     671       613574 :     if (minValue != 0)
     672              :     {
     673              :         /* Search while we still hold the lock */
     674       142332 :         newslot = fsm_search_avail(buf, minValue,
     675       142332 :                                    addr.level == FSM_BOTTOM_LEVEL,
     676              :                                    true);
     677              :     }
     678              : 
     679       613574 :     UnlockReleaseBuffer(buf);
     680              : 
     681       613574 :     return newslot;
     682              : }
     683              : 
     684              : /*
     685              :  * Search the tree for a heap page with at least min_cat of free space
     686              :  */
     687              : static BlockNumber
     688       236993 : fsm_search(Relation rel, uint8 min_cat)
     689              : {
     690       236993 :     int         restarts = 0;
     691       236993 :     FSMAddress  addr = FSM_ROOT_ADDRESS;
     692              : 
     693              :     for (;;)
     694        32433 :     {
     695              :         int         slot;
     696              :         Buffer      buf;
     697       269426 :         uint8       max_avail = 0;
     698              : 
     699              :         /* Read the FSM page. */
     700       269426 :         buf = fsm_readbuf(rel, addr, false);
     701              : 
     702              :         /* Search within the page */
     703       269426 :         if (BufferIsValid(buf))
     704              :         {
     705       197368 :             LockBuffer(buf, BUFFER_LOCK_SHARE);
     706       197368 :             slot = fsm_search_avail(buf, min_cat,
     707       197368 :                                     (addr.level == FSM_BOTTOM_LEVEL),
     708              :                                     false);
     709       197368 :             if (slot == -1)
     710              :             {
     711       154424 :                 max_avail = fsm_get_max_avail(BufferGetPage(buf));
     712       154424 :                 UnlockReleaseBuffer(buf);
     713              :             }
     714              :             else
     715              :             {
     716              :                 /* Keep the pin for possible update below */
     717        42944 :                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     718              :             }
     719              :         }
     720              :         else
     721        72058 :             slot = -1;
     722              : 
     723       269426 :         if (slot != -1)
     724              :         {
     725              :             /*
     726              :              * Descend the tree, or return the found block if we're at the
     727              :              * bottom.
     728              :              */
     729        42944 :             if (addr.level == FSM_BOTTOM_LEVEL)
     730              :             {
     731        13047 :                 BlockNumber blkno = fsm_get_heap_blk(addr, slot);
     732              :                 Page        page;
     733              : 
     734        13047 :                 if (fsm_does_block_exist(rel, blkno))
     735              :                 {
     736        13047 :                     ReleaseBuffer(buf);
     737        13047 :                     return blkno;
     738              :                 }
     739              : 
     740              :                 /*
     741              :                  * Block is past the end of the relation.  Update FSM, and
     742              :                  * restart from root.  The usual "advancenext" behavior is
     743              :                  * pessimal for this rare scenario, since every later slot is
     744              :                  * unusable in the same way.  We could zero all affected slots
     745              :                  * on the same FSM page, but don't bet on the benefits of that
     746              :                  * optimization justifying its compiled code bulk.
     747              :                  */
     748            0 :                 page = BufferGetPage(buf);
     749            0 :                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     750            0 :                 fsm_set_avail(page, slot, 0);
     751            0 :                 MarkBufferDirtyHint(buf, false);
     752            0 :                 UnlockReleaseBuffer(buf);
     753            0 :                 if (restarts++ > 10000) /* same rationale as below */
     754            0 :                     return InvalidBlockNumber;
     755            0 :                 addr = FSM_ROOT_ADDRESS;
     756              :             }
     757              :             else
     758              :             {
     759        29897 :                 ReleaseBuffer(buf);
     760              :             }
     761        29897 :             addr = fsm_get_child(addr, slot);
     762              :         }
     763       226482 :         else if (addr.level == FSM_ROOT_LEVEL)
     764              :         {
     765              :             /*
     766              :              * At the root, failure means there's no page with enough free
     767              :              * space in the FSM. Give up.
     768              :              */
     769       223946 :             return InvalidBlockNumber;
     770              :         }
     771              :         else
     772              :         {
     773              :             uint16      parentslot;
     774              :             FSMAddress  parent;
     775              : 
     776              :             /*
     777              :              * At lower level, failure can happen if the value in the upper-
     778              :              * level node didn't reflect the value on the lower page. Update
     779              :              * the upper node, to avoid falling into the same trap again, and
     780              :              * start over.
     781              :              *
     782              :              * There's a race condition here, if another backend updates this
     783              :              * page right after we release it, and gets the lock on the parent
     784              :              * page before us. We'll then update the parent page with the now
     785              :              * stale information we had. It's OK, because it should happen
     786              :              * rarely, and will be fixed by the next vacuum.
     787              :              */
     788         2536 :             parent = fsm_get_parent(addr, &parentslot);
     789         2536 :             fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
     790              : 
     791              :             /*
     792              :              * If the upper pages are badly out of date, we might need to loop
     793              :              * quite a few times, updating them as we go. Any inconsistencies
     794              :              * should eventually be corrected and the loop should end. Looping
     795              :              * indefinitely is nevertheless scary, so provide an emergency
     796              :              * valve.
     797              :              */
     798         2536 :             if (restarts++ > 10000)
     799            0 :                 return InvalidBlockNumber;
     800              : 
     801              :             /* Start search all over from the root */
     802         2536 :             addr = FSM_ROOT_ADDRESS;
     803              :         }
     804              :     }
     805              : }
     806              : 
     807              : 
     808              : /*
     809              :  * Recursive guts of FreeSpaceMapVacuum
     810              :  *
     811              :  * Examine the FSM page indicated by addr, as well as its children, updating
     812              :  * upper-level nodes that cover the heap block range from start to end-1.
     813              :  * (It's okay if end is beyond the actual end of the map.)
     814              :  * Return the maximum freespace value on this page.
     815              :  *
     816              :  * If addr is past the end of the FSM, set *eof_p to true and return 0.
     817              :  *
     818              :  * This traverses the tree in depth-first order.  The tree is stored
     819              :  * physically in depth-first order, so this should be pretty I/O efficient.
     820              :  */
     821              : static uint8
     822       145465 : fsm_vacuum_page(Relation rel, FSMAddress addr,
     823              :                 BlockNumber start, BlockNumber end,
     824              :                 bool *eof_p)
     825              : {
     826              :     Buffer      buf;
     827              :     Page        page;
     828              :     uint8       max_avail;
     829              : 
     830              :     /* Read the page if it exists, or return EOF */
     831       145465 :     buf = fsm_readbuf(rel, addr, false);
     832       145465 :     if (!BufferIsValid(buf))
     833              :     {
     834          780 :         *eof_p = true;
     835          780 :         return 0;
     836              :     }
     837              :     else
     838       144685 :         *eof_p = false;
     839              : 
     840       144685 :     page = BufferGetPage(buf);
     841              : 
     842              :     /*
     843              :      * If we're above the bottom level, recurse into children, and fix the
     844              :      * information stored about them at this level.
     845              :      */
     846       144685 :     if (addr.level > FSM_BOTTOM_LEVEL)
     847              :     {
     848              :         FSMAddress  fsm_start,
     849              :                     fsm_end;
     850              :         uint16      fsm_start_slot,
     851              :                     fsm_end_slot;
     852              :         int         slot,
     853              :                     start_slot,
     854              :                     end_slot;
     855        96528 :         bool        eof = false;
     856              : 
     857              :         /*
     858              :          * Compute the range of slots we need to update on this page, given
     859              :          * the requested range of heap blocks to consider.  The first slot to
     860              :          * update is the one covering the "start" block, and the last slot is
     861              :          * the one covering "end - 1".  (Some of this work will be duplicated
     862              :          * in each recursive call, but it's cheap enough to not worry about.)
     863              :          */
     864        96528 :         fsm_start = fsm_get_location(start, &fsm_start_slot);
     865        96528 :         fsm_end = fsm_get_location(end - 1, &fsm_end_slot);
     866              : 
     867       241320 :         while (fsm_start.level < addr.level)
     868              :         {
     869       144792 :             fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot);
     870       144792 :             fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot);
     871              :         }
     872              :         Assert(fsm_start.level == addr.level);
     873              : 
     874        96528 :         if (fsm_start.logpageno == addr.logpageno)
     875        96528 :             start_slot = fsm_start_slot;
     876            0 :         else if (fsm_start.logpageno > addr.logpageno)
     877            0 :             start_slot = SlotsPerFSMPage;   /* shouldn't get here... */
     878              :         else
     879            0 :             start_slot = 0;
     880              : 
     881        96528 :         if (fsm_end.logpageno == addr.logpageno)
     882        96149 :             end_slot = fsm_end_slot;
     883          379 :         else if (fsm_end.logpageno > addr.logpageno)
     884          379 :             end_slot = SlotsPerFSMPage - 1;
     885              :         else
     886            0 :             end_slot = -1;      /* shouldn't get here... */
     887              : 
     888      1832989 :         for (slot = start_slot; slot <= end_slot; slot++)
     889              :         {
     890              :             int         child_avail;
     891              : 
     892      1736461 :             CHECK_FOR_INTERRUPTS();
     893              : 
     894              :             /* After we hit end-of-file, just clear the rest of the slots */
     895      1736461 :             if (!eof)
     896        97179 :                 child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
     897              :                                               start, end,
     898              :                                               &eof);
     899              :             else
     900      1639282 :                 child_avail = 0;
     901              : 
     902              :             /* Update information about the child */
     903      1736461 :             if (fsm_get_avail(page, slot) != child_avail)
     904              :             {
     905         9298 :                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     906         9298 :                 fsm_set_avail(page, slot, child_avail);
     907         9298 :                 MarkBufferDirtyHint(buf, false);
     908         9298 :                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     909              :             }
     910              :         }
     911              :     }
     912              : 
     913              :     /* Now get the maximum value on the page, to return to caller */
     914       144685 :     max_avail = fsm_get_max_avail(page);
     915              : 
     916              :     /*
     917              :      * Try to reset the next slot pointer. This encourages the use of
     918              :      * low-numbered pages, increasing the chances that a later vacuum can
     919              :      * truncate the relation. We don't bother with marking the page dirty if
     920              :      * it wasn't already, since this is just a hint.
     921              :      */
     922       144685 :     LockBuffer(buf, BUFFER_LOCK_SHARE);
     923       144685 :     if (BufferBeginSetHintBits(buf))
     924              :     {
     925       144685 :         ((FSMPage) PageGetContents(page))->fp_next_slot = 0;
     926       144685 :         BufferFinishSetHintBits(buf, false, false);
     927              :     }
     928              : 
     929       144685 :     UnlockReleaseBuffer(buf);
     930              : 
     931       144685 :     return max_avail;
     932              : }
     933              : 
     934              : 
     935              : /*
     936              :  * Check whether a block number is past the end of the relation.  This can
     937              :  * happen after WAL replay, if the FSM reached disk but newly-extended pages
     938              :  * it refers to did not.
     939              :  */
     940              : static bool
     941        28552 : fsm_does_block_exist(Relation rel, BlockNumber blknumber)
     942              : {
     943        28552 :     SMgrRelation smgr = RelationGetSmgr(rel);
     944              : 
     945              :     /*
     946              :      * If below the cached nblocks, the block surely exists.  Otherwise, we
     947              :      * face a trade-off.  We opt to compare to a fresh nblocks, incurring
     948              :      * lseek() overhead.  The alternative would be to assume the block does
     949              :      * not exist, but that would cause FSM to set zero space available for
     950              :      * blocks that main fork extension just recorded.
     951              :      */
     952        28552 :     return ((BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) &&
     953        40140 :              blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) ||
     954        11588 :             blknumber < RelationGetNumberOfBlocks(rel));
     955              : }
        

Generated by: LCOV version 2.0-1