LCOV - code coverage report
Current view: top level - src/backend/storage/freespace - freespace.c (source / functions) Hit Total Coverage
Test: PostgreSQL 13devel Lines: 198 222 89.2 %
Date: 2019-11-21 14:06:36 Functions: 19 21 90.5 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * freespace.c
       4             :  *    POSTGRES free space map for quickly finding free space in relations
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1994, Regents of the University of California
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/storage/freespace/freespace.c
      12             :  *
      13             :  *
      14             :  * NOTES:
      15             :  *
      16             :  *  Free Space Map keeps track of the amount of free space on pages, and
      17             :  *  allows quickly searching for a page with enough free space. The FSM is
      18             :  *  stored in a dedicated relation fork of all heap relations, and those
      19             :  *  index access methods that need it (see also indexfsm.c). See README for
      20             :  *  more information.
      21             :  *
      22             :  *-------------------------------------------------------------------------
      23             :  */
      24             : #include "postgres.h"
      25             : 
      26             : #include "access/htup_details.h"
      27             : #include "access/xlogutils.h"
      28             : #include "miscadmin.h"
      29             : #include "storage/freespace.h"
      30             : #include "storage/fsm_internals.h"
      31             : #include "storage/lmgr.h"
      32             : #include "storage/smgr.h"
      33             : 
      34             : 
      35             : /*
      36             :  * We use just one byte to store the amount of free space on a page, so we
      37             :  * divide the amount of free space a page can have into 256 different
      38             :  * categories. The highest category, 255, represents a page with at least
      39             :  * MaxFSMRequestSize bytes of free space, and the second highest category
      40             :  * represents the range from 254 * FSM_CAT_STEP, inclusive, to
      41             :  * MaxFSMRequestSize, exclusive.
      42             :  *
      43             :  * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
      44             :  * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
      45             :  * categories look like this:
      46             :  *
      47             :  *
      48             :  * Range     Category
      49             :  * 0    - 31   0
      50             :  * 32   - 63   1
      51             :  * ...    ...  ...
      52             :  * 8096 - 8127 253
      53             :  * 8128 - 8163 254
      54             :  * 8164 - 8192 255
      55             :  *
      56             :  * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
      57             :  * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
      58             :  * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
      59             :  * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
      60             :  * completely empty page, that would mean that we could never satisfy a
      61             :  * request of exactly MaxFSMRequestSize bytes.
      62             :  */
      63             : #define FSM_CATEGORIES  256
      64             : #define FSM_CAT_STEP    (BLCKSZ / FSM_CATEGORIES)
      65             : #define MaxFSMRequestSize   MaxHeapTupleSize
      66             : 
      67             : /*
      68             :  * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
      69             :  * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
      70             :  * 216 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
      71             :  * this means that 4096 bytes is the smallest BLCKSZ that we can get away
      72             :  * with a 3-level tree, and 512 is the smallest we support.
      73             :  */
      74             : #define FSM_TREE_DEPTH  ((SlotsPerFSMPage >= 1626) ? 3 : 4)
      75             : 
      76             : #define FSM_ROOT_LEVEL  (FSM_TREE_DEPTH - 1)
      77             : #define FSM_BOTTOM_LEVEL 0
      78             : 
      79             : /*
      80             :  * The internal FSM routines work on a logical addressing scheme. Each
      81             :  * level of the tree can be thought of as a separately addressable file.
      82             :  */
      83             : typedef struct
      84             : {
      85             :     int         level;          /* level */
      86             :     int         logpageno;      /* page number within the level */
      87             : } FSMAddress;
      88             : 
      89             : /* Address of the root page. */
      90             : static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
      91             : 
      92             : /* functions to navigate the tree */
      93             : static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
      94             : static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
      95             : static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
      96             : static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
      97             : static BlockNumber fsm_logical_to_physical(FSMAddress addr);
      98             : 
      99             : static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
     100             : static void fsm_extend(Relation rel, BlockNumber fsm_nblocks);
     101             : 
     102             : /* functions to convert amount of free space to a FSM category */
     103             : static uint8 fsm_space_avail_to_cat(Size avail);
     104             : static uint8 fsm_space_needed_to_cat(Size needed);
     105             : static Size fsm_space_cat_to_avail(uint8 cat);
     106             : 
     107             : /* workhorse functions for various operations */
     108             : static int  fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
     109             :                                uint8 newValue, uint8 minValue);
     110             : static BlockNumber fsm_search(Relation rel, uint8 min_cat);
     111             : static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
     112             :                              BlockNumber start, BlockNumber end,
     113             :                              bool *eof);
     114             : 
     115             : 
     116             : /******** Public API ********/
     117             : 
     118             : /*
     119             :  * GetPageWithFreeSpace - try to find a page in the given relation with
     120             :  *      at least the specified amount of free space.
     121             :  *
     122             :  * If successful, return the block number; if not, return InvalidBlockNumber.
     123             :  *
     124             :  * The caller must be prepared for the possibility that the returned page
     125             :  * will turn out to have too little space available by the time the caller
     126             :  * gets a lock on it.  In that case, the caller should report the actual
     127             :  * amount of free space available on that page and then try again (see
     128             :  * RecordAndGetPageWithFreeSpace).  If InvalidBlockNumber is returned,
     129             :  * extend the relation.
     130             :  */
     131             : BlockNumber
     132      197344 : GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
     133             : {
     134      197344 :     uint8       min_cat = fsm_space_needed_to_cat(spaceNeeded);
     135             : 
     136      197344 :     return fsm_search(rel, min_cat);
     137             : }
     138             : 
     139             : /*
     140             :  * RecordAndGetPageWithFreeSpace - update info about a page and try again.
     141             :  *
     142             :  * We provide this combo form to save some locking overhead, compared to
     143             :  * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
     144             :  * also some effort to return a page close to the old page; if there's a
     145             :  * page with enough free space on the same FSM page where the old one page
     146             :  * is located, it is preferred.
     147             :  */
     148             : BlockNumber
     149      237008 : RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
     150             :                               Size oldSpaceAvail, Size spaceNeeded)
     151             : {
     152      237008 :     int         old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
     153      237008 :     int         search_cat = fsm_space_needed_to_cat(spaceNeeded);
     154             :     FSMAddress  addr;
     155             :     uint16      slot;
     156             :     int         search_slot;
     157             : 
     158             :     /* Get the location of the FSM byte representing the heap block */
     159      237008 :     addr = fsm_get_location(oldPage, &slot);
     160             : 
     161      237008 :     search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
     162             : 
     163             :     /*
     164             :      * If fsm_set_and_search found a suitable new block, return that.
     165             :      * Otherwise, search as usual.
     166             :      */
     167      237008 :     if (search_slot != -1)
     168       15358 :         return fsm_get_heap_blk(addr, search_slot);
     169             :     else
     170      221650 :         return fsm_search(rel, search_cat);
     171             : }
     172             : 
     173             : /*
     174             :  * RecordPageWithFreeSpace - update info about a page.
     175             :  *
     176             :  * Note that if the new spaceAvail value is higher than the old value stored
     177             :  * in the FSM, the space might not become visible to searchers until the next
     178             :  * FreeSpaceMapVacuum call, which updates the upper level pages.
     179             :  */
     180             : void
     181      178748 : RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
     182             : {
     183      178748 :     int         new_cat = fsm_space_avail_to_cat(spaceAvail);
     184             :     FSMAddress  addr;
     185             :     uint16      slot;
     186             : 
     187             :     /* Get the location of the FSM byte representing the heap block */
     188      178748 :     addr = fsm_get_location(heapBlk, &slot);
     189             : 
     190      178748 :     fsm_set_and_search(rel, addr, slot, new_cat, 0);
     191      178748 : }
     192             : 
     193             : /*
     194             :  * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
     195             :  *      WAL replay
     196             :  */
     197             : void
     198       31006 : XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
     199             :                             Size spaceAvail)
     200             : {
     201       31006 :     int         new_cat = fsm_space_avail_to_cat(spaceAvail);
     202             :     FSMAddress  addr;
     203             :     uint16      slot;
     204             :     BlockNumber blkno;
     205             :     Buffer      buf;
     206             :     Page        page;
     207             : 
     208             :     /* Get the location of the FSM byte representing the heap block */
     209       31006 :     addr = fsm_get_location(heapBlk, &slot);
     210       31006 :     blkno = fsm_logical_to_physical(addr);
     211             : 
     212             :     /* If the page doesn't exist already, extend */
     213       31006 :     buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR);
     214       31006 :     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     215             : 
     216       31006 :     page = BufferGetPage(buf);
     217       31006 :     if (PageIsNew(page))
     218          26 :         PageInit(page, BLCKSZ, 0);
     219             : 
     220       31006 :     if (fsm_set_avail(page, slot, new_cat))
     221       30382 :         MarkBufferDirtyHint(buf, false);
     222       31006 :     UnlockReleaseBuffer(buf);
     223       31006 : }
     224             : 
     225             : /*
     226             :  * GetRecordedFreeSpace - return the amount of free space on a particular page,
     227             :  *      according to the FSM.
     228             :  */
     229             : Size
     230           0 : GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
     231             : {
     232             :     FSMAddress  addr;
     233             :     uint16      slot;
     234             :     Buffer      buf;
     235             :     uint8       cat;
     236             : 
     237             :     /* Get the location of the FSM byte representing the heap block */
     238           0 :     addr = fsm_get_location(heapBlk, &slot);
     239             : 
     240           0 :     buf = fsm_readbuf(rel, addr, false);
     241           0 :     if (!BufferIsValid(buf))
     242           0 :         return 0;
     243           0 :     cat = fsm_get_avail(BufferGetPage(buf), slot);
     244           0 :     ReleaseBuffer(buf);
     245             : 
     246           0 :     return fsm_space_cat_to_avail(cat);
     247             : }
     248             : 
     249             : /*
     250             :  * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation.
     251             :  *
     252             :  * nblocks is the new size of the heap.
     253             :  *
     254             :  * Return the number of blocks of new FSM.
     255             :  * If it's InvalidBlockNumber, there is nothing to truncate;
     256             :  * otherwise the caller is responsible for calling smgrtruncate()
     257             :  * to truncate the FSM pages, and FreeSpaceMapVacuumRange()
     258             :  * to update upper-level pages in the FSM.
     259             :  */
     260             : BlockNumber
     261         122 : FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
     262             : {
     263             :     BlockNumber new_nfsmblocks;
     264             :     FSMAddress  first_removed_address;
     265             :     uint16      first_removed_slot;
     266             :     Buffer      buf;
     267             : 
     268         122 :     RelationOpenSmgr(rel);
     269             : 
     270             :     /*
     271             :      * If no FSM has been created yet for this relation, there's nothing to
     272             :      * truncate.
     273             :      */
     274         122 :     if (!smgrexists(rel->rd_smgr, FSM_FORKNUM))
     275           0 :         return InvalidBlockNumber;
     276             : 
     277             :     /* Get the location in the FSM of the first removed heap block */
     278         122 :     first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
     279             : 
     280             :     /*
     281             :      * Zero out the tail of the last remaining FSM page. If the slot
     282             :      * representing the first removed heap block is at a page boundary, as the
     283             :      * first slot on the FSM page that first_removed_address points to, we can
     284             :      * just truncate that page altogether.
     285             :      */
     286         122 :     if (first_removed_slot > 0)
     287             :     {
     288          72 :         buf = fsm_readbuf(rel, first_removed_address, false);
     289          72 :         if (!BufferIsValid(buf))
     290           0 :             return InvalidBlockNumber;  /* nothing to do; the FSM was already smaller */
     291          72 :         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     292             : 
     293             :         /* NO EREPORT(ERROR) from here till changes are logged */
     294          72 :         START_CRIT_SECTION();
     295             : 
     296          72 :         fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
     297             : 
     298             :         /*
     299             :          * Truncation of a relation is WAL-logged at a higher-level, and we
     300             :          * will be called at WAL replay. But if checksums are enabled, we need
     301             :          * to still write a WAL record to protect against a torn page, if the
     302             :          * page is flushed to disk before the truncation WAL record. We cannot
     303             :          * use MarkBufferDirtyHint here, because that will not dirty the page
     304             :          * during recovery.
     305             :          */
     306          72 :         MarkBufferDirty(buf);
     307          72 :         if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
     308           4 :             log_newpage_buffer(buf, false);
     309             : 
     310          72 :         END_CRIT_SECTION();
     311             : 
     312          72 :         UnlockReleaseBuffer(buf);
     313             : 
     314          72 :         new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
     315             :     }
     316             :     else
     317             :     {
     318          50 :         new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
     319          50 :         if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
     320           0 :             return InvalidBlockNumber;  /* nothing to do; the FSM was already smaller */
     321             :     }
     322             : 
     323         122 :     return new_nfsmblocks;
     324             : }
     325             : 
     326             : /*
     327             :  * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM
     328             :  *
     329             :  * We assume that the bottom-level pages have already been updated with
     330             :  * new free-space information.
     331             :  */
     332             : void
     333         132 : FreeSpaceMapVacuum(Relation rel)
     334             : {
     335             :     bool        dummy;
     336             : 
     337             :     /* Recursively scan the tree, starting at the root */
     338         132 :     (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS,
     339             :                            (BlockNumber) 0, InvalidBlockNumber,
     340             :                            &dummy);
     341         132 : }
     342             : 
     343             : /*
     344             :  * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM
     345             :  *
     346             :  * As above, but assume that only heap pages between start and end-1 inclusive
     347             :  * have new free-space information, so update only the upper-level slots
     348             :  * covering that block range.  end == InvalidBlockNumber is equivalent to
     349             :  * "all the rest of the relation".
     350             :  */
     351             : void
     352       16660 : FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
     353             : {
     354             :     bool        dummy;
     355             : 
     356             :     /* Recursively scan the tree, starting at the root */
     357       16660 :     if (end > start)
     358       16660 :         (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
     359       16660 : }
     360             : 
     361             : /******** Internal routines ********/
     362             : 
     363             : /*
     364             :  * Return category corresponding x bytes of free space
     365             :  */
     366             : static uint8
     367      446762 : fsm_space_avail_to_cat(Size avail)
     368             : {
     369             :     int         cat;
     370             : 
     371             :     Assert(avail < BLCKSZ);
     372             : 
     373      446762 :     if (avail >= MaxFSMRequestSize)
     374        4848 :         return 255;
     375             : 
     376      441914 :     cat = avail / FSM_CAT_STEP;
     377             : 
     378             :     /*
     379             :      * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
     380             :      * more.
     381             :      */
     382      441914 :     if (cat > 254)
     383           0 :         cat = 254;
     384             : 
     385      441914 :     return (uint8) cat;
     386             : }
     387             : 
     388             : /*
     389             :  * Return the lower bound of the range of free space represented by given
     390             :  * category.
     391             :  */
     392             : static Size
     393           0 : fsm_space_cat_to_avail(uint8 cat)
     394             : {
     395             :     /* The highest category represents exactly MaxFSMRequestSize bytes. */
     396           0 :     if (cat == 255)
     397           0 :         return MaxFSMRequestSize;
     398             :     else
     399           0 :         return cat * FSM_CAT_STEP;
     400             : }
     401             : 
     402             : /*
     403             :  * Which category does a page need to have, to accommodate x bytes of data?
     404             :  * While fsm_space_avail_to_cat() rounds down, this needs to round up.
     405             :  */
     406             : static uint8
     407      434352 : fsm_space_needed_to_cat(Size needed)
     408             : {
     409             :     int         cat;
     410             : 
     411             :     /* Can't ask for more space than the highest category represents */
     412      434352 :     if (needed > MaxFSMRequestSize)
     413           0 :         elog(ERROR, "invalid FSM request size %zu", needed);
     414             : 
     415      434352 :     if (needed == 0)
     416           0 :         return 1;
     417             : 
     418      434352 :     cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
     419             : 
     420      434352 :     if (cat > 255)
     421           0 :         cat = 255;
     422             : 
     423      434352 :     return (uint8) cat;
     424             : }
     425             : 
     426             : /*
     427             :  * Returns the physical block number of a FSM page
     428             :  */
     429             : static BlockNumber
     430      945360 : fsm_logical_to_physical(FSMAddress addr)
     431             : {
     432             :     BlockNumber pages;
     433             :     int         leafno;
     434             :     int         l;
     435             : 
     436             :     /*
     437             :      * Calculate the logical page number of the first leaf page below the
     438             :      * given page.
     439             :      */
     440      945360 :     leafno = addr.logpageno;
     441     1853472 :     for (l = 0; l < addr.level; l++)
     442      908112 :         leafno *= SlotsPerFSMPage;
     443             : 
     444             :     /* Count upper level nodes required to address the leaf page */
     445      945360 :     pages = 0;
     446     3781440 :     for (l = 0; l < FSM_TREE_DEPTH; l++)
     447             :     {
     448     2836080 :         pages += leafno + 1;
     449     2836080 :         leafno /= SlotsPerFSMPage;
     450             :     }
     451             : 
     452             :     /*
     453             :      * If the page we were asked for wasn't at the bottom level, subtract the
     454             :      * additional lower level pages we counted above.
     455             :      */
     456      945360 :     pages -= addr.level;
     457             : 
     458             :     /* Turn the page count into 0-based block number */
     459      945360 :     return pages - 1;
     460             : }
     461             : 
     462             : /*
     463             :  * Return the FSM location corresponding to given heap block.
     464             :  */
     465             : static FSMAddress
     466      514020 : fsm_get_location(BlockNumber heapblk, uint16 *slot)
     467             : {
     468             :     FSMAddress  addr;
     469             : 
     470      514020 :     addr.level = FSM_BOTTOM_LEVEL;
     471      514020 :     addr.logpageno = heapblk / SlotsPerFSMPage;
     472      514020 :     *slot = heapblk % SlotsPerFSMPage;
     473             : 
     474      514020 :     return addr;
     475             : }
     476             : 
     477             : /*
     478             :  * Return the heap block number corresponding to given location in the FSM.
     479             :  */
     480             : static BlockNumber
     481       26352 : fsm_get_heap_blk(FSMAddress addr, uint16 slot)
     482             : {
     483             :     Assert(addr.level == FSM_BOTTOM_LEVEL);
     484       26352 :     return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
     485             : }
     486             : 
     487             : /*
     488             :  * Given a logical address of a child page, get the logical page number of
     489             :  * the parent, and the slot within the parent corresponding to the child.
     490             :  */
     491             : static FSMAddress
     492      102596 : fsm_get_parent(FSMAddress child, uint16 *slot)
     493             : {
     494             :     FSMAddress  parent;
     495             : 
     496             :     Assert(child.level < FSM_ROOT_LEVEL);
     497             : 
     498      102596 :     parent.level = child.level + 1;
     499      102596 :     parent.logpageno = child.logpageno / SlotsPerFSMPage;
     500      102596 :     *slot = child.logpageno % SlotsPerFSMPage;
     501             : 
     502      102596 :     return parent;
     503             : }
     504             : 
     505             : /*
     506             :  * Given a logical address of a parent page and a slot number, get the
     507             :  * logical address of the corresponding child page.
     508             :  */
     509             : static FSMAddress
     510       58834 : fsm_get_child(FSMAddress parent, uint16 slot)
     511             : {
     512             :     FSMAddress  child;
     513             : 
     514             :     Assert(parent.level > FSM_BOTTOM_LEVEL);
     515             : 
     516       58834 :     child.level = parent.level - 1;
     517       58834 :     child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
     518             : 
     519       58834 :     return child;
     520             : }
     521             : 
     522             : /*
     523             :  * Read a FSM page.
     524             :  *
     525             :  * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
     526             :  * true, the FSM file is extended.
     527             :  */
     528             : static Buffer
     529      914232 : fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
     530             : {
     531      914232 :     BlockNumber blkno = fsm_logical_to_physical(addr);
     532             :     Buffer      buf;
     533             : 
     534      914232 :     RelationOpenSmgr(rel);
     535             : 
     536             :     /*
     537             :      * If we haven't cached the size of the FSM yet, check it first.  Also
     538             :      * recheck if the requested block seems to be past end, since our cached
     539             :      * value might be stale.  (We send smgr inval messages on truncation, but
     540             :      * not on extension.)
     541             :      */
     542     1675038 :     if (rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber ||
     543      760806 :         blkno >= rel->rd_smgr->smgr_fsm_nblocks)
     544             :     {
     545      221312 :         if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
     546       96302 :             rel->rd_smgr->smgr_fsm_nblocks = smgrnblocks(rel->rd_smgr,
     547             :                                                          FSM_FORKNUM);
     548             :         else
     549      125010 :             rel->rd_smgr->smgr_fsm_nblocks = 0;
     550             :     }
     551             : 
     552             :     /* Handle requests beyond EOF */
     553      914232 :     if (blkno >= rel->rd_smgr->smgr_fsm_nblocks)
     554             :     {
     555      125506 :         if (extend)
     556       16124 :             fsm_extend(rel, blkno + 1);
     557             :         else
     558      109382 :             return InvalidBuffer;
     559             :     }
     560             : 
     561             :     /*
     562             :      * Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM
     563             :      * information is not accurate anyway, so it's better to clear corrupt
     564             :      * pages than error out. Since the FSM changes are not WAL-logged, the
     565             :      * so-called torn page problem on crash can lead to pages with corrupt
     566             :      * headers, for example.
     567             :      *
     568             :      * The initialize-the-page part is trickier than it looks, because of the
     569             :      * possibility of multiple backends doing this concurrently, and our
     570             :      * desire to not uselessly take the buffer lock in the normal path where
     571             :      * the page is OK.  We must take the lock to initialize the page, so
     572             :      * recheck page newness after we have the lock, in case someone else
     573             :      * already did it.  Also, because we initially check PageIsNew with no
     574             :      * lock, it's possible to fall through and return the buffer while someone
     575             :      * else is still initializing the page (i.e., we might see pd_upper as set
     576             :      * but other page header fields are still zeroes).  This is harmless for
     577             :      * callers that will take a buffer lock themselves, but some callers
     578             :      * inspect the page without any lock at all.  The latter is OK only so
     579             :      * long as it doesn't depend on the page header having correct contents.
     580             :      * Current usage is safe because PageGetContents() does not require that.
     581             :      */
     582      804850 :     buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
     583      804850 :     if (PageIsNew(BufferGetPage(buf)))
     584             :     {
     585           4 :         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     586           4 :         if (PageIsNew(BufferGetPage(buf)))
     587           4 :             PageInit(BufferGetPage(buf), BLCKSZ, 0);
     588           4 :         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     589             :     }
     590      804850 :     return buf;
     591             : }
     592             : 
     593             : /*
     594             :  * Ensure that the FSM fork is at least fsm_nblocks long, extending
     595             :  * it if necessary with empty pages. And by empty, I mean pages filled
     596             :  * with zeros, meaning there's no free space.
     597             :  */
     598             : static void
     599       16124 : fsm_extend(Relation rel, BlockNumber fsm_nblocks)
     600             : {
     601             :     BlockNumber fsm_nblocks_now;
     602             :     PGAlignedBlock pg;
     603             : 
     604       16124 :     PageInit((Page) pg.data, BLCKSZ, 0);
     605             : 
     606             :     /*
     607             :      * We use the relation extension lock to lock out other backends trying to
     608             :      * extend the FSM at the same time. It also locks out extension of the
     609             :      * main fork, unnecessarily, but extending the FSM happens seldom enough
     610             :      * that it doesn't seem worthwhile to have a separate lock tag type for
     611             :      * it.
     612             :      *
     613             :      * Note that another backend might have extended or created the relation
     614             :      * by the time we get the lock.
     615             :      */
     616       16124 :     LockRelationForExtension(rel, ExclusiveLock);
     617             : 
     618             :     /* Might have to re-open if a cache flush happened */
     619       16124 :     RelationOpenSmgr(rel);
     620             : 
     621             :     /*
     622             :      * Create the FSM file first if it doesn't exist.  If smgr_fsm_nblocks is
     623             :      * positive then it must exist, no need for an smgrexists call.
     624             :      */
     625       16128 :     if ((rel->rd_smgr->smgr_fsm_nblocks == 0 ||
     626       16124 :          rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber) &&
     627       16120 :         !smgrexists(rel->rd_smgr, FSM_FORKNUM))
     628       16116 :         smgrcreate(rel->rd_smgr, FSM_FORKNUM, false);
     629             : 
     630       16124 :     fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
     631             : 
     632       80600 :     while (fsm_nblocks_now < fsm_nblocks)
     633             :     {
     634       48352 :         PageSetChecksumInplace((Page) pg.data, fsm_nblocks_now);
     635             : 
     636       48352 :         smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now,
     637             :                    pg.data, false);
     638       48352 :         fsm_nblocks_now++;
     639             :     }
     640             : 
     641             :     /* Update local cache with the up-to-date size */
     642       16124 :     rel->rd_smgr->smgr_fsm_nblocks = fsm_nblocks_now;
     643             : 
     644       16124 :     UnlockRelationForExtension(rel, ExclusiveLock);
     645       16124 : }
     646             : 
     647             : /*
     648             :  * Set value in given FSM page and slot.
     649             :  *
     650             :  * If minValue > 0, the updated page is also searched for a page with at
     651             :  * least minValue of free space. If one is found, its slot number is
     652             :  * returned, -1 otherwise.
     653             :  */
     654             : static int
     655      417648 : fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
     656             :                    uint8 newValue, uint8 minValue)
     657             : {
     658             :     Buffer      buf;
     659             :     Page        page;
     660      417648 :     int         newslot = -1;
     661             : 
     662      417648 :     buf = fsm_readbuf(rel, addr, true);
     663      417648 :     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     664             : 
     665      417648 :     page = BufferGetPage(buf);
     666             : 
     667      417648 :     if (fsm_set_avail(page, slot, newValue))
     668      186566 :         MarkBufferDirtyHint(buf, false);
     669             : 
     670      417648 :     if (minValue != 0)
     671             :     {
     672             :         /* Search while we still hold the lock */
     673      237008 :         newslot = fsm_search_avail(buf, minValue,
     674      237008 :                                    addr.level == FSM_BOTTOM_LEVEL,
     675             :                                    true);
     676             :     }
     677             : 
     678      417648 :     UnlockReleaseBuffer(buf);
     679             : 
     680      417648 :     return newslot;
     681             : }
     682             : 
     683             : /*
     684             :  * Search the tree for a heap page with at least min_cat of free space
     685             :  */
     686             : static BlockNumber
     687      418994 : fsm_search(Relation rel, uint8 min_cat)
     688             : {
     689      418994 :     int         restarts = 0;
     690      418994 :     FSMAddress  addr = FSM_ROOT_ADDRESS;
     691             : 
     692             :     for (;;)
     693       26716 :     {
     694             :         int         slot;
     695             :         Buffer      buf;
     696      445710 :         uint8       max_avail = 0;
     697             : 
     698             :         /* Read the FSM page. */
     699      445710 :         buf = fsm_readbuf(rel, addr, false);
     700             : 
     701             :         /* Search within the page */
     702      445710 :         if (BufferIsValid(buf))
     703             :         {
     704      336828 :             LockBuffer(buf, BUFFER_LOCK_SHARE);
     705      336828 :             slot = fsm_search_avail(buf, min_cat,
     706      336828 :                                     (addr.level == FSM_BOTTOM_LEVEL),
     707             :                                     false);
     708      336828 :             if (slot == -1)
     709      301010 :                 max_avail = fsm_get_max_avail(BufferGetPage(buf));
     710      336828 :             UnlockReleaseBuffer(buf);
     711             :         }
     712             :         else
     713      108882 :             slot = -1;
     714             : 
     715      445710 :         if (slot != -1)
     716             :         {
     717             :             /*
     718             :              * Descend the tree, or return the found block if we're at the
     719             :              * bottom.
     720             :              */
     721       35818 :             if (addr.level == FSM_BOTTOM_LEVEL)
     722       10994 :                 return fsm_get_heap_blk(addr, slot);
     723             : 
     724       24824 :             addr = fsm_get_child(addr, slot);
     725             :         }
     726      409892 :         else if (addr.level == FSM_ROOT_LEVEL)
     727             :         {
     728             :             /*
     729             :              * At the root, failure means there's no page with enough free
     730             :              * space in the FSM. Give up.
     731             :              */
     732      408000 :             return InvalidBlockNumber;
     733             :         }
     734             :         else
     735             :         {
     736             :             uint16      parentslot;
     737             :             FSMAddress  parent;
     738             : 
     739             :             /*
     740             :              * At lower level, failure can happen if the value in the upper-
     741             :              * level node didn't reflect the value on the lower page. Update
     742             :              * the upper node, to avoid falling into the same trap again, and
     743             :              * start over.
     744             :              *
     745             :              * There's a race condition here, if another backend updates this
     746             :              * page right after we release it, and gets the lock on the parent
     747             :              * page before us. We'll then update the parent page with the now
     748             :              * stale information we had. It's OK, because it should happen
     749             :              * rarely, and will be fixed by the next vacuum.
     750             :              */
     751        1892 :             parent = fsm_get_parent(addr, &parentslot);
     752        1892 :             fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
     753             : 
     754             :             /*
     755             :              * If the upper pages are badly out of date, we might need to loop
     756             :              * quite a few times, updating them as we go. Any inconsistencies
     757             :              * should eventually be corrected and the loop should end. Looping
     758             :              * indefinitely is nevertheless scary, so provide an emergency
     759             :              * valve.
     760             :              */
     761        1892 :             if (restarts++ > 10000)
     762           0 :                 return InvalidBlockNumber;
     763             : 
     764             :             /* Start search all over from the root */
     765        1892 :             addr = FSM_ROOT_ADDRESS;
     766             :         }
     767             :     }
     768             : }
     769             : 
     770             : 
     771             : /*
     772             :  * Recursive guts of FreeSpaceMapVacuum
     773             :  *
     774             :  * Examine the FSM page indicated by addr, as well as its children, updating
     775             :  * upper-level nodes that cover the heap block range from start to end-1.
     776             :  * (It's okay if end is beyond the actual end of the map.)
     777             :  * Return the maximum freespace value on this page.
     778             :  *
     779             :  * If addr is past the end of the FSM, set *eof_p to true and return 0.
     780             :  *
     781             :  * This traverses the tree in depth-first order.  The tree is stored
     782             :  * physically in depth-first order, so this should be pretty I/O efficient.
     783             :  */
     784             : static uint8
     785       50802 : fsm_vacuum_page(Relation rel, FSMAddress addr,
     786             :                 BlockNumber start, BlockNumber end,
     787             :                 bool *eof_p)
     788             : {
     789             :     Buffer      buf;
     790             :     Page        page;
     791             :     uint8       max_avail;
     792             : 
     793             :     /* Read the page if it exists, or return EOF */
     794       50802 :     buf = fsm_readbuf(rel, addr, false);
     795       50802 :     if (!BufferIsValid(buf))
     796             :     {
     797         500 :         *eof_p = true;
     798         500 :         return 0;
     799             :     }
     800             :     else
     801       50302 :         *eof_p = false;
     802             : 
     803       50302 :     page = BufferGetPage(buf);
     804             : 
     805             :     /*
     806             :      * If we're above the bottom level, recurse into children, and fix the
     807             :      * information stored about them at this level.
     808             :      */
     809       50302 :     if (addr.level > FSM_BOTTOM_LEVEL)
     810             :     {
     811             :         FSMAddress  fsm_start,
     812             :                     fsm_end;
     813             :         uint16      fsm_start_slot,
     814             :                     fsm_end_slot;
     815             :         int         slot,
     816             :                     start_slot,
     817             :                     end_slot;
     818       33568 :         bool        eof = false;
     819             : 
     820             :         /*
     821             :          * Compute the range of slots we need to update on this page, given
     822             :          * the requested range of heap blocks to consider.  The first slot to
     823             :          * update is the one covering the "start" block, and the last slot is
     824             :          * the one covering "end - 1".  (Some of this work will be duplicated
     825             :          * in each recursive call, but it's cheap enough to not worry about.)
     826             :          */
     827       33568 :         fsm_start = fsm_get_location(start, &fsm_start_slot);
     828       33568 :         fsm_end = fsm_get_location(end - 1, &fsm_end_slot);
     829             : 
     830      117488 :         while (fsm_start.level < addr.level)
     831             :         {
     832       50352 :             fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot);
     833       50352 :             fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot);
     834             :         }
     835             :         Assert(fsm_start.level == addr.level);
     836             : 
     837       33568 :         if (fsm_start.logpageno == addr.logpageno)
     838       33568 :             start_slot = fsm_start_slot;
     839           0 :         else if (fsm_start.logpageno > addr.logpageno)
     840           0 :             start_slot = SlotsPerFSMPage;   /* shouldn't get here... */
     841             :         else
     842           0 :             start_slot = 0;
     843             : 
     844       33568 :         if (fsm_end.logpageno == addr.logpageno)
     845       33322 :             end_slot = fsm_end_slot;
     846         246 :         else if (fsm_end.logpageno > addr.logpageno)
     847         246 :             end_slot = SlotsPerFSMPage - 1;
     848             :         else
     849           0 :             end_slot = -1;      /* shouldn't get here... */
     850             : 
     851     1131578 :         for (slot = start_slot; slot <= end_slot; slot++)
     852             :         {
     853             :             int         child_avail;
     854             : 
     855     1098010 :             CHECK_FOR_INTERRUPTS();
     856             : 
     857             :             /* After we hit end-of-file, just clear the rest of the slots */
     858     1098010 :             if (!eof)
     859       34010 :                 child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
     860             :                                               start, end,
     861             :                                               &eof);
     862             :             else
     863     1064000 :                 child_avail = 0;
     864             : 
     865             :             /* Update information about the child */
     866     1098010 :             if (fsm_get_avail(page, slot) != child_avail)
     867             :             {
     868       31224 :                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     869       31224 :                 fsm_set_avail(page, slot, child_avail);
     870       31224 :                 MarkBufferDirtyHint(buf, false);
     871       31224 :                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
     872             :             }
     873             :         }
     874             :     }
     875             : 
     876             :     /* Now get the maximum value on the page, to return to caller */
     877       50302 :     max_avail = fsm_get_max_avail(page);
     878             : 
     879             :     /*
     880             :      * Reset the next slot pointer. This encourages the use of low-numbered
     881             :      * pages, increasing the chances that a later vacuum can truncate the
     882             :      * relation.  We don't bother with a lock here, nor with marking the page
     883             :      * dirty if it wasn't already, since this is just a hint.
     884             :      */
     885       50302 :     ((FSMPage) PageGetContents(page))->fp_next_slot = 0;
     886             : 
     887       50302 :     ReleaseBuffer(buf);
     888             : 
     889       50302 :     return max_avail;
     890             : }

Generated by: LCOV version 1.13