Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * freespace.c
4 : * POSTGRES free space map for quickly finding free space in relations
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/storage/freespace/freespace.c
12 : *
13 : *
14 : * NOTES:
15 : *
16 : * Free Space Map keeps track of the amount of free space on pages, and
17 : * allows quickly searching for a page with enough free space. The FSM is
18 : * stored in a dedicated relation fork of all heap relations, and those
19 : * index access methods that need it (see also indexfsm.c). See README for
20 : * more information.
21 : *
22 : *-------------------------------------------------------------------------
23 : */
24 : #include "postgres.h"
25 :
26 : #include "access/htup_details.h"
27 : #include "access/xloginsert.h"
28 : #include "access/xlogutils.h"
29 : #include "miscadmin.h"
30 : #include "storage/freespace.h"
31 : #include "storage/fsm_internals.h"
32 : #include "storage/smgr.h"
33 : #include "utils/rel.h"
34 :
35 :
36 : /*
37 : * We use just one byte to store the amount of free space on a page, so we
38 : * divide the amount of free space a page can have into 256 different
39 : * categories. The highest category, 255, represents a page with at least
40 : * MaxFSMRequestSize bytes of free space, and the second highest category
41 : * represents the range from 254 * FSM_CAT_STEP, inclusive, to
42 : * MaxFSMRequestSize, exclusive.
43 : *
44 : * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
45 : * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
46 : * categories look like this:
47 : *
48 : *
49 : * Range Category
50 : * 0 - 31 0
51 : * 32 - 63 1
52 : * ... ... ...
53 : * 8096 - 8127 253
54 : * 8128 - 8163 254
55 : * 8164 - 8192 255
56 : *
57 : * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
58 : * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
59 : * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
60 : * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
61 : * completely empty page, that would mean that we could never satisfy a
62 : * request of exactly MaxFSMRequestSize bytes.
63 : */
#define FSM_CATEGORIES	256
#define FSM_CAT_STEP	(BLCKSZ / FSM_CATEGORIES)
#define MaxFSMRequestSize	MaxHeapTupleSize

/*
 * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
 * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
 * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
 * this means that 4096 bytes is the smallest BLCKSZ that we can get away
 * with a 3-level tree, and 512 is the smallest we support.
 */
#define FSM_TREE_DEPTH	((SlotsPerFSMPage >= 1626) ? 3 : 4)

/* Levels are numbered from the bottom (leaves, 0) up to the root. */
#define FSM_ROOT_LEVEL	(FSM_TREE_DEPTH - 1)
#define FSM_BOTTOM_LEVEL 0

/*
 * The internal FSM routines work on a logical addressing scheme. Each
 * level of the tree can be thought of as a separately addressable file.
 */
typedef struct
{
	int			level;			/* tree level, FSM_BOTTOM_LEVEL .. FSM_ROOT_LEVEL */
	int			logpageno;		/* page number within the level */
} FSMAddress;

/* Address of the root page. */
static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};

/* functions to navigate the tree */
static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
static BlockNumber fsm_logical_to_physical(FSMAddress addr);

static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
static Buffer fsm_extend(Relation rel, BlockNumber fsm_nblocks);

/* functions to convert amount of free space to a FSM category */
static uint8 fsm_space_avail_to_cat(Size avail);
static uint8 fsm_space_needed_to_cat(Size needed);
static Size fsm_space_cat_to_avail(uint8 cat);

/* workhorse functions for various operations */
static int	fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
							   uint8 newValue, uint8 minValue);
static BlockNumber fsm_search(Relation rel, uint8 min_cat);
static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
							 BlockNumber start, BlockNumber end,
							 bool *eof_p);
static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber);
116 :
117 :
118 : /******** Public API ********/
119 :
120 : /*
121 : * GetPageWithFreeSpace - try to find a page in the given relation with
122 : * at least the specified amount of free space.
123 : *
124 : * If successful, return the block number; if not, return InvalidBlockNumber.
125 : *
126 : * The caller must be prepared for the possibility that the returned page
127 : * will turn out to have too little space available by the time the caller
128 : * gets a lock on it. In that case, the caller should report the actual
129 : * amount of free space available on that page and then try again (see
130 : * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
131 : * extend the relation.
132 : *
133 : * This can trigger FSM updates if any FSM entry is found to point to a block
134 : * past the end of the relation.
135 : */
136 : BlockNumber
137 110166 : GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
138 : {
139 110166 : uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
140 :
141 110166 : return fsm_search(rel, min_cat);
142 : }
143 :
144 : /*
145 : * RecordAndGetPageWithFreeSpace - update info about a page and try again.
146 : *
147 : * We provide this combo form to save some locking overhead, compared to
148 : * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
149 : * also some effort to return a page close to the old page; if there's a
150 : * page with enough free space on the same FSM page where the old one page
151 : * is located, it is preferred.
152 : */
153 : BlockNumber
154 142332 : RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
155 : Size oldSpaceAvail, Size spaceNeeded)
156 : {
157 142332 : int old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
158 142332 : int search_cat = fsm_space_needed_to_cat(spaceNeeded);
159 : FSMAddress addr;
160 : uint16 slot;
161 : int search_slot;
162 :
163 : /* Get the location of the FSM byte representing the heap block */
164 142332 : addr = fsm_get_location(oldPage, &slot);
165 :
166 142332 : search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
167 :
168 : /*
169 : * If fsm_set_and_search found a suitable new block, return that.
170 : * Otherwise, search as usual.
171 : */
172 142332 : if (search_slot != -1)
173 : {
174 15505 : BlockNumber blknum = fsm_get_heap_blk(addr, search_slot);
175 :
176 : /*
177 : * Check that the blknum is actually in the relation. Don't try to
178 : * update the FSM in that case, just fall back to the other case
179 : */
180 15505 : if (fsm_does_block_exist(rel, blknum))
181 15505 : return blknum;
182 : }
183 126827 : return fsm_search(rel, search_cat);
184 : }
185 :
186 : /*
187 : * RecordPageWithFreeSpace - update info about a page.
188 : *
189 : * Note that if the new spaceAvail value is higher than the old value stored
190 : * in the FSM, the space might not become visible to searchers until the next
191 : * FreeSpaceMapVacuum call, which updates the upper level pages.
192 : */
193 : void
194 468706 : RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
195 : {
196 468706 : int new_cat = fsm_space_avail_to_cat(spaceAvail);
197 : FSMAddress addr;
198 : uint16 slot;
199 :
200 : /* Get the location of the FSM byte representing the heap block */
201 468706 : addr = fsm_get_location(heapBlk, &slot);
202 :
203 468706 : fsm_set_and_search(rel, addr, slot, new_cat, 0);
204 468706 : }
205 :
206 : /*
207 : * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
208 : * WAL replay
209 : */
void
XLogRecordPageWithFreeSpace(RelFileLocator rlocator, BlockNumber heapBlk,
							Size spaceAvail)
{
	int			new_cat = fsm_space_avail_to_cat(spaceAvail);
	FSMAddress	addr;
	uint16		slot;
	BlockNumber blkno;
	Buffer		buf;
	Page		page;

	/* Get the location of the FSM byte representing the heap block */
	addr = fsm_get_location(heapBlk, &slot);
	blkno = fsm_logical_to_physical(addr);

	/* If the page doesn't exist already, extend */
	buf = XLogReadBufferExtended(rlocator, FSM_FORKNUM, blkno,
								 RBM_ZERO_ON_ERROR, InvalidBuffer);
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	page = BufferGetPage(buf);
	/* A freshly-extended or zeroed-on-error page needs initialization */
	if (PageIsNew(page))
		PageInit(page, BLCKSZ, 0);

	/*
	 * Changes to FSM are usually marked as changed using MarkBufferDirtyHint;
	 * however, during recovery, it does nothing if checksums are enabled. It
	 * is assumed that the page should not be dirtied during recovery while
	 * modifying hints to prevent torn pages, since no new WAL data can be
	 * generated at this point to store FPI. This is not relevant to the FSM
	 * case, as its blocks are zeroed when a checksum mismatch occurs. So, we
	 * need to use regular MarkBufferDirty here to mark the FSM block as
	 * modified during recovery, otherwise changes to the FSM may be lost.
	 */
	if (fsm_set_avail(page, slot, new_cat))
		MarkBufferDirty(buf);
	UnlockReleaseBuffer(buf);
}
248 :
249 : /*
250 : * GetRecordedFreeSpace - return the amount of free space on a particular page,
251 : * according to the FSM.
252 : */
253 : Size
254 899 : GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
255 : {
256 : FSMAddress addr;
257 : uint16 slot;
258 : Buffer buf;
259 : uint8 cat;
260 :
261 : /* Get the location of the FSM byte representing the heap block */
262 899 : addr = fsm_get_location(heapBlk, &slot);
263 :
264 899 : buf = fsm_readbuf(rel, addr, false);
265 899 : if (!BufferIsValid(buf))
266 36 : return 0;
267 863 : cat = fsm_get_avail(BufferGetPage(buf), slot);
268 863 : ReleaseBuffer(buf);
269 :
270 863 : return fsm_space_cat_to_avail(cat);
271 : }
272 :
273 : /*
274 : * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation.
275 : *
276 : * nblocks is the new size of the heap.
277 : *
278 : * Return the number of blocks of new FSM.
279 : * If it's InvalidBlockNumber, there is nothing to truncate;
280 : * otherwise the caller is responsible for calling smgrtruncate()
281 : * to truncate the FSM pages, and FreeSpaceMapVacuumRange()
282 : * to update upper-level pages in the FSM.
283 : */
BlockNumber
FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
{
	BlockNumber new_nfsmblocks;
	FSMAddress	first_removed_address;
	uint16		first_removed_slot;
	Buffer		buf;

	/*
	 * If no FSM has been created yet for this relation, there's nothing to
	 * truncate.
	 */
	if (!smgrexists(RelationGetSmgr(rel), FSM_FORKNUM))
		return InvalidBlockNumber;

	/* Get the location in the FSM of the first removed heap block */
	first_removed_address = fsm_get_location(nblocks, &first_removed_slot);

	/*
	 * Zero out the tail of the last remaining FSM page. If the slot
	 * representing the first removed heap block is at a page boundary, as the
	 * first slot on the FSM page that first_removed_address points to, we can
	 * just truncate that page altogether.
	 */
	if (first_removed_slot > 0)
	{
		buf = fsm_readbuf(rel, first_removed_address, false);
		if (!BufferIsValid(buf))
			return InvalidBlockNumber;	/* nothing to do; the FSM was already
										 * smaller */
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

		/* NO EREPORT(ERROR) from here till changes are logged */
		START_CRIT_SECTION();

		/* Clear all slots from first_removed_slot to the end of the page */
		fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);

		/*
		 * This change is non-critical, because fsm_does_block_exist() would
		 * stop us from returning a truncated-away block. However, since this
		 * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost
		 * of that many fsm_does_block_exist() rejections. Use a full
		 * MarkBufferDirty(), not MarkBufferDirtyHint().
		 */
		MarkBufferDirty(buf);

		/*
		 * WAL-log like MarkBufferDirtyHint() might have done, just to avoid
		 * differing from the rest of the file in this respect. This is
		 * optional; see README mention of full page images. XXX consider
		 * XLogSaveBufferForHint() for even closer similarity.
		 *
		 * A higher-level operation calls us at WAL replay. If we crash
		 * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has
		 * not changed, and our fork remains valid. If we crash after that
		 * flush, redo will return here.
		 */
		if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
			log_newpage_buffer(buf, false);

		END_CRIT_SECTION();

		UnlockReleaseBuffer(buf);

		/* The partially-cleared page itself is kept, hence the "+ 1". */
		new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
	}
	else
	{
		new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
		if (smgrnblocks(RelationGetSmgr(rel), FSM_FORKNUM) <= new_nfsmblocks)
			return InvalidBlockNumber;	/* nothing to do; the FSM was already
										 * smaller */
	}

	return new_nfsmblocks;
}
360 :
361 : /*
362 : * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM
363 : *
364 : * We assume that the bottom-level pages have already been updated with
365 : * new free-space information.
366 : */
367 : void
368 183 : FreeSpaceMapVacuum(Relation rel)
369 : {
370 : bool dummy;
371 :
372 : /* Recursively scan the tree, starting at the root */
373 183 : (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS,
374 : (BlockNumber) 0, InvalidBlockNumber,
375 : &dummy);
376 183 : }
377 :
378 : /*
379 : * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM
380 : *
381 : * As above, but assume that only heap pages between start and end-1 inclusive
382 : * have new free-space information, so update only the upper-level slots
383 : * covering that block range. end == InvalidBlockNumber is equivalent to
384 : * "all the rest of the relation".
385 : */
386 : void
387 48246 : FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
388 : {
389 : bool dummy;
390 :
391 : /* Recursively scan the tree, starting at the root */
392 48246 : if (end > start)
393 48103 : (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
394 48246 : }
395 :
396 : /******** Internal routines ********/
397 :
398 : /*
399 : * Return category corresponding x bytes of free space
400 : */
401 : static uint8
402 920590 : fsm_space_avail_to_cat(Size avail)
403 : {
404 : int cat;
405 :
406 : Assert(avail < BLCKSZ);
407 :
408 920590 : if (avail >= MaxFSMRequestSize)
409 22328 : return 255;
410 :
411 898262 : cat = avail / FSM_CAT_STEP;
412 :
413 : /*
414 : * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
415 : * more.
416 : */
417 898262 : if (cat > 254)
418 0 : cat = 254;
419 :
420 898262 : return (uint8) cat;
421 : }
422 :
423 : /*
424 : * Return the lower bound of the range of free space represented by given
425 : * category.
426 : */
427 : static Size
428 863 : fsm_space_cat_to_avail(uint8 cat)
429 : {
430 : /* The highest category represents exactly MaxFSMRequestSize bytes. */
431 863 : if (cat == 255)
432 234 : return MaxFSMRequestSize;
433 : else
434 629 : return cat * FSM_CAT_STEP;
435 : }
436 :
437 : /*
438 : * Which category does a page need to have, to accommodate x bytes of data?
439 : * While fsm_space_avail_to_cat() rounds down, this needs to round up.
440 : */
441 : static uint8
442 252498 : fsm_space_needed_to_cat(Size needed)
443 : {
444 : int cat;
445 :
446 : /* Can't ask for more space than the highest category represents */
447 252498 : if (needed > MaxFSMRequestSize)
448 0 : elog(ERROR, "invalid FSM request size %zu", needed);
449 :
450 252498 : if (needed == 0)
451 0 : return 1;
452 :
453 252498 : cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
454 :
455 252498 : if (cat > 255)
456 0 : cat = 255;
457 :
458 252498 : return (uint8) cat;
459 : }
460 :
461 : /*
462 : * Returns the physical block number of a FSM page
463 : */
464 : static BlockNumber
465 1339245 : fsm_logical_to_physical(FSMAddress addr)
466 : {
467 : BlockNumber pages;
468 : int leafno;
469 : int l;
470 :
471 : /*
472 : * Calculate the logical page number of the first leaf page below the
473 : * given page.
474 : */
475 1339245 : leafno = addr.logpageno;
476 1982906 : for (l = 0; l < addr.level; l++)
477 643661 : leafno *= SlotsPerFSMPage;
478 :
479 : /* Count upper level nodes required to address the leaf page */
480 1339245 : pages = 0;
481 5356980 : for (l = 0; l < FSM_TREE_DEPTH; l++)
482 : {
483 4017735 : pages += leafno + 1;
484 4017735 : leafno /= SlotsPerFSMPage;
485 : }
486 :
487 : /*
488 : * If the page we were asked for wasn't at the bottom level, subtract the
489 : * additional lower level pages we counted above.
490 : */
491 1339245 : pages -= addr.level;
492 :
493 : /* Turn the page count into 0-based block number */
494 1339245 : return pages - 1;
495 : }
496 :
497 : /*
498 : * Return the FSM location corresponding to given heap block.
499 : */
500 : static FSMAddress
501 1114763 : fsm_get_location(BlockNumber heapblk, uint16 *slot)
502 : {
503 : FSMAddress addr;
504 :
505 1114763 : addr.level = FSM_BOTTOM_LEVEL;
506 1114763 : addr.logpageno = heapblk / SlotsPerFSMPage;
507 1114763 : *slot = heapblk % SlotsPerFSMPage;
508 :
509 1114763 : return addr;
510 : }
511 :
512 : /*
513 : * Return the heap block number corresponding to given location in the FSM.
514 : */
515 : static BlockNumber
516 28552 : fsm_get_heap_blk(FSMAddress addr, uint16 slot)
517 : {
518 : Assert(addr.level == FSM_BOTTOM_LEVEL);
519 28552 : return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
520 : }
521 :
522 : /*
523 : * Given a logical address of a child page, get the logical page number of
524 : * the parent, and the slot within the parent corresponding to the child.
525 : */
526 : static FSMAddress
527 292120 : fsm_get_parent(FSMAddress child, uint16 *slot)
528 : {
529 : FSMAddress parent;
530 :
531 : Assert(child.level < FSM_ROOT_LEVEL);
532 :
533 292120 : parent.level = child.level + 1;
534 292120 : parent.logpageno = child.logpageno / SlotsPerFSMPage;
535 292120 : *slot = child.logpageno % SlotsPerFSMPage;
536 :
537 292120 : return parent;
538 : }
539 :
540 : /*
541 : * Given a logical address of a parent page and a slot number, get the
542 : * logical address of the corresponding child page.
543 : */
544 : static FSMAddress
545 127076 : fsm_get_child(FSMAddress parent, uint16 slot)
546 : {
547 : FSMAddress child;
548 :
549 : Assert(parent.level > FSM_BOTTOM_LEVEL);
550 :
551 127076 : child.level = parent.level - 1;
552 127076 : child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
553 :
554 127076 : return child;
555 : }
556 :
557 : /*
558 : * Read a FSM page.
559 : *
560 : * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
561 : * true, the FSM file is extended.
562 : */
static Buffer
fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
{
	BlockNumber blkno = fsm_logical_to_physical(addr);
	Buffer		buf;
	SMgrRelation reln = RelationGetSmgr(rel);

	/*
	 * If we haven't cached the size of the FSM yet, check it first. Also
	 * recheck if the requested block seems to be past end, since our cached
	 * value might be stale. (We send smgr inval messages on truncation, but
	 * not on extension.)
	 */
	if (reln->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber ||
		blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
	{
		/* Invalidate the cache so smgrnblocks asks the kernel. */
		reln->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
		if (smgrexists(reln, FSM_FORKNUM))
			smgrnblocks(reln, FSM_FORKNUM);
		else
			reln->smgr_cached_nblocks[FSM_FORKNUM] = 0;
	}

	/*
	 * For reading we use ZERO_ON_ERROR mode, and initialize the page if
	 * necessary. The FSM information is not accurate anyway, so it's better
	 * to clear corrupt pages than error out. Since the FSM changes are not
	 * WAL-logged, the so-called torn page problem on crash can lead to pages
	 * with corrupt headers, for example.
	 *
	 * We use the same path below to initialize pages when extending the
	 * relation, as a concurrent extension can end up with fsm_extend()
	 * returning an already-initialized page.
	 */
	if (blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
	{
		if (extend)
			buf = fsm_extend(rel, blkno + 1);
		else
			return InvalidBuffer;
	}
	else
		buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);

	/*
	 * Initializing the page when needed is trickier than it looks, because of
	 * the possibility of multiple backends doing this concurrently, and our
	 * desire to not uselessly take the buffer lock in the normal path where
	 * the page is OK. We must take the lock to initialize the page, so
	 * recheck page newness after we have the lock, in case someone else
	 * already did it. Also, because we initially check PageIsNew with no
	 * lock, it's possible to fall through and return the buffer while someone
	 * else is still initializing the page (i.e., we might see pd_upper as set
	 * but other page header fields are still zeroes). This is harmless for
	 * callers that will take a buffer lock themselves, but some callers
	 * inspect the page without any lock at all. The latter is OK only so
	 * long as it doesn't depend on the page header having correct contents.
	 * Current usage is safe because PageGetContents() does not require that.
	 */
	if (PageIsNew(BufferGetPage(buf)))
	{
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		if (PageIsNew(BufferGetPage(buf)))
			PageInit(BufferGetPage(buf), BLCKSZ, 0);
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}
	return buf;
}
632 :
633 : /*
634 : * Ensure that the FSM fork is at least fsm_nblocks long, extending
635 : * it if necessary with empty pages. And by empty, I mean pages filled
636 : * with zeros, meaning there's no free space.
637 : */
638 : static Buffer
639 4862 : fsm_extend(Relation rel, BlockNumber fsm_nblocks)
640 : {
641 4862 : return ExtendBufferedRelTo(BMR_REL(rel), FSM_FORKNUM, NULL,
642 : EB_CREATE_FORK_IF_NEEDED |
643 : EB_CLEAR_SIZE_CACHE,
644 : fsm_nblocks,
645 : RBM_ZERO_ON_ERROR);
646 : }
647 :
648 : /*
649 : * Set value in given FSM page and slot.
650 : *
651 : * If minValue > 0, the updated page is also searched for a page with at
652 : * least minValue of free space. If one is found, its slot number is
653 : * returned, -1 otherwise.
654 : */
655 : static int
656 613574 : fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
657 : uint8 newValue, uint8 minValue)
658 : {
659 : Buffer buf;
660 : Page page;
661 613574 : int newslot = -1;
662 :
663 613574 : buf = fsm_readbuf(rel, addr, true);
664 613574 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
665 :
666 613574 : page = BufferGetPage(buf);
667 :
668 613574 : if (fsm_set_avail(page, slot, newValue))
669 127012 : MarkBufferDirtyHint(buf, false);
670 :
671 613574 : if (minValue != 0)
672 : {
673 : /* Search while we still hold the lock */
674 142332 : newslot = fsm_search_avail(buf, minValue,
675 142332 : addr.level == FSM_BOTTOM_LEVEL,
676 : true);
677 : }
678 :
679 613574 : UnlockReleaseBuffer(buf);
680 :
681 613574 : return newslot;
682 : }
683 :
684 : /*
685 : * Search the tree for a heap page with at least min_cat of free space
686 : */
687 : static BlockNumber
688 236993 : fsm_search(Relation rel, uint8 min_cat)
689 : {
690 236993 : int restarts = 0;
691 236993 : FSMAddress addr = FSM_ROOT_ADDRESS;
692 :
693 : for (;;)
694 32433 : {
695 : int slot;
696 : Buffer buf;
697 269426 : uint8 max_avail = 0;
698 :
699 : /* Read the FSM page. */
700 269426 : buf = fsm_readbuf(rel, addr, false);
701 :
702 : /* Search within the page */
703 269426 : if (BufferIsValid(buf))
704 : {
705 197368 : LockBuffer(buf, BUFFER_LOCK_SHARE);
706 197368 : slot = fsm_search_avail(buf, min_cat,
707 197368 : (addr.level == FSM_BOTTOM_LEVEL),
708 : false);
709 197368 : if (slot == -1)
710 : {
711 154424 : max_avail = fsm_get_max_avail(BufferGetPage(buf));
712 154424 : UnlockReleaseBuffer(buf);
713 : }
714 : else
715 : {
716 : /* Keep the pin for possible update below */
717 42944 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
718 : }
719 : }
720 : else
721 72058 : slot = -1;
722 :
723 269426 : if (slot != -1)
724 : {
725 : /*
726 : * Descend the tree, or return the found block if we're at the
727 : * bottom.
728 : */
729 42944 : if (addr.level == FSM_BOTTOM_LEVEL)
730 : {
731 13047 : BlockNumber blkno = fsm_get_heap_blk(addr, slot);
732 : Page page;
733 :
734 13047 : if (fsm_does_block_exist(rel, blkno))
735 : {
736 13047 : ReleaseBuffer(buf);
737 13047 : return blkno;
738 : }
739 :
740 : /*
741 : * Block is past the end of the relation. Update FSM, and
742 : * restart from root. The usual "advancenext" behavior is
743 : * pessimal for this rare scenario, since every later slot is
744 : * unusable in the same way. We could zero all affected slots
745 : * on the same FSM page, but don't bet on the benefits of that
746 : * optimization justifying its compiled code bulk.
747 : */
748 0 : page = BufferGetPage(buf);
749 0 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
750 0 : fsm_set_avail(page, slot, 0);
751 0 : MarkBufferDirtyHint(buf, false);
752 0 : UnlockReleaseBuffer(buf);
753 0 : if (restarts++ > 10000) /* same rationale as below */
754 0 : return InvalidBlockNumber;
755 0 : addr = FSM_ROOT_ADDRESS;
756 : }
757 : else
758 : {
759 29897 : ReleaseBuffer(buf);
760 : }
761 29897 : addr = fsm_get_child(addr, slot);
762 : }
763 226482 : else if (addr.level == FSM_ROOT_LEVEL)
764 : {
765 : /*
766 : * At the root, failure means there's no page with enough free
767 : * space in the FSM. Give up.
768 : */
769 223946 : return InvalidBlockNumber;
770 : }
771 : else
772 : {
773 : uint16 parentslot;
774 : FSMAddress parent;
775 :
776 : /*
777 : * At lower level, failure can happen if the value in the upper-
778 : * level node didn't reflect the value on the lower page. Update
779 : * the upper node, to avoid falling into the same trap again, and
780 : * start over.
781 : *
782 : * There's a race condition here, if another backend updates this
783 : * page right after we release it, and gets the lock on the parent
784 : * page before us. We'll then update the parent page with the now
785 : * stale information we had. It's OK, because it should happen
786 : * rarely, and will be fixed by the next vacuum.
787 : */
788 2536 : parent = fsm_get_parent(addr, &parentslot);
789 2536 : fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
790 :
791 : /*
792 : * If the upper pages are badly out of date, we might need to loop
793 : * quite a few times, updating them as we go. Any inconsistencies
794 : * should eventually be corrected and the loop should end. Looping
795 : * indefinitely is nevertheless scary, so provide an emergency
796 : * valve.
797 : */
798 2536 : if (restarts++ > 10000)
799 0 : return InvalidBlockNumber;
800 :
801 : /* Start search all over from the root */
802 2536 : addr = FSM_ROOT_ADDRESS;
803 : }
804 : }
805 : }
806 :
807 :
808 : /*
809 : * Recursive guts of FreeSpaceMapVacuum
810 : *
811 : * Examine the FSM page indicated by addr, as well as its children, updating
812 : * upper-level nodes that cover the heap block range from start to end-1.
813 : * (It's okay if end is beyond the actual end of the map.)
814 : * Return the maximum freespace value on this page.
815 : *
816 : * If addr is past the end of the FSM, set *eof_p to true and return 0.
817 : *
818 : * This traverses the tree in depth-first order. The tree is stored
819 : * physically in depth-first order, so this should be pretty I/O efficient.
820 : */
static uint8
fsm_vacuum_page(Relation rel, FSMAddress addr,
				BlockNumber start, BlockNumber end,
				bool *eof_p)
{
	Buffer		buf;
	Page		page;
	uint8		max_avail;

	/* Read the page if it exists, or return EOF */
	buf = fsm_readbuf(rel, addr, false);
	if (!BufferIsValid(buf))
	{
		*eof_p = true;
		return 0;
	}
	else
		*eof_p = false;

	page = BufferGetPage(buf);

	/*
	 * If we're above the bottom level, recurse into children, and fix the
	 * information stored about them at this level.
	 */
	if (addr.level > FSM_BOTTOM_LEVEL)
	{
		FSMAddress	fsm_start,
					fsm_end;
		uint16		fsm_start_slot,
					fsm_end_slot;
		int			slot,
					start_slot,
					end_slot;
		bool		eof = false;

		/*
		 * Compute the range of slots we need to update on this page, given
		 * the requested range of heap blocks to consider. The first slot to
		 * update is the one covering the "start" block, and the last slot is
		 * the one covering "end - 1". (Some of this work will be duplicated
		 * in each recursive call, but it's cheap enough to not worry about.)
		 */
		fsm_start = fsm_get_location(start, &fsm_start_slot);
		fsm_end = fsm_get_location(end - 1, &fsm_end_slot);

		/* Walk both endpoint addresses up from the leaves to this level. */
		while (fsm_start.level < addr.level)
		{
			fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot);
			fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot);
		}
		Assert(fsm_start.level == addr.level);

		/* Clamp the slot range to the part that falls on this page. */
		if (fsm_start.logpageno == addr.logpageno)
			start_slot = fsm_start_slot;
		else if (fsm_start.logpageno > addr.logpageno)
			start_slot = SlotsPerFSMPage;	/* shouldn't get here... */
		else
			start_slot = 0;

		if (fsm_end.logpageno == addr.logpageno)
			end_slot = fsm_end_slot;
		else if (fsm_end.logpageno > addr.logpageno)
			end_slot = SlotsPerFSMPage - 1;
		else
			end_slot = -1;		/* shouldn't get here... */

		for (slot = start_slot; slot <= end_slot; slot++)
		{
			int			child_avail;

			CHECK_FOR_INTERRUPTS();

			/* After we hit end-of-file, just clear the rest of the slots */
			if (!eof)
				child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
											  start, end,
											  &eof);
			else
				child_avail = 0;

			/* Update information about the child */
			if (fsm_get_avail(page, slot) != child_avail)
			{
				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				fsm_set_avail(page, slot, child_avail);
				MarkBufferDirtyHint(buf, false);
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			}
		}
	}

	/* Now get the maximum value on the page, to return to caller */
	max_avail = fsm_get_max_avail(page);

	/*
	 * Try to reset the next slot pointer. This encourages the use of
	 * low-numbered pages, increasing the chances that a later vacuum can
	 * truncate the relation. We don't bother with marking the page dirty if
	 * it wasn't already, since this is just a hint.
	 */
	LockBuffer(buf, BUFFER_LOCK_SHARE);
	if (BufferBeginSetHintBits(buf))
	{
		((FSMPage) PageGetContents(page))->fp_next_slot = 0;
		BufferFinishSetHintBits(buf, false, false);
	}

	/* Releases both the share lock taken above and our pin. */
	UnlockReleaseBuffer(buf);

	return max_avail;
}
933 :
934 :
935 : /*
936 : * Check whether a block number is past the end of the relation. This can
937 : * happen after WAL replay, if the FSM reached disk but newly-extended pages
938 : * it refers to did not.
939 : */
940 : static bool
941 28552 : fsm_does_block_exist(Relation rel, BlockNumber blknumber)
942 : {
943 28552 : SMgrRelation smgr = RelationGetSmgr(rel);
944 :
945 : /*
946 : * If below the cached nblocks, the block surely exists. Otherwise, we
947 : * face a trade-off. We opt to compare to a fresh nblocks, incurring
948 : * lseek() overhead. The alternative would be to assume the block does
949 : * not exist, but that would cause FSM to set zero space available for
950 : * blocks that main fork extension just recorded.
951 : */
952 28552 : return ((BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) &&
953 40140 : blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) ||
954 11588 : blknumber < RelationGetNumberOfBlocks(rel));
955 : }
|