Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * freespace.c
4 : * POSTGRES free space map for quickly finding free space in relations
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/storage/freespace/freespace.c
12 : *
13 : *
14 : * NOTES:
15 : *
16 : * Free Space Map keeps track of the amount of free space on pages, and
17 : * allows quickly searching for a page with enough free space. The FSM is
18 : * stored in a dedicated relation fork of all heap relations, and those
19 : * index access methods that need it (see also indexfsm.c). See README for
20 : * more information.
21 : *
22 : *-------------------------------------------------------------------------
23 : */
24 : #include "postgres.h"
25 :
26 : #include "access/htup_details.h"
27 : #include "access/xloginsert.h"
28 : #include "access/xlogutils.h"
29 : #include "miscadmin.h"
30 : #include "storage/freespace.h"
31 : #include "storage/fsm_internals.h"
32 : #include "storage/smgr.h"
33 : #include "utils/rel.h"
34 :
35 :
36 : /*
37 : * We use just one byte to store the amount of free space on a page, so we
38 : * divide the amount of free space a page can have into 256 different
39 : * categories. The highest category, 255, represents a page with at least
40 : * MaxFSMRequestSize bytes of free space, and the second highest category
41 : * represents the range from 254 * FSM_CAT_STEP, inclusive, to
42 : * MaxFSMRequestSize, exclusive.
43 : *
44 : * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
45 : * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
46 : * categories look like this:
47 : *
48 : *
49 : * Range Category
50 : * 0 - 31 0
51 : * 32 - 63 1
52 : * ... ... ...
53 : * 8096 - 8127 253
54 : * 8128 - 8163 254
55 : * 8164 - 8192 255
56 : *
57 : * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
58 : * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
59 : * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
60 : * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
61 : * completely empty page, that would mean that we could never satisfy a
62 : * request of exactly MaxFSMRequestSize bytes.
63 : */
64 : #define FSM_CATEGORIES 256
65 : #define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES)
66 : #define MaxFSMRequestSize MaxHeapTupleSize
67 :
68 : /*
69 : * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
70 : * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
71 : * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
72 : * this means that 4096 bytes is the smallest BLCKSZ that we can get away
73 : * with a 3-level tree, and 512 is the smallest we support.
74 : */
75 : #define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
76 :
77 : #define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1)
78 : #define FSM_BOTTOM_LEVEL 0
79 :
80 : /*
81 : * The internal FSM routines work on a logical addressing scheme. Each
82 : * level of the tree can be thought of as a separately addressable file.
83 : */
84 : typedef struct
85 : {
86 : int level; /* level */
87 : int logpageno; /* page number within the level */
88 : } FSMAddress;
89 :
90 : /* Address of the root page. */
91 : static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
92 :
93 : /* functions to navigate the tree */
94 : static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
95 : static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
96 : static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
97 : static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
98 : static BlockNumber fsm_logical_to_physical(FSMAddress addr);
99 :
100 : static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
101 : static Buffer fsm_extend(Relation rel, BlockNumber fsm_nblocks);
102 :
103 : /* functions to convert amount of free space to a FSM category */
104 : static uint8 fsm_space_avail_to_cat(Size avail);
105 : static uint8 fsm_space_needed_to_cat(Size needed);
106 : static Size fsm_space_cat_to_avail(uint8 cat);
107 :
108 : /* workhorse functions for various operations */
109 : static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
110 : uint8 newValue, uint8 minValue);
111 : static BlockNumber fsm_search(Relation rel, uint8 min_cat);
112 : static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
113 : BlockNumber start, BlockNumber end,
114 : bool *eof_p);
115 : static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber);
116 :
117 :
118 : /******** Public API ********/
119 :
120 : /*
121 : * GetPageWithFreeSpace - try to find a page in the given relation with
122 : * at least the specified amount of free space.
123 : *
124 : * If successful, return the block number; if not, return InvalidBlockNumber.
125 : *
126 : * The caller must be prepared for the possibility that the returned page
127 : * will turn out to have too little space available by the time the caller
128 : * gets a lock on it. In that case, the caller should report the actual
129 : * amount of free space available on that page and then try again (see
130 : * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
131 : * extend the relation.
132 : *
133 : * This can trigger FSM updates if any FSM entry is found to point to a block
134 : * past the end of the relation.
135 : */
136 : BlockNumber
137 165250 : GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
138 : {
139 165250 : uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
140 :
141 165250 : return fsm_search(rel, min_cat);
142 : }
143 :
144 : /*
145 : * RecordAndGetPageWithFreeSpace - update info about a page and try again.
146 : *
147 : * We provide this combo form to save some locking overhead, compared to
148 : * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
149 : * also some effort to return a page close to the old page; if there's a
150 : * page with enough free space on the same FSM page where the old one page
151 : * is located, it is preferred.
152 : */
153 : BlockNumber
154 193460 : RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
155 : Size oldSpaceAvail, Size spaceNeeded)
156 : {
157 193460 : int old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
158 193460 : int search_cat = fsm_space_needed_to_cat(spaceNeeded);
159 : FSMAddress addr;
160 : uint16 slot;
161 : int search_slot;
162 :
163 : /* Get the location of the FSM byte representing the heap block */
164 193460 : addr = fsm_get_location(oldPage, &slot);
165 :
166 193460 : search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
167 :
168 : /*
169 : * If fsm_set_and_search found a suitable new block, return that.
170 : * Otherwise, search as usual.
171 : */
172 193460 : if (search_slot != -1)
173 : {
174 27332 : BlockNumber blknum = fsm_get_heap_blk(addr, search_slot);
175 :
176 : /*
177 : * Check that the blknum is actually in the relation. Don't try to
178 : * update the FSM in that case, just fall back to the other case
179 : */
180 27332 : if (fsm_does_block_exist(rel, blknum))
181 27332 : return blknum;
182 : }
183 166128 : return fsm_search(rel, search_cat);
184 : }
185 :
186 : /*
187 : * RecordPageWithFreeSpace - update info about a page.
188 : *
189 : * Note that if the new spaceAvail value is higher than the old value stored
190 : * in the FSM, the space might not become visible to searchers until the next
191 : * FreeSpaceMapVacuum call, which updates the upper level pages.
192 : */
193 : void
194 772072 : RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
195 : {
196 772072 : int new_cat = fsm_space_avail_to_cat(spaceAvail);
197 : FSMAddress addr;
198 : uint16 slot;
199 :
200 : /* Get the location of the FSM byte representing the heap block */
201 772072 : addr = fsm_get_location(heapBlk, &slot);
202 :
203 772072 : fsm_set_and_search(rel, addr, slot, new_cat, 0);
204 772072 : }
205 :
206 : /*
207 : * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
208 : * WAL replay
209 : */
210 : void
211 571192 : XLogRecordPageWithFreeSpace(RelFileLocator rlocator, BlockNumber heapBlk,
212 : Size spaceAvail)
213 : {
214 571192 : int new_cat = fsm_space_avail_to_cat(spaceAvail);
215 : FSMAddress addr;
216 : uint16 slot;
217 : BlockNumber blkno;
218 : Buffer buf;
219 : Page page;
220 :
221 : /* Get the location of the FSM byte representing the heap block */
222 571192 : addr = fsm_get_location(heapBlk, &slot);
223 571192 : blkno = fsm_logical_to_physical(addr);
224 :
225 : /* If the page doesn't exist already, extend */
226 571192 : buf = XLogReadBufferExtended(rlocator, FSM_FORKNUM, blkno,
227 : RBM_ZERO_ON_ERROR, InvalidBuffer);
228 571192 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
229 :
230 571192 : page = BufferGetPage(buf);
231 571192 : if (PageIsNew(page))
232 928 : PageInit(page, BLCKSZ, 0);
233 :
234 571192 : if (fsm_set_avail(page, slot, new_cat))
235 561556 : MarkBufferDirtyHint(buf, false);
236 571192 : UnlockReleaseBuffer(buf);
237 571192 : }
238 :
239 : /*
240 : * GetRecordedFreeSpace - return the amount of free space on a particular page,
241 : * according to the FSM.
242 : */
243 : Size
244 1996 : GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
245 : {
246 : FSMAddress addr;
247 : uint16 slot;
248 : Buffer buf;
249 : uint8 cat;
250 :
251 : /* Get the location of the FSM byte representing the heap block */
252 1996 : addr = fsm_get_location(heapBlk, &slot);
253 :
254 1996 : buf = fsm_readbuf(rel, addr, false);
255 1996 : if (!BufferIsValid(buf))
256 72 : return 0;
257 1924 : cat = fsm_get_avail(BufferGetPage(buf), slot);
258 1924 : ReleaseBuffer(buf);
259 :
260 1924 : return fsm_space_cat_to_avail(cat);
261 : }
262 :
263 : /*
264 : * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation.
265 : *
266 : * nblocks is the new size of the heap.
267 : *
268 : * Return the number of blocks of new FSM.
269 : * If it's InvalidBlockNumber, there is nothing to truncate;
270 : * otherwise the caller is responsible for calling smgrtruncate()
271 : * to truncate the FSM pages, and FreeSpaceMapVacuumRange()
272 : * to update upper-level pages in the FSM.
273 : */
274 : BlockNumber
275 366 : FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
276 : {
277 : BlockNumber new_nfsmblocks;
278 : FSMAddress first_removed_address;
279 : uint16 first_removed_slot;
280 : Buffer buf;
281 :
282 : /*
283 : * If no FSM has been created yet for this relation, there's nothing to
284 : * truncate.
285 : */
286 366 : if (!smgrexists(RelationGetSmgr(rel), FSM_FORKNUM))
287 0 : return InvalidBlockNumber;
288 :
289 : /* Get the location in the FSM of the first removed heap block */
290 366 : first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
291 :
292 : /*
293 : * Zero out the tail of the last remaining FSM page. If the slot
294 : * representing the first removed heap block is at a page boundary, as the
295 : * first slot on the FSM page that first_removed_address points to, we can
296 : * just truncate that page altogether.
297 : */
298 366 : if (first_removed_slot > 0)
299 : {
300 210 : buf = fsm_readbuf(rel, first_removed_address, false);
301 210 : if (!BufferIsValid(buf))
302 0 : return InvalidBlockNumber; /* nothing to do; the FSM was already
303 : * smaller */
304 210 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
305 :
306 : /* NO EREPORT(ERROR) from here till changes are logged */
307 210 : START_CRIT_SECTION();
308 :
309 210 : fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
310 :
311 : /*
312 : * This change is non-critical, because fsm_does_block_exist() would
313 : * stop us from returning a truncated-away block. However, since this
314 : * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost
315 : * of that many fsm_does_block_exist() rejections. Use a full
316 : * MarkBufferDirty(), not MarkBufferDirtyHint().
317 : */
318 210 : MarkBufferDirty(buf);
319 :
320 : /*
321 : * WAL-log like MarkBufferDirtyHint() might have done, just to avoid
322 : * differing from the rest of the file in this respect. This is
323 : * optional; see README mention of full page images. XXX consider
324 : * XLogSaveBufferForHint() for even closer similarity.
325 : *
326 : * A higher-level operation calls us at WAL replay. If we crash
327 : * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has
328 : * not changed, and our fork remains valid. If we crash after that
329 : * flush, redo will return here.
330 : */
331 210 : if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
332 176 : log_newpage_buffer(buf, false);
333 :
334 210 : END_CRIT_SECTION();
335 :
336 210 : UnlockReleaseBuffer(buf);
337 :
338 210 : new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
339 : }
340 : else
341 : {
342 156 : new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
343 156 : if (smgrnblocks(RelationGetSmgr(rel), FSM_FORKNUM) <= new_nfsmblocks)
344 0 : return InvalidBlockNumber; /* nothing to do; the FSM was already
345 : * smaller */
346 : }
347 :
348 366 : return new_nfsmblocks;
349 : }
350 :
351 : /*
352 : * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM
353 : *
354 : * We assume that the bottom-level pages have already been updated with
355 : * new free-space information.
356 : */
357 : void
358 270 : FreeSpaceMapVacuum(Relation rel)
359 : {
360 : bool dummy;
361 :
362 : /* Recursively scan the tree, starting at the root */
363 270 : (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS,
364 : (BlockNumber) 0, InvalidBlockNumber,
365 : &dummy);
366 270 : }
367 :
368 : /*
369 : * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM
370 : *
371 : * As above, but assume that only heap pages between start and end-1 inclusive
372 : * have new free-space information, so update only the upper-level slots
373 : * covering that block range. end == InvalidBlockNumber is equivalent to
374 : * "all the rest of the relation".
375 : */
376 : void
377 48094 : FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
378 : {
379 : bool dummy;
380 :
381 : /* Recursively scan the tree, starting at the root */
382 48094 : if (end > start)
383 47752 : (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
384 48094 : }
385 :
386 : /******** Internal routines ********/
387 :
388 : /*
389 : * Return category corresponding x bytes of free space
390 : */
391 : static uint8
392 1536724 : fsm_space_avail_to_cat(Size avail)
393 : {
394 : int cat;
395 :
396 : Assert(avail < BLCKSZ);
397 :
398 1536724 : if (avail >= MaxFSMRequestSize)
399 39934 : return 255;
400 :
401 1496790 : cat = avail / FSM_CAT_STEP;
402 :
403 : /*
404 : * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
405 : * more.
406 : */
407 1496790 : if (cat > 254)
408 0 : cat = 254;
409 :
410 1496790 : return (uint8) cat;
411 : }
412 :
413 : /*
414 : * Return the lower bound of the range of free space represented by given
415 : * category.
416 : */
417 : static Size
418 1924 : fsm_space_cat_to_avail(uint8 cat)
419 : {
420 : /* The highest category represents exactly MaxFSMRequestSize bytes. */
421 1924 : if (cat == 255)
422 956 : return MaxFSMRequestSize;
423 : else
424 968 : return cat * FSM_CAT_STEP;
425 : }
426 :
427 : /*
428 : * Which category does a page need to have, to accommodate x bytes of data?
429 : * While fsm_space_avail_to_cat() rounds down, this needs to round up.
430 : */
431 : static uint8
432 358710 : fsm_space_needed_to_cat(Size needed)
433 : {
434 : int cat;
435 :
436 : /* Can't ask for more space than the highest category represents */
437 358710 : if (needed > MaxFSMRequestSize)
438 0 : elog(ERROR, "invalid FSM request size %zu", needed);
439 :
440 358710 : if (needed == 0)
441 0 : return 1;
442 :
443 358710 : cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
444 :
445 358710 : if (cat > 255)
446 0 : cat = 255;
447 :
448 358710 : return (uint8) cat;
449 : }
450 :
451 : /*
452 : * Returns the physical block number of a FSM page
453 : */
454 : static BlockNumber
455 2067756 : fsm_logical_to_physical(FSMAddress addr)
456 : {
457 : BlockNumber pages;
458 : int leafno;
459 : int l;
460 :
461 : /*
462 : * Calculate the logical page number of the first leaf page below the
463 : * given page.
464 : */
465 2067756 : leafno = addr.logpageno;
466 2910888 : for (l = 0; l < addr.level; l++)
467 843132 : leafno *= SlotsPerFSMPage;
468 :
469 : /* Count upper level nodes required to address the leaf page */
470 2067756 : pages = 0;
471 8271024 : for (l = 0; l < FSM_TREE_DEPTH; l++)
472 : {
473 6203268 : pages += leafno + 1;
474 6203268 : leafno /= SlotsPerFSMPage;
475 : }
476 :
477 : /*
478 : * If the page we were asked for wasn't at the bottom level, subtract the
479 : * additional lower level pages we counted above.
480 : */
481 2067756 : pages -= addr.level;
482 :
483 : /* Turn the page count into 0-based block number */
484 2067756 : return pages - 1;
485 : }
486 :
487 : /*
488 : * Return the FSM location corresponding to given heap block.
489 : */
490 : static FSMAddress
491 1731022 : fsm_get_location(BlockNumber heapblk, uint16 *slot)
492 : {
493 : FSMAddress addr;
494 :
495 1731022 : addr.level = FSM_BOTTOM_LEVEL;
496 1731022 : addr.logpageno = heapblk / SlotsPerFSMPage;
497 1731022 : *slot = heapblk % SlotsPerFSMPage;
498 :
499 1731022 : return addr;
500 : }
501 :
502 : /*
503 : * Return the heap block number corresponding to given location in the FSM.
504 : */
505 : static BlockNumber
506 47170 : fsm_get_heap_blk(FSMAddress addr, uint16 slot)
507 : {
508 : Assert(addr.level == FSM_BOTTOM_LEVEL);
509 47170 : return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
510 : }
511 :
512 : /*
513 : * Given a logical address of a child page, get the logical page number of
514 : * the parent, and the slot within the parent corresponding to the child.
515 : */
516 : static FSMAddress
517 291440 : fsm_get_parent(FSMAddress child, uint16 *slot)
518 : {
519 : FSMAddress parent;
520 :
521 : Assert(child.level < FSM_ROOT_LEVEL);
522 :
523 291440 : parent.level = child.level + 1;
524 291440 : parent.logpageno = child.logpageno / SlotsPerFSMPage;
525 291440 : *slot = child.logpageno % SlotsPerFSMPage;
526 :
527 291440 : return parent;
528 : }
529 :
530 : /*
531 : * Given a logical address of a parent page and a slot number, get the
532 : * logical address of the corresponding child page.
533 : */
534 : static FSMAddress
535 141988 : fsm_get_child(FSMAddress parent, uint16 slot)
536 : {
537 : FSMAddress child;
538 :
539 : Assert(parent.level > FSM_BOTTOM_LEVEL);
540 :
541 141988 : child.level = parent.level - 1;
542 141988 : child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
543 :
544 141988 : return child;
545 : }
546 :
547 : /*
548 : * Read a FSM page.
549 : *
550 : * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
551 : * true, the FSM file is extended.
552 : */
553 : static Buffer
554 1496198 : fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
555 : {
556 1496198 : BlockNumber blkno = fsm_logical_to_physical(addr);
557 : Buffer buf;
558 1496198 : SMgrRelation reln = RelationGetSmgr(rel);
559 :
560 : /*
561 : * If we haven't cached the size of the FSM yet, check it first. Also
562 : * recheck if the requested block seems to be past end, since our cached
563 : * value might be stale. (We send smgr inval messages on truncation, but
564 : * not on extension.)
565 : */
566 1496198 : if (reln->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber ||
567 1342548 : blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
568 : {
569 : /* Invalidate the cache so smgrnblocks asks the kernel. */
570 216816 : reln->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
571 216816 : if (smgrexists(reln, FSM_FORKNUM))
572 98452 : smgrnblocks(reln, FSM_FORKNUM);
573 : else
574 118364 : reln->smgr_cached_nblocks[FSM_FORKNUM] = 0;
575 : }
576 :
577 : /*
578 : * For reading we use ZERO_ON_ERROR mode, and initialize the page if
579 : * necessary. The FSM information is not accurate anyway, so it's better
580 : * to clear corrupt pages than error out. Since the FSM changes are not
581 : * WAL-logged, the so-called torn page problem on crash can lead to pages
582 : * with corrupt headers, for example.
583 : *
584 : * We use the same path below to initialize pages when extending the
585 : * relation, as a concurrent extension can end up with vm_extend()
586 : * returning an already-initialized page.
587 : */
588 1496198 : if (blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
589 : {
590 119602 : if (extend)
591 7366 : buf = fsm_extend(rel, blkno + 1);
592 : else
593 112236 : return InvalidBuffer;
594 : }
595 : else
596 1376596 : buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
597 :
598 : /*
599 : * Initializing the page when needed is trickier than it looks, because of
600 : * the possibility of multiple backends doing this concurrently, and our
601 : * desire to not uselessly take the buffer lock in the normal path where
602 : * the page is OK. We must take the lock to initialize the page, so
603 : * recheck page newness after we have the lock, in case someone else
604 : * already did it. Also, because we initially check PageIsNew with no
605 : * lock, it's possible to fall through and return the buffer while someone
606 : * else is still initializing the page (i.e., we might see pd_upper as set
607 : * but other page header fields are still zeroes). This is harmless for
608 : * callers that will take a buffer lock themselves, but some callers
609 : * inspect the page without any lock at all. The latter is OK only so
610 : * long as it doesn't depend on the page header having correct contents.
611 : * Current usage is safe because PageGetContents() does not require that.
612 : */
613 1383962 : if (PageIsNew(BufferGetPage(buf)))
614 : {
615 24986 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
616 24986 : if (PageIsNew(BufferGetPage(buf)))
617 24986 : PageInit(BufferGetPage(buf), BLCKSZ, 0);
618 24986 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
619 : }
620 1383962 : return buf;
621 : }
622 :
623 : /*
624 : * Ensure that the FSM fork is at least fsm_nblocks long, extending
625 : * it if necessary with empty pages. And by empty, I mean pages filled
626 : * with zeros, meaning there's no free space.
627 : */
628 : static Buffer
629 7366 : fsm_extend(Relation rel, BlockNumber fsm_nblocks)
630 : {
631 7366 : return ExtendBufferedRelTo(BMR_REL(rel), FSM_FORKNUM, NULL,
632 : EB_CREATE_FORK_IF_NEEDED |
633 : EB_CLEAR_SIZE_CACHE,
634 : fsm_nblocks,
635 : RBM_ZERO_ON_ERROR);
636 : }
637 :
638 : /*
639 : * Set value in given FSM page and slot.
640 : *
641 : * If minValue > 0, the updated page is also searched for a page with at
642 : * least minValue of free space. If one is found, its slot number is
643 : * returned, -1 otherwise.
644 : */
645 : static int
646 969068 : fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
647 : uint8 newValue, uint8 minValue)
648 : {
649 : Buffer buf;
650 : Page page;
651 969068 : int newslot = -1;
652 :
653 969068 : buf = fsm_readbuf(rel, addr, true);
654 969068 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
655 :
656 969068 : page = BufferGetPage(buf);
657 :
658 969068 : if (fsm_set_avail(page, slot, newValue))
659 194152 : MarkBufferDirtyHint(buf, false);
660 :
661 969068 : if (minValue != 0)
662 : {
663 : /* Search while we still hold the lock */
664 193460 : newslot = fsm_search_avail(buf, minValue,
665 193460 : addr.level == FSM_BOTTOM_LEVEL,
666 : true);
667 : }
668 :
669 969068 : UnlockReleaseBuffer(buf);
670 :
671 969068 : return newslot;
672 : }
673 :
674 : /*
675 : * Search the tree for a heap page with at least min_cat of free space
676 : */
677 : static BlockNumber
678 331378 : fsm_search(Relation rel, uint8 min_cat)
679 : {
680 331378 : int restarts = 0;
681 331378 : FSMAddress addr = FSM_ROOT_ADDRESS;
682 :
683 : for (;;)
684 48516 : {
685 : int slot;
686 : Buffer buf;
687 379894 : uint8 max_avail = 0;
688 :
689 : /* Read the FSM page. */
690 379894 : buf = fsm_readbuf(rel, addr, false);
691 :
692 : /* Search within the page */
693 379894 : if (BufferIsValid(buf))
694 : {
695 268964 : LockBuffer(buf, BUFFER_LOCK_SHARE);
696 268964 : slot = fsm_search_avail(buf, min_cat,
697 268964 : (addr.level == FSM_BOTTOM_LEVEL),
698 : false);
699 268964 : if (slot == -1)
700 : {
701 204146 : max_avail = fsm_get_max_avail(BufferGetPage(buf));
702 204146 : UnlockReleaseBuffer(buf);
703 : }
704 : else
705 : {
706 : /* Keep the pin for possible update below */
707 64818 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
708 : }
709 : }
710 : else
711 110930 : slot = -1;
712 :
713 379894 : if (slot != -1)
714 : {
715 : /*
716 : * Descend the tree, or return the found block if we're at the
717 : * bottom.
718 : */
719 64818 : if (addr.level == FSM_BOTTOM_LEVEL)
720 : {
721 19838 : BlockNumber blkno = fsm_get_heap_blk(addr, slot);
722 : Page page;
723 :
724 19838 : if (fsm_does_block_exist(rel, blkno))
725 : {
726 19838 : ReleaseBuffer(buf);
727 19838 : return blkno;
728 : }
729 :
730 : /*
731 : * Block is past the end of the relation. Update FSM, and
732 : * restart from root. The usual "advancenext" behavior is
733 : * pessimal for this rare scenario, since every later slot is
734 : * unusable in the same way. We could zero all affected slots
735 : * on the same FSM page, but don't bet on the benefits of that
736 : * optimization justifying its compiled code bulk.
737 : */
738 0 : page = BufferGetPage(buf);
739 0 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
740 0 : fsm_set_avail(page, slot, 0);
741 0 : MarkBufferDirtyHint(buf, false);
742 0 : UnlockReleaseBuffer(buf);
743 0 : if (restarts++ > 10000) /* same rationale as below */
744 0 : return InvalidBlockNumber;
745 0 : addr = FSM_ROOT_ADDRESS;
746 : }
747 : else
748 : {
749 44980 : ReleaseBuffer(buf);
750 : }
751 44980 : addr = fsm_get_child(addr, slot);
752 : }
753 315076 : else if (addr.level == FSM_ROOT_LEVEL)
754 : {
755 : /*
756 : * At the root, failure means there's no page with enough free
757 : * space in the FSM. Give up.
758 : */
759 311540 : return InvalidBlockNumber;
760 : }
761 : else
762 : {
763 : uint16 parentslot;
764 : FSMAddress parent;
765 :
766 : /*
767 : * At lower level, failure can happen if the value in the upper-
768 : * level node didn't reflect the value on the lower page. Update
769 : * the upper node, to avoid falling into the same trap again, and
770 : * start over.
771 : *
772 : * There's a race condition here, if another backend updates this
773 : * page right after we release it, and gets the lock on the parent
774 : * page before us. We'll then update the parent page with the now
775 : * stale information we had. It's OK, because it should happen
776 : * rarely, and will be fixed by the next vacuum.
777 : */
778 3536 : parent = fsm_get_parent(addr, &parentslot);
779 3536 : fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
780 :
781 : /*
782 : * If the upper pages are badly out of date, we might need to loop
783 : * quite a few times, updating them as we go. Any inconsistencies
784 : * should eventually be corrected and the loop should end. Looping
785 : * indefinitely is nevertheless scary, so provide an emergency
786 : * valve.
787 : */
788 3536 : if (restarts++ > 10000)
789 0 : return InvalidBlockNumber;
790 :
791 : /* Start search all over from the root */
792 3536 : addr = FSM_ROOT_ADDRESS;
793 : }
794 : }
795 : }
796 :
797 :
798 : /*
799 : * Recursive guts of FreeSpaceMapVacuum
800 : *
801 : * Examine the FSM page indicated by addr, as well as its children, updating
802 : * upper-level nodes that cover the heap block range from start to end-1.
803 : * (It's okay if end is beyond the actual end of the map.)
804 : * Return the maximum freespace value on this page.
805 : *
806 : * If addr is past the end of the FSM, set *eof_p to true and return 0.
807 : *
808 : * This traverses the tree in depth-first order. The tree is stored
809 : * physically in depth-first order, so this should be pretty I/O efficient.
810 : */
811 : static uint8
812 145030 : fsm_vacuum_page(Relation rel, FSMAddress addr,
813 : BlockNumber start, BlockNumber end,
814 : bool *eof_p)
815 : {
816 : Buffer buf;
817 : Page page;
818 : uint8 max_avail;
819 :
820 : /* Read the page if it exists, or return EOF */
821 145030 : buf = fsm_readbuf(rel, addr, false);
822 145030 : if (!BufferIsValid(buf))
823 : {
824 1234 : *eof_p = true;
825 1234 : return 0;
826 : }
827 : else
828 143796 : *eof_p = false;
829 :
830 143796 : page = BufferGetPage(buf);
831 :
832 : /*
833 : * If we're above the bottom level, recurse into children, and fix the
834 : * information stored about them at this level.
835 : */
836 143796 : if (addr.level > FSM_BOTTOM_LEVEL)
837 : {
838 : FSMAddress fsm_start,
839 : fsm_end;
840 : uint16 fsm_start_slot,
841 : fsm_end_slot;
842 : int slot,
843 : start_slot,
844 : end_slot;
845 95968 : bool eof = false;
846 :
847 : /*
848 : * Compute the range of slots we need to update on this page, given
849 : * the requested range of heap blocks to consider. The first slot to
850 : * update is the one covering the "start" block, and the last slot is
851 : * the one covering "end - 1". (Some of this work will be duplicated
852 : * in each recursive call, but it's cheap enough to not worry about.)
853 : */
854 95968 : fsm_start = fsm_get_location(start, &fsm_start_slot);
855 95968 : fsm_end = fsm_get_location(end - 1, &fsm_end_slot);
856 :
857 239920 : while (fsm_start.level < addr.level)
858 : {
859 143952 : fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot);
860 143952 : fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot);
861 : }
862 : Assert(fsm_start.level == addr.level);
863 :
864 95968 : if (fsm_start.logpageno == addr.logpageno)
865 95968 : start_slot = fsm_start_slot;
866 0 : else if (fsm_start.logpageno > addr.logpageno)
867 0 : start_slot = SlotsPerFSMPage; /* shouldn't get here... */
868 : else
869 0 : start_slot = 0;
870 :
871 95968 : if (fsm_end.logpageno == addr.logpageno)
872 95370 : end_slot = fsm_end_slot;
873 598 : else if (fsm_end.logpageno > addr.logpageno)
874 598 : end_slot = SlotsPerFSMPage - 1;
875 : else
876 0 : end_slot = -1; /* shouldn't get here... */
877 :
878 2779482 : for (slot = start_slot; slot <= end_slot; slot++)
879 : {
880 : int child_avail;
881 :
882 2683514 : CHECK_FOR_INTERRUPTS();
883 :
884 : /* After we hit end-of-file, just clear the rest of the slots */
885 2683514 : if (!eof)
886 97008 : child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
887 : start, end,
888 : &eof);
889 : else
890 2586506 : child_avail = 0;
891 :
892 : /* Update information about the child */
893 2683514 : if (fsm_get_avail(page, slot) != child_avail)
894 : {
895 14320 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
896 14320 : fsm_set_avail(page, slot, child_avail);
897 14320 : MarkBufferDirtyHint(buf, false);
898 14320 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
899 : }
900 : }
901 : }
902 :
903 : /* Now get the maximum value on the page, to return to caller */
904 143796 : max_avail = fsm_get_max_avail(page);
905 :
906 : /*
907 : * Reset the next slot pointer. This encourages the use of low-numbered
908 : * pages, increasing the chances that a later vacuum can truncate the
909 : * relation. We don't bother with a lock here, nor with marking the page
910 : * dirty if it wasn't already, since this is just a hint.
911 : */
912 143796 : ((FSMPage) PageGetContents(page))->fp_next_slot = 0;
913 :
914 143796 : ReleaseBuffer(buf);
915 :
916 143796 : return max_avail;
917 : }
918 :
919 :
920 : /*
921 : * Check whether a block number is past the end of the relation. This can
922 : * happen after WAL replay, if the FSM reached disk but newly-extended pages
923 : * it refers to did not.
924 : */
925 : static bool
926 47170 : fsm_does_block_exist(Relation rel, BlockNumber blknumber)
927 : {
928 47170 : SMgrRelation smgr = RelationGetSmgr(rel);
929 :
930 : /*
931 : * If below the cached nblocks, the block surely exists. Otherwise, we
932 : * face a trade-off. We opt to compare to a fresh nblocks, incurring
933 : * lseek() overhead. The alternative would be to assume the block does
934 : * not exist, but that would cause FSM to set zero space available for
935 : * blocks that main fork extension just recorded.
936 : */
937 47170 : return ((BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) &&
938 64972 : blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) ||
939 17802 : blknumber < RelationGetNumberOfBlocks(rel));
940 : }
|