Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * visibilitymap.c
4 : * bitmap for tracking visibility of heap tuples
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/heap/visibilitymap.c
12 : *
13 : * INTERFACE ROUTINES
14 : * visibilitymap_clear - clear bits for one page in the visibility map
15 : * visibilitymap_pin - pin a map page for setting a bit
16 : * visibilitymap_pin_ok - check whether correct map page is already pinned
17 : * visibilitymap_set - set bit(s) in a previously pinned page and log
18 : * visibilitymap_set_vmbits - set bit(s) in a pinned page
19 : * visibilitymap_get_status - get status of bits
20 : * visibilitymap_count - count number of bits set in visibility map
21 : * visibilitymap_prepare_truncate -
22 : * prepare for truncation of the visibility map
23 : *
24 : * NOTES
25 : *
26 : * The visibility map is a bitmap with two bits (all-visible and all-frozen)
27 : * per heap page. A set all-visible bit means that all tuples on the page are
28 : * known visible to all transactions, and therefore the page doesn't need to
29 : * be vacuumed. A set all-frozen bit means that all tuples on the page are
30 : * completely frozen, and therefore the page doesn't need to be vacuumed even
31 : * if whole table scanning vacuum is required (e.g. anti-wraparound vacuum).
32 : * The all-frozen bit must be set only when the page is already all-visible.
33 : *
34 : * The map is conservative in the sense that we make sure that whenever a bit
35 : * is set, we know the condition is true, but if a bit is not set, it might or
36 : * might not be true.
37 : *
38 : * Clearing visibility map bits is not separately WAL-logged. The callers
39 : * must make sure that whenever a bit is cleared, the bit is cleared on WAL
40 : * replay of the updating operation as well.
41 : *
42 : * When we *set* a visibility map during VACUUM, we must write WAL. This may
43 : * seem counterintuitive, since the bit is basically a hint: if it is clear,
44 : * it may still be the case that every tuple on the page is visible to all
45 : * transactions; we just don't know that for certain. The difficulty is that
46 : * there are two bits which are typically set together: the PD_ALL_VISIBLE bit
47 : * on the page itself, and the visibility map bit. If a crash occurs after the
48 : * visibility map page makes it to disk and before the updated heap page makes
49 : * it to disk, redo must set the bit on the heap page. Otherwise, the next
50 : * insert, update, or delete on the heap page will fail to realize that the
51 : * visibility map bit must be cleared, possibly causing index-only scans to
52 : * return wrong answers.
53 : *
54 : * VACUUM will normally skip pages for which the visibility map bit is set;
55 : * such pages can't contain any dead tuples and therefore don't need vacuuming.
56 : *
57 : * LOCKING
58 : *
59 : * In heapam.c, whenever a page is modified so that not all tuples on the
60 : * page are visible to everyone anymore, the corresponding bit in the
61 : * visibility map is cleared. In order to be crash-safe, we need to do this
62 : * while still holding a lock on the heap page and in the same critical
63 : * section that logs the page modification. However, we don't want to hold
64 : * the buffer lock over any I/O that may be required to read in the visibility
65 : * map page. To avoid this, we examine the heap page before locking it;
66 : * if the page-level PD_ALL_VISIBLE bit is set, we pin the visibility map
67 : * bit. Then, we lock the buffer. But this creates a race condition: there
68 : * is a possibility that in the time it takes to lock the buffer, the
69 : * PD_ALL_VISIBLE bit gets set. If that happens, we have to unlock the
70 : * buffer, pin the visibility map page, and relock the buffer. This shouldn't
71 : * happen often, because only VACUUM currently sets visibility map bits,
72 : * and the race will only occur if VACUUM processes a given page at almost
73 : * exactly the same time that someone tries to further modify it.
74 : *
75 : * To set a bit, you need to hold a lock on the heap page. That prevents
76 : * the race condition where VACUUM sees that all tuples on the page are
77 : * visible to everyone, but another backend modifies the page before VACUUM
78 : * sets the bit in the visibility map.
79 : *
80 : * When a bit is set, the LSN of the visibility map page is updated to make
81 : * sure that the visibility map update doesn't get written to disk before the
82 : * WAL record of the changes that made it possible to set the bit is flushed.
83 : * But when a bit is cleared, we don't have to do that because it's always
84 : * safe to clear a bit in the map from correctness point of view.
85 : *
86 : *-------------------------------------------------------------------------
87 : */
88 : #include "postgres.h"
89 :
90 : #include "access/heapam_xlog.h"
91 : #include "access/visibilitymap.h"
92 : #include "access/xloginsert.h"
93 : #include "access/xlogutils.h"
94 : #include "miscadmin.h"
95 : #include "port/pg_bitutils.h"
96 : #include "storage/bufmgr.h"
97 : #include "storage/smgr.h"
98 : #include "utils/inval.h"
99 : #include "utils/rel.h"
100 :
101 :
102 : /*#define TRACE_VISIBILITYMAP */
103 :
104 : /*
105 : * Size of the bitmap on each visibility map page, in bytes. There's no
106 : * extra headers, so the whole page minus the standard page header is
107 : * used for the bitmap.
108 : */
109 : #define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
110 :
111 : /* Number of heap blocks we can represent in one byte */
112 : #define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
113 :
114 : /* Number of heap blocks we can represent in one visibility map page. */
115 : #define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
116 :
117 : /* Mapping from heap block number to the right bit in the visibility map */
118 : #define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
119 : #define HEAPBLK_TO_MAPBLOCK_LIMIT(x) \
120 : (((x) + HEAPBLOCKS_PER_PAGE - 1) / HEAPBLOCKS_PER_PAGE)
121 : #define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
122 : #define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
123 :
124 : /* Masks for counting subsets of bits in the visibility map. */
125 : #define VISIBLE_MASK8 (0x55) /* The lower bit of each bit pair */
126 : #define FROZEN_MASK8 (0xaa) /* The upper bit of each bit pair */
127 :
128 : /* prototypes for internal routines */
129 : static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
130 : static Buffer vm_extend(Relation rel, BlockNumber vm_nblocks);
131 :
132 :
133 : /*
134 : * visibilitymap_clear - clear specified bits for one page in visibility map
135 : *
136 : * You must pass a buffer containing the correct map page to this function.
137 : * Call visibilitymap_pin first to pin the right one. This function doesn't do
138 : * any I/O. Returns true if any bits have been cleared and false otherwise.
139 : */
140 : bool
141 16802 : visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags)
142 : {
143 16802 : BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
144 16802 : int mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
145 16802 : int mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
146 16802 : uint8 mask = flags << mapOffset; /* bits of heapBlk's pair to clear */
147 : char *map;
148 16802 : bool cleared = false;
149 :
150 : /* Must never clear all_visible bit while leaving all_frozen bit set */
151 : Assert(flags & VISIBILITYMAP_VALID_BITS);
152 : Assert(flags != VISIBILITYMAP_ALL_VISIBLE);
153 :
154 : #ifdef TRACE_VISIBILITYMAP
155 : elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk);
156 : #endif
157 :
158 16802 : if (!BufferIsValid(vmbuf) || BufferGetBlockNumber(vmbuf) != mapBlock)
159 0 : elog(ERROR, "wrong buffer passed to visibilitymap_clear");
160 :
161 16802 : LockBuffer(vmbuf, BUFFER_LOCK_EXCLUSIVE); /* exclusive: we may modify the bitmap */
162 16802 : map = PageGetContents(BufferGetPage(vmbuf));
163 :
164 16802 : if (map[mapByte] & mask) /* only dirty the page if a bit actually changes */
165 : {
166 14748 : map[mapByte] &= ~mask;
167 :
168 14748 : MarkBufferDirty(vmbuf);
169 14748 : cleared = true;
170 : }
171 :
172 16802 : LockBuffer(vmbuf, BUFFER_LOCK_UNLOCK);
173 :
174 16802 : return cleared;
175 : }
176 :
177 : /*
178 : * visibilitymap_pin - pin a map page for setting a bit
179 : *
180 : * Setting a bit in the visibility map is a two-phase operation. First, call
181 : * visibilitymap_pin, to pin the visibility map page containing the bit for
182 : * the heap page. Because that can require I/O to read the map page, you
183 : * shouldn't hold a lock on the heap page while doing that. Then, call
184 : * visibilitymap_set to actually set the bit.
185 : *
186 : * On entry, *vmbuf should be InvalidBuffer or a valid buffer returned by
187 : * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
188 : * relation. On return, *vmbuf is a valid buffer with the map page containing
189 : * the bit for heapBlk.
190 : *
191 : * If the page doesn't exist in the map file yet, it is extended.
192 : */
193 : void
194 487269 : visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
195 : {
196 487269 : BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
197 :
198 : /* Reuse the old pinned buffer if possible */
199 487269 : if (BufferIsValid(*vmbuf))
200 : {
201 425364 : if (BufferGetBlockNumber(*vmbuf) == mapBlock)
202 425364 : return;
203 :
204 0 : ReleaseBuffer(*vmbuf); /* wrong map page pinned; swap it out */
205 : }
206 61905 : *vmbuf = vm_readbuf(rel, mapBlock, true); /* extend=true: create page if missing */
207 : }
208 :
209 : /*
210 : * visibilitymap_pin_ok - do we already have the correct page pinned?
211 : *
212 : * On entry, vmbuf should be InvalidBuffer or a valid buffer returned by
213 : * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
214 : * relation. The return value indicates whether the buffer covers the
215 : * given heapBlk.
216 : */
217 : bool
218 13239 : visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf)
219 : {
220 13239 : BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
221 :
222 13239 : return BufferIsValid(vmbuf) && BufferGetBlockNumber(vmbuf) == mapBlock; /* true iff vmbuf covers heapBlk's map page */
223 : }
224 :
225 : /*
226 : * visibilitymap_set - set bit(s) on a previously pinned page
227 : *
228 : * recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
229 : * or InvalidXLogRecPtr in normal running. The VM page LSN is advanced to the
230 : * one provided; in normal running, we generate a new XLOG record and set the
231 : * page LSN to that value (though the heap page's LSN may *not* be updated;
232 : * see below). cutoff_xid is the largest xmin on the page being marked
233 : * all-visible; it is needed for Hot Standby, and can be InvalidTransactionId
234 : * if the page contains no tuples. It can also be set to InvalidTransactionId
235 : * when a page that is already all-visible is being marked all-frozen.
236 : *
237 : * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling
238 : * this function. Except in recovery, caller should also pass the heap
239 : * buffer. When checksums are enabled and we're not in recovery, we must add
240 : * the heap buffer to the WAL chain to protect it from being torn.
241 : *
242 : * You must pass a buffer containing the correct map page to this function.
243 : * Call visibilitymap_pin first to pin the right one. This function doesn't do
244 : * any I/O.
245 : */
246 : void
247 41525 : visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
248 : XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid,
249 : uint8 flags)
250 : {
251 41525 : BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
252 41525 : uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
253 41525 : uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
254 : Page page;
255 : uint8 *map;
256 : uint8 status;
257 :
258 : #ifdef TRACE_VISIBILITYMAP
259 : elog(DEBUG1, "vm_set flags 0x%02X for %s %d",
260 : flags, RelationGetRelationName(rel), heapBlk);
261 : #endif
262 :
263 : Assert(InRecovery || !XLogRecPtrIsValid(recptr));
264 : Assert(InRecovery || PageIsAllVisible(BufferGetPage(heapBuf)));
265 : Assert((flags & VISIBILITYMAP_VALID_BITS) == flags);
266 :
267 : /* Must never set all_frozen bit without also setting all_visible bit */
268 : Assert(flags != VISIBILITYMAP_ALL_FROZEN);
269 :
270 : /* Check that we have the right heap page pinned, if present */
271 41525 : if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk)
272 0 : elog(ERROR, "wrong heap buffer passed to visibilitymap_set");
273 :
274 : Assert(!BufferIsValid(heapBuf) ||
275 : BufferIsLockedByMeInMode(heapBuf, BUFFER_LOCK_EXCLUSIVE));
276 :
277 : /* Check that we have the right VM page pinned */
278 41525 : if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock)
279 0 : elog(ERROR, "wrong VM buffer passed to visibilitymap_set");
280 :
281 41525 : page = BufferGetPage(vmBuf);
282 41525 : map = (uint8 *) PageGetContents(page);
283 41525 : LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE);
284 :
285 41525 : status = (map[mapByte] >> mapOffset) & VISIBILITYMAP_VALID_BITS;
286 41525 : if (flags != status) /* skip WAL and dirtying if the bits are already set */
287 : {
288 41525 : START_CRIT_SECTION();
289 :
290 41525 : map[mapByte] |= (flags << mapOffset);
291 41525 : MarkBufferDirty(vmBuf);
292 :
293 41525 : if (RelationNeedsWAL(rel))
294 : {
295 39965 : if (!XLogRecPtrIsValid(recptr)) /* normal running: emit our own WAL record */
296 : {
297 : Assert(!InRecovery);
298 35015 : recptr = log_heap_visible(rel, heapBuf, vmBuf, cutoff_xid, flags);
299 :
300 : /*
301 : * If data checksums are enabled (or wal_log_hints=on), we
302 : * need to protect the heap page from being torn.
303 : *
304 : * If not, then we must *not* update the heap page's LSN. In
305 : * this case, the FPI for the heap page was omitted from the
306 : * WAL record inserted above, so it would be incorrect to
307 : * update the heap page's LSN.
308 : */
309 35015 : if (XLogHintBitIsNeeded())
310 : {
311 31867 : Page heapPage = BufferGetPage(heapBuf);
312 :
313 31867 : PageSetLSN(heapPage, recptr);
314 : }
315 : }
316 39965 : PageSetLSN(page, recptr); /* keep VM page from reaching disk before its WAL */
317 : }
318 :
319 41525 : END_CRIT_SECTION();
320 : }
321 :
322 41525 : LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK);
323 41525 : }
324 :
325 : /*
326 : * Set VM (visibility map) flags in the VM block in vmBuf.
327 : *
328 : * This function is intended for callers that log VM changes together
329 : * with the heap page modifications that rendered the page all-visible.
330 : * Callers that log VM changes separately should use visibilitymap_set().
331 : *
332 : * vmBuf must be pinned and exclusively locked, and it must cover the VM bits
333 : * corresponding to heapBlk.
334 : *
335 : * In normal operation (not recovery), this must be called inside a critical
336 : * section that also applies the necessary heap page changes and, if
337 : * applicable, emits WAL.
338 : *
339 : * The caller is responsible for ensuring consistency between the heap page
340 : * and the VM page by holding a pin and exclusive lock on the buffer
341 : * containing heapBlk.
342 : *
343 : * rlocator is used only for debugging messages.
344 : */
345 : void
346 16995 : visibilitymap_set_vmbits(BlockNumber heapBlk,
347 : Buffer vmBuf, uint8 flags,
348 : const RelFileLocator rlocator)
349 : {
350 16995 : BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
351 16995 : uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
352 16995 : uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
353 : Page page;
354 : uint8 *map;
355 : uint8 status;
356 :
357 : #ifdef TRACE_VISIBILITYMAP
358 : elog(DEBUG1, "vm_set flags 0x%02X for %s %d",
359 : flags,
360 : relpathbackend(rlocator, MyProcNumber, MAIN_FORKNUM).str,
361 : heapBlk);
362 : #endif
363 :
364 : /* Call in same critical section where WAL is emitted. */
365 : Assert(InRecovery || CritSectionCount > 0);
366 :
367 : /* Flags should be valid. Also never clear bits with this function */
368 : Assert((flags & VISIBILITYMAP_VALID_BITS) == flags);
369 :
370 : /* Must never set all_frozen bit without also setting all_visible bit */
371 : Assert(flags != VISIBILITYMAP_ALL_FROZEN);
372 :
373 : /* Check that we have the right VM page pinned */
374 16995 : if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock)
375 0 : elog(ERROR, "wrong VM buffer passed to visibilitymap_set_vmbits");
376 :
377 : Assert(BufferIsLockedByMeInMode(vmBuf, BUFFER_LOCK_EXCLUSIVE));
378 :
379 16995 : page = BufferGetPage(vmBuf);
380 16995 : map = (uint8 *) PageGetContents(page);
381 :
382 16995 : status = (map[mapByte] >> mapOffset) & VISIBILITYMAP_VALID_BITS;
383 16995 : if (flags != status) /* only dirty the page if the bits actually change */
384 : {
385 16995 : map[mapByte] |= (flags << mapOffset);
386 16995 : MarkBufferDirty(vmBuf);
387 : }
388 16995 : }
389 :
390 : /*
391 : * visibilitymap_get_status - get status of bits
392 : *
393 : * Are all tuples on heapBlk visible to all or are marked frozen, according
394 : * to the visibility map?
395 : *
396 : * On entry, *vmbuf should be InvalidBuffer or a valid buffer returned by an
397 : * earlier call to visibilitymap_pin or visibilitymap_get_status on the same
398 : * relation. On return, *vmbuf is a valid buffer with the map page containing
399 : * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
400 : * releasing *vmbuf after it's done testing and setting bits.
401 : *
402 : * NOTE: This function is typically called without a lock on the heap page,
403 : * so somebody else could change the bit just after we look at it. In fact,
404 : * since we don't lock the visibility map page either, it's even possible that
405 : * someone else could have changed the bit just before we look at it, but yet
406 : * we might see the old value. It is the caller's responsibility to deal with
407 : * all concurrency issues!
408 : */
409 : uint8
410 4036032 : visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
411 : {
412 4036032 : BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
413 4036032 : uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
414 4036032 : uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
415 : char *map;
416 : uint8 result;
417 :
418 : #ifdef TRACE_VISIBILITYMAP
419 : elog(DEBUG1, "vm_get_status %s %d", RelationGetRelationName(rel), heapBlk);
420 : #endif
421 :
422 : /* Reuse the old pinned buffer if possible */
423 4036032 : if (BufferIsValid(*vmbuf))
424 : {
425 3160551 : if (BufferGetBlockNumber(*vmbuf) != mapBlock)
426 : {
427 0 : ReleaseBuffer(*vmbuf);
428 0 : *vmbuf = InvalidBuffer;
429 : }
430 : }
431 :
432 4036032 : if (!BufferIsValid(*vmbuf))
433 : {
434 875481 : *vmbuf = vm_readbuf(rel, mapBlock, false); /* extend=false: don't create the page */
435 875481 : if (!BufferIsValid(*vmbuf))
436 767088 : return (uint8) 0; /* no map page: bits are implicitly clear */
437 : }
438 :
439 3268944 : map = PageGetContents(BufferGetPage(*vmbuf));
440 :
441 : /*
442 : * A single byte read is atomic. There could be memory-ordering effects
443 : * here, but for performance reasons we make it the caller's job to worry
444 : * about that.
445 : */
446 3268944 : result = ((map[mapByte] >> mapOffset) & VISIBILITYMAP_VALID_BITS);
447 3268944 : return result;
448 : }
449 :
450 : /*
451 : * visibilitymap_count - count number of bits set in visibility map
452 : *
453 : * Note: we ignore the possibility of race conditions when the table is being
454 : * extended concurrently with the call. New pages added to the table aren't
455 : * going to be marked all-visible or all-frozen, so they won't affect the result.
456 : */
457 : void
458 127201 : visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen)
459 : {
460 : BlockNumber mapBlock;
461 127201 : BlockNumber nvisible = 0;
462 127201 : BlockNumber nfrozen = 0;
463 :
464 : /* all_visible must be specified */
465 : Assert(all_visible);
466 :
467 127201 : for (mapBlock = 0;; mapBlock++)
468 47630 : {
469 : Buffer mapBuffer;
470 : uint64 *map;
471 :
472 : /*
473 : * Read till we fall off the end of the map. We assume that any extra
474 : * bytes in the last page are zeroed, so we don't bother excluding
475 : * them from the count.
476 : */
477 174831 : mapBuffer = vm_readbuf(rel, mapBlock, false);
478 174831 : if (!BufferIsValid(mapBuffer))
479 127201 : break;
480 :
481 : /*
482 : * We choose not to lock the page, since the result is going to be
483 : * immediately stale anyway if anyone is concurrently setting or
484 : * clearing bits, and we only really need an approximate value.
485 : */
486 47630 : map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer));
487 :
488 47630 : nvisible += pg_popcount_masked((const char *) map, MAPSIZE, VISIBLE_MASK8);
489 47630 : if (all_frozen) /* the all_frozen output is optional */
490 47630 : nfrozen += pg_popcount_masked((const char *) map, MAPSIZE, FROZEN_MASK8);
491 :
492 47630 : ReleaseBuffer(mapBuffer);
493 : }
494 :
495 127201 : *all_visible = nvisible;
496 127201 : if (all_frozen)
497 127201 : *all_frozen = nfrozen;
498 127201 : }
499 :
500 : /*
501 : * visibilitymap_prepare_truncate -
502 : * prepare for truncation of the visibility map
503 : *
504 : * nheapblocks is the new size of the heap.
505 : *
506 : * Return the number of blocks of new visibility map.
507 : * If it's InvalidBlockNumber, there is nothing to truncate;
508 : * otherwise the caller is responsible for calling smgrtruncate()
509 : * to truncate the visibility map pages.
510 : */
511 : BlockNumber
512 175 : visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks)
513 : {
514 : BlockNumber newnblocks;
515 :
516 : /* last remaining block, byte, and bit */
517 175 : BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks);
518 175 : uint32 truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks);
519 175 : uint8 truncOffset = HEAPBLK_TO_OFFSET(nheapblocks);
520 :
521 : #ifdef TRACE_VISIBILITYMAP
522 : elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks);
523 : #endif
524 :
525 : /*
526 : * If no visibility map has been created yet for this relation, there's
527 : * nothing to truncate.
528 : */
529 175 : if (!smgrexists(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM))
530 0 : return InvalidBlockNumber;
531 :
532 : /*
533 : * Unless the new size is exactly at a visibility map page boundary, the
534 : * tail bits in the last remaining map page, representing truncated heap
535 : * blocks, need to be cleared. This is not only tidy, but also necessary
536 : * because we don't get a chance to clear the bits if the heap is extended
537 : * again.
538 : */
539 175 : if (truncByte != 0 || truncOffset != 0)
540 101 : {
541 : Buffer mapBuffer;
542 : Page page;
543 : char *map;
544 :
545 101 : newnblocks = truncBlock + 1; /* keep the partially-used last map page */
546 :
547 101 : mapBuffer = vm_readbuf(rel, truncBlock, false);
548 101 : if (!BufferIsValid(mapBuffer))
549 : {
550 : /* nothing to do, the file was already smaller */
551 0 : return InvalidBlockNumber;
552 : }
553 :
554 101 : page = BufferGetPage(mapBuffer);
555 101 : map = PageGetContents(page);
556 :
557 101 : LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);
558 :
559 : /* NO EREPORT(ERROR) from here till changes are logged */
560 101 : START_CRIT_SECTION();
561 :
562 : /* Clear out the unwanted bytes. */
563 101 : MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));
564 :
565 : /*----
566 : * Mask out the unwanted bits of the last remaining byte.
567 : *
568 : * ((1 << 0) - 1) = 00000000
569 : * ((1 << 1) - 1) = 00000001
570 : * ...
571 : * ((1 << 6) - 1) = 00111111
572 : * ((1 << 7) - 1) = 01111111
573 : *----
574 : */
575 101 : map[truncByte] &= (1 << truncOffset) - 1;
576 :
577 : /*
578 : * Truncation of a relation is WAL-logged at a higher-level, and we
579 : * will be called at WAL replay. But if checksums are enabled, we need
580 : * to still write a WAL record to protect against a torn page, if the
581 : * page is flushed to disk before the truncation WAL record. We cannot
582 : * use MarkBufferDirtyHint here, because that will not dirty the page
583 : * during recovery.
584 : */
585 101 : MarkBufferDirty(mapBuffer);
586 101 : if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
587 78 : log_newpage_buffer(mapBuffer, false);
588 :
589 101 : END_CRIT_SECTION();
590 :
591 101 : UnlockReleaseBuffer(mapBuffer);
592 : }
593 : else
594 74 : newnblocks = truncBlock; /* exact page boundary: no partial page to keep */
595 :
596 175 : if (smgrnblocks(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM) <= newnblocks)
597 : {
598 : /* nothing to do, the file was already smaller than requested size */
599 101 : return InvalidBlockNumber;
600 : }
601 :
602 74 : return newnblocks;
603 : }
604 :
605 : /*
606 : * visibilitymap_truncation_length -
607 : * compute truncation length for visibility map
608 : *
609 : * Given a proposed truncation length for the main fork, compute the
610 : * correct truncation length for the visibility map. Should return the
611 : * same answer as visibilitymap_prepare_truncate(), but without modifying
612 : * anything.
613 : */
614 : BlockNumber
615 1 : visibilitymap_truncation_length(BlockNumber nheapblocks)
616 : {
617 1 : return HEAPBLK_TO_MAPBLOCK_LIMIT(nheapblocks); /* ceiling division: a partially-used map page is kept */
618 : }
619 :
620 : /*
621 : * Read a visibility map page.
622 : *
623 : * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
624 : * true, the visibility map file is extended.
625 : */
626 : static Buffer
627 1112318 : vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
628 : {
629 : Buffer buf;
630 : SMgrRelation reln;
631 :
632 : /*
633 : * Caution: re-using this smgr pointer could fail if the relcache entry
634 : * gets closed. It's safe as long as we only do smgr-level operations
635 : * between here and the last use of the pointer.
636 : */
637 1112318 : reln = RelationGetSmgr(rel);
638 :
639 : /*
640 : * If we haven't cached the size of the visibility map fork yet, check it
641 : * first.
642 : */
643 1112318 : if (reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] == InvalidBlockNumber)
644 : {
645 139867 : if (smgrexists(reln, VISIBILITYMAP_FORKNUM))
646 58493 : smgrnblocks(reln, VISIBILITYMAP_FORKNUM); /* result unused; called to populate the size cache read below */
647 : else
648 81374 : reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = 0;
649 : }
650 :
651 : /*
652 : * For reading we use ZERO_ON_ERROR mode, and initialize the page if
653 : * necessary. It's always safe to clear bits, so it's better to clear
654 : * corrupt pages than error out.
655 : *
656 : * We use the same path below to initialize pages when extending the
657 : * relation, as a concurrent extension can end up with vm_extend()
658 : * returning an already-initialized page.
659 : */
660 1112318 : if (blkno >= reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM])
661 : {
662 897332 : if (extend)
663 3043 : buf = vm_extend(rel, blkno + 1);
664 : else
665 894289 : return InvalidBuffer; /* page doesn't exist and caller didn't ask to create it */
666 : }
667 : else
668 214986 : buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno,
669 : RBM_ZERO_ON_ERROR, NULL);
670 :
671 : /*
672 : * Initializing the page when needed is trickier than it looks, because of
673 : * the possibility of multiple backends doing this concurrently, and our
674 : * desire to not uselessly take the buffer lock in the normal path where
675 : * the page is OK. We must take the lock to initialize the page, so
676 : * recheck page newness after we have the lock, in case someone else
677 : * already did it. Also, because we initially check PageIsNew with no
678 : * lock, it's possible to fall through and return the buffer while someone
679 : * else is still initializing the page (i.e., we might see pd_upper as set
680 : * but other page header fields are still zeroes). This is harmless for
681 : * callers that will take a buffer lock themselves, but some callers
682 : * inspect the page without any lock at all. The latter is OK only so
683 : * long as it doesn't depend on the page header having correct contents.
684 : * Current usage is safe because PageGetContents() does not require that.
685 : */
686 218029 : if (PageIsNew(BufferGetPage(buf)))
687 : {
688 3108 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
689 3108 : if (PageIsNew(BufferGetPage(buf))) /* recheck under lock; someone may have beaten us */
690 3108 : PageInit(BufferGetPage(buf), BLCKSZ, 0);
691 3108 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
692 : }
693 218029 : return buf;
694 : }
695 :
696 : /*
697 : * Ensure that the visibility map fork is at least vm_nblocks long, extending
698 : * it if necessary with zeroed pages.
699 : */
700 : static Buffer
701 3043 : vm_extend(Relation rel, BlockNumber vm_nblocks)
702 : {
703 : Buffer buf;
704 :
705 3043 : buf = ExtendBufferedRelTo(BMR_REL(rel), VISIBILITYMAP_FORKNUM, NULL,
706 : EB_CREATE_FORK_IF_NEEDED |
707 : EB_CLEAR_SIZE_CACHE,
708 : vm_nblocks,
709 : RBM_ZERO_ON_ERROR); /* NOTE: caller (vm_readbuf) relies on this being the buffer for block vm_nblocks - 1 */
710 :
711 : /*
712 : * Send a shared-inval message to force other backends to close any smgr
713 : * references they may have for this rel, which we are about to change.
714 : * This is a useful optimization because it means that backends don't have
715 : * to keep checking for creation or extension of the file, which happens
716 : * infrequently.
717 : */
718 3043 : CacheInvalidateSmgr(RelationGetSmgr(rel)->smgr_rlocator);
719 :
720 3043 : return buf;
721 : }
|