Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tidstore.c
4 : * TID (ItemPointerData) storage implementation.
5 : *
6 : * TidStore is a in-memory data structure to store TIDs (ItemPointerData).
7 : * Internally it uses a radix tree as the storage for TIDs. The key is the
8 : * BlockNumber and the value is a bitmap of offsets, BlocktableEntry.
9 : *
10 : * TidStore can be shared among parallel worker processes by using
11 : * TidStoreCreateShared(). Other backends can attach to the shared TidStore
12 : * by TidStoreAttach().
13 : *
14 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
15 : * Portions Copyright (c) 1994, Regents of the University of California
16 : *
17 : * IDENTIFICATION
18 : * src/backend/access/common/tidstore.c
19 : *
20 : *-------------------------------------------------------------------------
21 : */
22 : #include "postgres.h"
23 :
24 : #include "access/tidstore.h"
25 : #include "miscadmin.h"
26 : #include "nodes/bitmapset.h"
27 : #include "storage/lwlock.h"
28 : #include "utils/dsa.h"
29 :
30 :
31 : #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
32 : #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
33 :
34 : /* number of active words for a page: */
35 : #define WORDS_PER_PAGE(n) ((n) / BITS_PER_BITMAPWORD + 1)
36 :
37 : /* number of offsets we can store in the header of a BlocktableEntry */
38 : #define NUM_FULL_OFFSETS ((sizeof(uintptr_t) - sizeof(uint8) - sizeof(int8)) / sizeof(OffsetNumber))
39 :
40 : /*
41 : * This is named similarly to PagetableEntry in tidbitmap.c
42 : * because the two have a similar function.
43 : */
44 : typedef struct BlocktableEntry
45 : {
46 : struct
47 : {
48 : #ifndef WORDS_BIGENDIAN
49 : /*
50 : * We need to position this member to reserve space for the backing
51 : * radix tree to tag the lowest bit when struct 'header' is stored
52 : * inside a pointer or DSA pointer.
53 : */
54 : uint8 flags;
55 :
56 : int8 nwords;
57 : #endif
58 :
59 : /*
60 : * We can store a small number of offsets here to avoid wasting space
61 : * with a sparse bitmap.
62 : */
63 : OffsetNumber full_offsets[NUM_FULL_OFFSETS];
64 :
65 : #ifdef WORDS_BIGENDIAN
66 : int8 nwords;
67 : uint8 flags;
68 : #endif
69 : } header;
70 :
71 : /*
72 : * We don't expect any padding space here, but to be cautious, code
73 : * creating new entries should zero out space up to 'words'.
74 : */
75 :
76 : bitmapword words[FLEXIBLE_ARRAY_MEMBER];
77 : } BlocktableEntry;
78 :
79 : /*
80 : * The type of 'nwords' limits the max number of words in the 'words' array.
81 : * This computes the max offset we can actually store in the bitmap. In
82 : * practice, it's almost always the same as MaxOffsetNumber.
83 : */
84 : #define MAX_OFFSET_IN_BITMAP Min(BITS_PER_BITMAPWORD * PG_INT8_MAX - 1, MaxOffsetNumber)
85 :
86 : #define MaxBlocktableEntrySize \
87 : offsetof(BlocktableEntry, words) + \
88 : (sizeof(bitmapword) * WORDS_PER_PAGE(MAX_OFFSET_IN_BITMAP))
89 :
90 : #define RT_PREFIX local_ts
91 : #define RT_SCOPE static
92 : #define RT_DECLARE
93 : #define RT_DEFINE
94 : #define RT_VALUE_TYPE BlocktableEntry
95 : #define RT_VARLEN_VALUE_SIZE(page) \
96 : (offsetof(BlocktableEntry, words) + \
97 : sizeof(bitmapword) * (page)->header.nwords)
98 : #define RT_RUNTIME_EMBEDDABLE_VALUE
99 : #include "lib/radixtree.h"
100 :
101 : #define RT_PREFIX shared_ts
102 : #define RT_SHMEM
103 : #define RT_SCOPE static
104 : #define RT_DECLARE
105 : #define RT_DEFINE
106 : #define RT_VALUE_TYPE BlocktableEntry
107 : #define RT_VARLEN_VALUE_SIZE(page) \
108 : (offsetof(BlocktableEntry, words) + \
109 : sizeof(bitmapword) * (page)->header.nwords)
110 : #define RT_RUNTIME_EMBEDDABLE_VALUE
111 : #include "lib/radixtree.h"
112 :
113 : /* Per-backend state for a TidStore */
114 : struct TidStore
115 : {
116 : /*
117 : * MemoryContext for the radix tree when using local memory, NULL for
118 : * shared memory
119 : */
120 : MemoryContext rt_context;
121 :
122 : /* Storage for TIDs. Use either one depending on TidStoreIsShared() */
123 : union
124 : {
125 : local_ts_radix_tree *local;
126 : shared_ts_radix_tree *shared;
127 : } tree;
128 :
129 : /* DSA area for TidStore if using shared memory */
130 : dsa_area *area;
131 : };
132 : #define TidStoreIsShared(ts) ((ts)->area != NULL)
133 :
134 : /* Iterator for TidStore */
135 : struct TidStoreIter
136 : {
137 : TidStore *ts;
138 :
139 : /* iterator of radix tree. Use either one depending on TidStoreIsShared() */
140 : union
141 : {
142 : shared_ts_iter *shared;
143 : local_ts_iter *local;
144 : } tree_iter;
145 :
146 : /* output for the caller */
147 : TidStoreIterResult output;
148 : };
149 :
150 : /*
151 : * Create a TidStore. The TidStore will live in the memory context that is
152 : * CurrentMemoryContext at the time of this call. The TID storage, backed
153 : * by a radix tree, will live in its child memory context, rt_context.
154 : *
155 : * "max_bytes" is not an internally-enforced limit; it is used only as a
156 : * hint to cap the memory block size of the memory context for TID storage.
157 : * This reduces space wastage due to over-allocation. If the caller wants to
158 : * monitor memory usage, it must compare its limit with the value reported
159 : * by TidStoreMemoryUsage().
160 : */
161 : TidStore *
162 114804 : TidStoreCreateLocal(size_t max_bytes, bool insert_only)
163 : {
164 : TidStore *ts;
165 114804 : size_t initBlockSize = ALLOCSET_DEFAULT_INITSIZE;
166 114804 : size_t minContextSize = ALLOCSET_DEFAULT_MINSIZE;
167 114804 : size_t maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
168 :
169 114804 : ts = palloc0(sizeof(TidStore));
170 :
171 : /* choose the maxBlockSize to be no larger than 1/16 of max_bytes */
172 229628 : while (16 * maxBlockSize > max_bytes)
173 114824 : maxBlockSize >>= 1;
174 :
175 114804 : if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE)
176 0 : maxBlockSize = ALLOCSET_DEFAULT_INITSIZE;
177 :
178 : /* Create a memory context for the TID storage */
179 114804 : if (insert_only)
180 : {
181 114800 : ts->rt_context = BumpContextCreate(CurrentMemoryContext,
182 : "TID storage",
183 : minContextSize,
184 : initBlockSize,
185 : maxBlockSize);
186 : }
187 : else
188 : {
189 4 : ts->rt_context = AllocSetContextCreate(CurrentMemoryContext,
190 : "TID storage",
191 : minContextSize,
192 : initBlockSize,
193 : maxBlockSize);
194 : }
195 :
196 114804 : ts->tree.local = local_ts_create(ts->rt_context);
197 :
198 114804 : return ts;
199 : }
200 :
201 : /*
202 : * Similar to TidStoreCreateLocal() but create a shared TidStore on a
203 : * DSA area.
204 : *
205 : * The returned object is allocated in backend-local memory.
206 : */
207 : TidStore *
208 36 : TidStoreCreateShared(size_t max_bytes, int tranche_id)
209 : {
210 : TidStore *ts;
211 : dsa_area *area;
212 36 : size_t dsa_init_size = DSA_DEFAULT_INIT_SEGMENT_SIZE;
213 36 : size_t dsa_max_size = DSA_MAX_SEGMENT_SIZE;
214 :
215 36 : ts = palloc0(sizeof(TidStore));
216 :
217 : /*
218 : * Choose the initial and maximum DSA segment sizes to be no longer than
219 : * 1/8 of max_bytes.
220 : */
221 658 : while (8 * dsa_max_size > max_bytes)
222 622 : dsa_max_size >>= 1;
223 :
224 36 : if (dsa_max_size < DSA_MIN_SEGMENT_SIZE)
225 0 : dsa_max_size = DSA_MIN_SEGMENT_SIZE;
226 :
227 36 : if (dsa_init_size > dsa_max_size)
228 2 : dsa_init_size = dsa_max_size;
229 :
230 36 : area = dsa_create_ext(tranche_id, dsa_init_size, dsa_max_size);
231 36 : ts->tree.shared = shared_ts_create(area, tranche_id);
232 36 : ts->area = area;
233 :
234 36 : return ts;
235 : }
236 :
237 : /*
238 : * Attach to the shared TidStore. 'area_handle' is the DSA handle where
239 : * the TidStore is created. 'handle' is the dsa_pointer returned by
240 : * TidStoreGetHandle(). The returned object is allocated in backend-local
241 : * memory using the CurrentMemoryContext.
242 : */
243 : TidStore *
244 36 : TidStoreAttach(dsa_handle area_handle, dsa_pointer handle)
245 : {
246 : TidStore *ts;
247 : dsa_area *area;
248 :
249 : Assert(area_handle != DSA_HANDLE_INVALID);
250 : Assert(DsaPointerIsValid(handle));
251 :
252 : /* create per-backend state */
253 36 : ts = palloc0(sizeof(TidStore));
254 :
255 36 : area = dsa_attach(area_handle);
256 :
257 : /* Find the shared the shared radix tree */
258 36 : ts->tree.shared = shared_ts_attach(area, handle);
259 36 : ts->area = area;
260 :
261 36 : return ts;
262 : }
263 :
264 : /*
265 : * Detach from a TidStore. This also detaches from radix tree and frees
266 : * the backend-local resources.
267 : */
268 : void
269 36 : TidStoreDetach(TidStore *ts)
270 : {
271 : Assert(TidStoreIsShared(ts));
272 :
273 36 : shared_ts_detach(ts->tree.shared);
274 36 : dsa_detach(ts->area);
275 :
276 36 : pfree(ts);
277 36 : }
278 :
279 : /*
280 : * Lock support functions.
281 : *
282 : * We can use the radix tree's lock for shared TidStore as the data we
283 : * need to protect is only the shared radix tree.
284 : */
285 :
286 : void
287 2242 : TidStoreLockExclusive(TidStore *ts)
288 : {
289 2242 : if (TidStoreIsShared(ts))
290 206 : shared_ts_lock_exclusive(ts->tree.shared);
291 2242 : }
292 :
293 : void
294 4585304 : TidStoreLockShare(TidStore *ts)
295 : {
296 4585304 : if (TidStoreIsShared(ts))
297 421684 : shared_ts_lock_share(ts->tree.shared);
298 4585304 : }
299 :
300 : void
301 4587544 : TidStoreUnlock(TidStore *ts)
302 : {
303 4587544 : if (TidStoreIsShared(ts))
304 421890 : shared_ts_unlock(ts->tree.shared);
305 4587544 : }
306 :
307 : /*
308 : * Destroy a TidStore, returning all memory.
309 : *
310 : * Note that the caller must be certain that no other backend will attempt to
311 : * access the TidStore before calling this function. Other backend must
312 : * explicitly call TidStoreDetach() to free up backend-local memory associated
313 : * with the TidStore. The backend that calls TidStoreDestroy() must not call
314 : * TidStoreDetach().
315 : */
316 : void
317 1150 : TidStoreDestroy(TidStore *ts)
318 : {
319 : /* Destroy underlying radix tree */
320 1150 : if (TidStoreIsShared(ts))
321 : {
322 36 : shared_ts_free(ts->tree.shared);
323 36 : dsa_detach(ts->area);
324 : }
325 : else
326 : {
327 1114 : local_ts_free(ts->tree.local);
328 1114 : MemoryContextDelete(ts->rt_context);
329 : }
330 :
331 1150 : pfree(ts);
332 1150 : }
333 :
334 : /*
335 : * Create or replace an entry for the given block and array of offsets.
336 : *
337 : * NB: This function is designed and optimized for vacuum's heap scanning
338 : * phase, so has some limitations:
339 : *
340 : * - The offset numbers "offsets" must be sorted in ascending order.
341 : * - If the block number already exists, the entry will be replaced --
342 : * there is no way to add or remove offsets from an entry.
343 : */
344 : void
345 32424 : TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets,
346 : int num_offsets)
347 : {
348 : union
349 : {
350 : char data[MaxBlocktableEntrySize];
351 : BlocktableEntry force_align_entry;
352 : } data;
353 32424 : BlocktableEntry *page = (BlocktableEntry *) data.data;
354 : bitmapword word;
355 : int wordnum;
356 : int next_word_threshold;
357 32424 : int idx = 0;
358 :
359 : Assert(num_offsets > 0);
360 :
361 : /* Check if the given offset numbers are ordered */
362 1808394 : for (int i = 1; i < num_offsets; i++)
363 : Assert(offsets[i] > offsets[i - 1]);
364 :
365 32424 : memset(page, 0, offsetof(BlocktableEntry, words));
366 :
367 32424 : if (num_offsets <= NUM_FULL_OFFSETS)
368 : {
369 12710 : for (int i = 0; i < num_offsets; i++)
370 : {
371 8024 : OffsetNumber off = offsets[i];
372 :
373 : /* safety check to ensure we don't overrun bit array bounds */
374 8024 : if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
375 2 : elog(ERROR, "tuple offset out of range: %u", off);
376 :
377 8022 : page->header.full_offsets[i] = off;
378 : }
379 :
380 4686 : page->header.nwords = 0;
381 : }
382 : else
383 : {
384 27736 : for (wordnum = 0, next_word_threshold = BITS_PER_BITMAPWORD;
385 92070 : wordnum <= WORDNUM(offsets[num_offsets - 1]);
386 64334 : wordnum++, next_word_threshold += BITS_PER_BITMAPWORD)
387 : {
388 64334 : word = 0;
389 :
390 1864704 : while (idx < num_offsets)
391 : {
392 1836968 : OffsetNumber off = offsets[idx];
393 :
394 : /* safety check to ensure we don't overrun bit array bounds */
395 1836968 : if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
396 0 : elog(ERROR, "tuple offset out of range: %u", off);
397 :
398 1836968 : if (off >= next_word_threshold)
399 36598 : break;
400 :
401 1800370 : word |= ((bitmapword) 1 << BITNUM(off));
402 1800370 : idx++;
403 : }
404 :
405 : /* write out offset bitmap for this wordnum */
406 64334 : page->words[wordnum] = word;
407 : }
408 :
409 27736 : page->header.nwords = wordnum;
410 : Assert(page->header.nwords == WORDS_PER_PAGE(offsets[num_offsets - 1]));
411 : }
412 :
413 32422 : if (TidStoreIsShared(ts))
414 1290 : shared_ts_set(ts->tree.shared, blkno, page);
415 : else
416 31132 : local_ts_set(ts->tree.local, blkno, page);
417 32422 : }
418 :
419 : /* Return true if the given TID is present in the TidStore */
420 : bool
421 10893810 : TidStoreIsMember(TidStore *ts, ItemPointer tid)
422 : {
423 : int wordnum;
424 : int bitnum;
425 : BlocktableEntry *page;
426 10893810 : BlockNumber blk = ItemPointerGetBlockNumber(tid);
427 10893810 : OffsetNumber off = ItemPointerGetOffsetNumber(tid);
428 :
429 10893810 : if (TidStoreIsShared(ts))
430 826936 : page = shared_ts_find(ts->tree.shared, blk);
431 : else
432 10066874 : page = local_ts_find(ts->tree.local, blk);
433 :
434 : /* no entry for the blk */
435 10893810 : if (page == NULL)
436 2173188 : return false;
437 :
438 8720622 : if (page->header.nwords == 0)
439 : {
440 : /* we have offsets in the header */
441 1397298 : for (int i = 0; i < NUM_FULL_OFFSETS; i++)
442 : {
443 1053606 : if (page->header.full_offsets[i] == off)
444 14786 : return true;
445 : }
446 343692 : return false;
447 : }
448 : else
449 : {
450 8362144 : wordnum = WORDNUM(off);
451 8362144 : bitnum = BITNUM(off);
452 :
453 : /* no bitmap for the off */
454 8362144 : if (wordnum >= page->header.nwords)
455 3937974 : return false;
456 :
457 4424170 : return (page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0;
458 : }
459 : }
460 :
461 : /*
462 : * Prepare to iterate through a TidStore.
463 : *
464 : * The TidStoreIter struct is created in the caller's memory context, and it
465 : * will be freed in TidStoreEndIterate.
466 : *
467 : * The caller is responsible for locking TidStore until the iteration is
468 : * finished.
469 : */
470 : TidStoreIter *
471 1114 : TidStoreBeginIterate(TidStore *ts)
472 : {
473 : TidStoreIter *iter;
474 :
475 1114 : iter = palloc0(sizeof(TidStoreIter));
476 1114 : iter->ts = ts;
477 :
478 1114 : if (TidStoreIsShared(ts))
479 14 : iter->tree_iter.shared = shared_ts_begin_iterate(ts->tree.shared);
480 : else
481 1100 : iter->tree_iter.local = local_ts_begin_iterate(ts->tree.local);
482 :
483 1114 : return iter;
484 : }
485 :
486 :
487 : /*
488 : * Return a result that contains the next block number and that can be used to
489 : * obtain the set of offsets by calling TidStoreGetBlockOffsets(). The result
490 : * is copyable.
491 : */
492 : TidStoreIterResult *
493 28892 : TidStoreIterateNext(TidStoreIter *iter)
494 : {
495 : uint64 key;
496 : BlocktableEntry *page;
497 :
498 28892 : if (TidStoreIsShared(iter->ts))
499 1304 : page = shared_ts_iterate_next(iter->tree_iter.shared, &key);
500 : else
501 27588 : page = local_ts_iterate_next(iter->tree_iter.local, &key);
502 :
503 28892 : if (page == NULL)
504 1114 : return NULL;
505 :
506 27778 : iter->output.blkno = key;
507 27778 : iter->output.internal_page = page;
508 :
509 27778 : return &(iter->output);
510 : }
511 :
512 : /*
513 : * Finish the iteration on TidStore.
514 : *
515 : * The caller is responsible for releasing any locks.
516 : */
517 : void
518 1114 : TidStoreEndIterate(TidStoreIter *iter)
519 : {
520 1114 : if (TidStoreIsShared(iter->ts))
521 14 : shared_ts_end_iterate(iter->tree_iter.shared);
522 : else
523 1100 : local_ts_end_iterate(iter->tree_iter.local);
524 :
525 1114 : pfree(iter);
526 1114 : }
527 :
528 : /*
529 : * Return the memory usage of TidStore.
530 : */
531 : size_t
532 801744 : TidStoreMemoryUsage(TidStore *ts)
533 : {
534 801744 : if (TidStoreIsShared(ts))
535 2392 : return shared_ts_memory_usage(ts->tree.shared);
536 : else
537 799352 : return local_ts_memory_usage(ts->tree.local);
538 : }
539 :
540 : /*
541 : * Return the DSA area where the TidStore lives.
542 : */
543 : dsa_area *
544 36 : TidStoreGetDSA(TidStore *ts)
545 : {
546 : Assert(TidStoreIsShared(ts));
547 :
548 36 : return ts->area;
549 : }
550 :
551 : dsa_pointer
552 34 : TidStoreGetHandle(TidStore *ts)
553 : {
554 : Assert(TidStoreIsShared(ts));
555 :
556 34 : return (dsa_pointer) shared_ts_get_handle(ts->tree.shared);
557 : }
558 :
559 : /*
560 : * Given a TidStoreIterResult returned by TidStoreIterateNext(), extract the
561 : * offset numbers. Returns the number of offsets filled in, if <=
562 : * max_offsets. Otherwise, fills in as much as it can in the given space, and
563 : * returns the size of the buffer that would be needed.
564 : */
565 : int
566 27778 : TidStoreGetBlockOffsets(TidStoreIterResult *result,
567 : OffsetNumber *offsets,
568 : int max_offsets)
569 : {
570 27778 : BlocktableEntry *page = result->internal_page;
571 27778 : int num_offsets = 0;
572 : int wordnum;
573 :
574 27778 : if (page->header.nwords == 0)
575 : {
576 : /* we have offsets in the header */
577 18632 : for (int i = 0; i < NUM_FULL_OFFSETS; i++)
578 : {
579 13974 : if (page->header.full_offsets[i] != InvalidOffsetNumber)
580 : {
581 7974 : if (num_offsets < max_offsets)
582 7974 : offsets[num_offsets] = page->header.full_offsets[i];
583 7974 : num_offsets++;
584 : }
585 : }
586 : }
587 : else
588 : {
589 82838 : for (wordnum = 0; wordnum < page->header.nwords; wordnum++)
590 : {
591 59718 : bitmapword w = page->words[wordnum];
592 59718 : int off = wordnum * BITS_PER_BITMAPWORD;
593 :
594 2642274 : while (w != 0)
595 : {
596 2582556 : if (w & 1)
597 : {
598 1686528 : if (num_offsets < max_offsets)
599 1686528 : offsets[num_offsets] = (OffsetNumber) off;
600 1686528 : num_offsets++;
601 : }
602 2582556 : off++;
603 2582556 : w >>= 1;
604 : }
605 : }
606 : }
607 :
608 27778 : return num_offsets;
609 : }
|