Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tidstore.c
4 : * TID (ItemPointerData) storage implementation.
5 : *
6 : * TidStore is a in-memory data structure to store TIDs (ItemPointerData).
7 : * Internally it uses a radix tree as the storage for TIDs. The key is the
8 : * BlockNumber and the value is a bitmap of offsets, BlocktableEntry.
9 : *
10 : * TidStore can be shared among parallel worker processes by using
11 : * TidStoreCreateShared(). Other backends can attach to the shared TidStore
12 : * by TidStoreAttach().
13 : *
14 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
15 : * Portions Copyright (c) 1994, Regents of the University of California
16 : *
17 : * IDENTIFICATION
18 : * src/backend/access/common/tidstore.c
19 : *
20 : *-------------------------------------------------------------------------
21 : */
22 : #include "postgres.h"
23 :
24 : #include "access/tidstore.h"
25 : #include "miscadmin.h"
26 : #include "nodes/bitmapset.h"
27 : #include "storage/lwlock.h"
28 : #include "utils/dsa.h"
29 :
30 :
31 : #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
32 : #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
33 :
34 : /* number of active words for a page: */
35 : #define WORDS_PER_PAGE(n) ((n) / BITS_PER_BITMAPWORD + 1)
36 :
37 : /* number of offsets we can store in the header of a BlocktableEntry */
38 : #define NUM_FULL_OFFSETS ((sizeof(uintptr_t) - sizeof(uint8) - sizeof(int8)) / sizeof(OffsetNumber))
39 :
40 : /*
41 : * This is named similarly to PagetableEntry in tidbitmap.c
42 : * because the two have a similar function.
43 : */
44 : typedef struct BlocktableEntry
45 : {
46 : struct
47 : {
48 : #ifndef WORDS_BIGENDIAN
49 : /*
50 : * We need to position this member to reserve space for the backing
51 : * radix tree to tag the lowest bit when struct 'header' is stored
52 : * inside a pointer or DSA pointer.
53 : */
54 : uint8 flags;
55 :
56 : int8 nwords;
57 : #endif
58 :
59 : /*
60 : * We can store a small number of offsets here to avoid wasting space
61 : * with a sparse bitmap.
62 : */
63 : OffsetNumber full_offsets[NUM_FULL_OFFSETS];
64 :
65 : #ifdef WORDS_BIGENDIAN
66 : int8 nwords;
67 : uint8 flags;
68 : #endif
69 : } header;
70 :
71 : /*
72 : * We don't expect any padding space here, but to be cautious, code
73 : * creating new entries should zero out space up to 'words'.
74 : */
75 :
76 : bitmapword words[FLEXIBLE_ARRAY_MEMBER];
77 : } BlocktableEntry;
78 :
79 : /*
80 : * The type of 'nwords' limits the max number of words in the 'words' array.
81 : * This computes the max offset we can actually store in the bitmap. In
82 : * practice, it's almost always the same as MaxOffsetNumber.
83 : */
84 : #define MAX_OFFSET_IN_BITMAP Min(BITS_PER_BITMAPWORD * PG_INT8_MAX - 1, MaxOffsetNumber)
85 :
86 : #define MaxBlocktableEntrySize \
87 : offsetof(BlocktableEntry, words) + \
88 : (sizeof(bitmapword) * WORDS_PER_PAGE(MAX_OFFSET_IN_BITMAP))
89 :
90 : #define RT_PREFIX local_ts
91 : #define RT_SCOPE static
92 : #define RT_DECLARE
93 : #define RT_DEFINE
94 : #define RT_VALUE_TYPE BlocktableEntry
95 : #define RT_VARLEN_VALUE_SIZE(page) \
96 : (offsetof(BlocktableEntry, words) + \
97 : sizeof(bitmapword) * (page)->header.nwords)
98 : #define RT_RUNTIME_EMBEDDABLE_VALUE
99 : #include "lib/radixtree.h"
100 :
101 : #define RT_PREFIX shared_ts
102 : #define RT_SHMEM
103 : #define RT_SCOPE static
104 : #define RT_DECLARE
105 : #define RT_DEFINE
106 : #define RT_VALUE_TYPE BlocktableEntry
107 : #define RT_VARLEN_VALUE_SIZE(page) \
108 : (offsetof(BlocktableEntry, words) + \
109 : sizeof(bitmapword) * (page)->header.nwords)
110 : #define RT_RUNTIME_EMBEDDABLE_VALUE
111 : #include "lib/radixtree.h"
112 :
113 : /* Per-backend state for a TidStore */
114 : struct TidStore
115 : {
116 : /* MemoryContext where the TidStore is allocated */
117 : MemoryContext context;
118 :
119 : /* MemoryContext that the radix tree uses */
120 : MemoryContext rt_context;
121 :
122 : /* Storage for TIDs. Use either one depending on TidStoreIsShared() */
123 : union
124 : {
125 : local_ts_radix_tree *local;
126 : shared_ts_radix_tree *shared;
127 : } tree;
128 :
129 : /* DSA area for TidStore if using shared memory */
130 : dsa_area *area;
131 : };
132 : #define TidStoreIsShared(ts) ((ts)->area != NULL)
133 :
134 : /* Iterator for TidStore */
135 : struct TidStoreIter
136 : {
137 : TidStore *ts;
138 :
139 : /* iterator of radix tree. Use either one depending on TidStoreIsShared() */
140 : union
141 : {
142 : shared_ts_iter *shared;
143 : local_ts_iter *local;
144 : } tree_iter;
145 :
146 : /* output for the caller */
147 : TidStoreIterResult output;
148 : };
149 :
150 : static void tidstore_iter_extract_tids(TidStoreIter *iter, BlockNumber blkno,
151 : BlocktableEntry *page);
152 :
153 : /*
154 : * Create a TidStore. The TidStore will live in the memory context that is
155 : * CurrentMemoryContext at the time of this call. The TID storage, backed
156 : * by a radix tree, will live in its child memory context, rt_context.
157 : *
158 : * "max_bytes" is not an internally-enforced limit; it is used only as a
159 : * hint to cap the memory block size of the memory context for TID storage.
160 : * This reduces space wastage due to over-allocation. If the caller wants to
161 : * monitor memory usage, it must compare its limit with the value reported
162 : * by TidStoreMemoryUsage().
163 : */
164 : TidStore *
165 19950 : TidStoreCreateLocal(size_t max_bytes, bool insert_only)
166 : {
167 : TidStore *ts;
168 19950 : size_t initBlockSize = ALLOCSET_DEFAULT_INITSIZE;
169 19950 : size_t minContextSize = ALLOCSET_DEFAULT_MINSIZE;
170 19950 : size_t maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
171 :
172 19950 : ts = palloc0(sizeof(TidStore));
173 19950 : ts->context = CurrentMemoryContext;
174 :
175 : /* choose the maxBlockSize to be no larger than 1/16 of max_bytes */
176 39920 : while (16 * maxBlockSize > max_bytes)
177 19970 : maxBlockSize >>= 1;
178 :
179 19950 : if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE)
180 0 : maxBlockSize = ALLOCSET_DEFAULT_INITSIZE;
181 :
182 : /* Create a memory context for the TID storage */
183 19950 : if (insert_only)
184 : {
185 19946 : ts->rt_context = BumpContextCreate(CurrentMemoryContext,
186 : "TID storage",
187 : minContextSize,
188 : initBlockSize,
189 : maxBlockSize);
190 : }
191 : else
192 : {
193 4 : ts->rt_context = AllocSetContextCreate(CurrentMemoryContext,
194 : "TID storage",
195 : minContextSize,
196 : initBlockSize,
197 : maxBlockSize);
198 : }
199 :
200 19950 : ts->tree.local = local_ts_create(ts->rt_context);
201 :
202 19950 : return ts;
203 : }
204 :
205 : /*
206 : * Similar to TidStoreCreateLocal() but create a shared TidStore on a
207 : * DSA area. The TID storage will live in the DSA area, and the memory
208 : * context rt_context will have only meta data of the radix tree.
209 : *
210 : * The returned object is allocated in backend-local memory.
211 : */
212 : TidStore *
213 26 : TidStoreCreateShared(size_t max_bytes, int tranche_id)
214 : {
215 : TidStore *ts;
216 : dsa_area *area;
217 26 : size_t dsa_init_size = DSA_DEFAULT_INIT_SEGMENT_SIZE;
218 26 : size_t dsa_max_size = DSA_MAX_SEGMENT_SIZE;
219 :
220 26 : ts = palloc0(sizeof(TidStore));
221 26 : ts->context = CurrentMemoryContext;
222 :
223 26 : ts->rt_context = AllocSetContextCreate(CurrentMemoryContext,
224 : "TID storage meta data",
225 : ALLOCSET_SMALL_SIZES);
226 :
227 : /*
228 : * Choose the initial and maximum DSA segment sizes to be no longer than
229 : * 1/8 of max_bytes.
230 : */
231 478 : while (8 * dsa_max_size > max_bytes)
232 452 : dsa_max_size >>= 1;
233 :
234 26 : if (dsa_max_size < DSA_MIN_SEGMENT_SIZE)
235 0 : dsa_max_size = DSA_MIN_SEGMENT_SIZE;
236 :
237 26 : if (dsa_init_size > dsa_max_size)
238 2 : dsa_init_size = dsa_max_size;
239 :
240 26 : area = dsa_create_ext(tranche_id, dsa_init_size, dsa_max_size);
241 26 : ts->tree.shared = shared_ts_create(ts->rt_context, area,
242 : tranche_id);
243 26 : ts->area = area;
244 :
245 26 : return ts;
246 : }
247 :
248 : /*
249 : * Attach to the shared TidStore. 'area_handle' is the DSA handle where
250 : * the TidStore is created. 'handle' is the dsa_pointer returned by
251 : * TidStoreGetHandle(). The returned object is allocated in backend-local
252 : * memory using the CurrentMemoryContext.
253 : */
254 : TidStore *
255 30 : TidStoreAttach(dsa_handle area_handle, dsa_pointer handle)
256 : {
257 : TidStore *ts;
258 : dsa_area *area;
259 :
260 : Assert(area_handle != DSA_HANDLE_INVALID);
261 : Assert(DsaPointerIsValid(handle));
262 :
263 : /* create per-backend state */
264 30 : ts = palloc0(sizeof(TidStore));
265 :
266 30 : area = dsa_attach(area_handle);
267 :
268 : /* Find the shared the shared radix tree */
269 30 : ts->tree.shared = shared_ts_attach(area, handle);
270 30 : ts->area = area;
271 :
272 30 : return ts;
273 : }
274 :
275 : /*
276 : * Detach from a TidStore. This also detaches from radix tree and frees
277 : * the backend-local resources.
278 : */
279 : void
280 30 : TidStoreDetach(TidStore *ts)
281 : {
282 : Assert(TidStoreIsShared(ts));
283 :
284 30 : shared_ts_detach(ts->tree.shared);
285 30 : dsa_detach(ts->area);
286 :
287 30 : pfree(ts);
288 30 : }
289 :
290 : /*
291 : * Lock support functions.
292 : *
293 : * We can use the radix tree's lock for shared TidStore as the data we
294 : * need to protect is only the shared radix tree.
295 : */
296 :
297 : void
298 2242 : TidStoreLockExclusive(TidStore *ts)
299 : {
300 2242 : if (TidStoreIsShared(ts))
301 206 : shared_ts_lock_exclusive(ts->tree.shared);
302 2242 : }
303 :
304 : void
305 4585304 : TidStoreLockShare(TidStore *ts)
306 : {
307 4585304 : if (TidStoreIsShared(ts))
308 421684 : shared_ts_lock_share(ts->tree.shared);
309 4585304 : }
310 :
311 : void
312 4587544 : TidStoreUnlock(TidStore *ts)
313 : {
314 4587544 : if (TidStoreIsShared(ts))
315 421890 : shared_ts_unlock(ts->tree.shared);
316 4587544 : }
317 :
318 : /*
319 : * Destroy a TidStore, returning all memory.
320 : *
321 : * Note that the caller must be certain that no other backend will attempt to
322 : * access the TidStore before calling this function. Other backend must
323 : * explicitly call TidStoreDetach() to free up backend-local memory associated
324 : * with the TidStore. The backend that calls TidStoreDestroy() must not call
325 : * TidStoreDetach().
326 : */
327 : void
328 894 : TidStoreDestroy(TidStore *ts)
329 : {
330 : /* Destroy underlying radix tree */
331 894 : if (TidStoreIsShared(ts))
332 : {
333 26 : shared_ts_free(ts->tree.shared);
334 :
335 26 : dsa_detach(ts->area);
336 : }
337 : else
338 868 : local_ts_free(ts->tree.local);
339 :
340 894 : MemoryContextDelete(ts->rt_context);
341 :
342 894 : pfree(ts);
343 894 : }
344 :
345 : /*
346 : * Create or replace an entry for the given block and array of offsets.
347 : *
348 : * NB: This function is designed and optimized for vacuum's heap scanning
349 : * phase, so has some limitations:
350 : *
351 : * - The offset numbers "offsets" must be sorted in ascending order.
352 : * - If the block number already exists, the entry will be replaced --
353 : * there is no way to add or remove offsets from an entry.
354 : */
355 : void
356 22530 : TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets,
357 : int num_offsets)
358 : {
359 : union
360 : {
361 : char data[MaxBlocktableEntrySize];
362 : BlocktableEntry force_align_entry;
363 : } data;
364 22530 : BlocktableEntry *page = (BlocktableEntry *) data.data;
365 : bitmapword word;
366 : int wordnum;
367 : int next_word_threshold;
368 22530 : int idx = 0;
369 :
370 : Assert(num_offsets > 0);
371 :
372 : /* Check if the given offset numbers are ordered */
373 1355950 : for (int i = 1; i < num_offsets; i++)
374 : Assert(offsets[i] > offsets[i - 1]);
375 :
376 22530 : memset(page, 0, offsetof(BlocktableEntry, words));
377 :
378 22530 : if (num_offsets <= NUM_FULL_OFFSETS)
379 : {
380 9196 : for (int i = 0; i < num_offsets; i++)
381 : {
382 5784 : OffsetNumber off = offsets[i];
383 :
384 : /* safety check to ensure we don't overrun bit array bounds */
385 5784 : if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
386 2 : elog(ERROR, "tuple offset out of range: %u", off);
387 :
388 5782 : page->header.full_offsets[i] = off;
389 : }
390 :
391 3412 : page->header.nwords = 0;
392 : }
393 : else
394 : {
395 19116 : for (wordnum = 0, next_word_threshold = BITS_PER_BITMAPWORD;
396 70372 : wordnum <= WORDNUM(offsets[num_offsets - 1]);
397 51256 : wordnum++, next_word_threshold += BITS_PER_BITMAPWORD)
398 : {
399 51256 : word = 0;
400 :
401 1401422 : while (idx < num_offsets)
402 : {
403 1382306 : OffsetNumber off = offsets[idx];
404 :
405 : /* safety check to ensure we don't overrun bit array bounds */
406 1382306 : if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
407 0 : elog(ERROR, "tuple offset out of range: %u", off);
408 :
409 1382306 : if (off >= next_word_threshold)
410 32140 : break;
411 :
412 1350166 : word |= ((bitmapword) 1 << BITNUM(off));
413 1350166 : idx++;
414 : }
415 :
416 : /* write out offset bitmap for this wordnum */
417 51256 : page->words[wordnum] = word;
418 : }
419 :
420 19116 : page->header.nwords = wordnum;
421 : Assert(page->header.nwords == WORDS_PER_PAGE(offsets[num_offsets - 1]));
422 : }
423 :
424 22528 : if (TidStoreIsShared(ts))
425 476 : shared_ts_set(ts->tree.shared, blkno, page);
426 : else
427 22052 : local_ts_set(ts->tree.local, blkno, page);
428 22528 : }
429 :
430 : /* Return true if the given TID is present in the TidStore */
431 : bool
432 9479774 : TidStoreIsMember(TidStore *ts, ItemPointer tid)
433 : {
434 : int wordnum;
435 : int bitnum;
436 : BlocktableEntry *page;
437 9479774 : BlockNumber blk = ItemPointerGetBlockNumber(tid);
438 9479774 : OffsetNumber off = ItemPointerGetOffsetNumber(tid);
439 :
440 9479774 : if (TidStoreIsShared(ts))
441 620108 : page = shared_ts_find(ts->tree.shared, blk);
442 : else
443 8859666 : page = local_ts_find(ts->tree.local, blk);
444 :
445 : /* no entry for the blk */
446 9479774 : if (page == NULL)
447 1477790 : return false;
448 :
449 8001984 : if (page->header.nwords == 0)
450 : {
451 : /* we have offsets in the header */
452 1060884 : for (int i = 0; i < NUM_FULL_OFFSETS; i++)
453 : {
454 799708 : if (page->header.full_offsets[i] == off)
455 10836 : return true;
456 : }
457 261176 : return false;
458 : }
459 : else
460 : {
461 7729972 : wordnum = WORDNUM(off);
462 7729972 : bitnum = BITNUM(off);
463 :
464 : /* no bitmap for the off */
465 7729972 : if (wordnum >= page->header.nwords)
466 3932464 : return false;
467 :
468 3797508 : return (page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0;
469 : }
470 : }
471 :
472 : /*
473 : * Prepare to iterate through a TidStore.
474 : *
475 : * The TidStoreIter struct is created in the caller's memory context, and it
476 : * will be freed in TidStoreEndIterate.
477 : *
478 : * The caller is responsible for locking TidStore until the iteration is
479 : * finished.
480 : */
481 : TidStoreIter *
482 880 : TidStoreBeginIterate(TidStore *ts)
483 : {
484 : TidStoreIter *iter;
485 :
486 880 : iter = palloc0(sizeof(TidStoreIter));
487 880 : iter->ts = ts;
488 :
489 : /*
490 : * We start with an array large enough to contain at least the offsets
491 : * from one completely full bitmap element.
492 : */
493 880 : iter->output.max_offset = 2 * BITS_PER_BITMAPWORD;
494 880 : iter->output.offsets = palloc(sizeof(OffsetNumber) * iter->output.max_offset);
495 :
496 880 : if (TidStoreIsShared(ts))
497 8 : iter->tree_iter.shared = shared_ts_begin_iterate(ts->tree.shared);
498 : else
499 872 : iter->tree_iter.local = local_ts_begin_iterate(ts->tree.local);
500 :
501 880 : return iter;
502 : }
503 :
504 :
505 : /*
506 : * Scan the TidStore and return the TIDs of the next block. The offsets in
507 : * each iteration result are ordered, as are the block numbers over all
508 : * iterations.
509 : */
510 : TidStoreIterResult *
511 23308 : TidStoreIterateNext(TidStoreIter *iter)
512 : {
513 : uint64 key;
514 : BlocktableEntry *page;
515 :
516 23308 : if (TidStoreIsShared(iter->ts))
517 484 : page = shared_ts_iterate_next(iter->tree_iter.shared, &key);
518 : else
519 22824 : page = local_ts_iterate_next(iter->tree_iter.local, &key);
520 :
521 23308 : if (page == NULL)
522 880 : return NULL;
523 :
524 : /* Collect TIDs from the key-value pair */
525 22428 : tidstore_iter_extract_tids(iter, (BlockNumber) key, page);
526 :
527 22428 : return &(iter->output);
528 : }
529 :
530 : /*
531 : * Finish the iteration on TidStore.
532 : *
533 : * The caller is responsible for releasing any locks.
534 : */
535 : void
536 880 : TidStoreEndIterate(TidStoreIter *iter)
537 : {
538 880 : if (TidStoreIsShared(iter->ts))
539 8 : shared_ts_end_iterate(iter->tree_iter.shared);
540 : else
541 872 : local_ts_end_iterate(iter->tree_iter.local);
542 :
543 880 : pfree(iter->output.offsets);
544 880 : pfree(iter);
545 880 : }
546 :
547 : /*
548 : * Return the memory usage of TidStore.
549 : */
550 : size_t
551 111924 : TidStoreMemoryUsage(TidStore *ts)
552 : {
553 111924 : if (TidStoreIsShared(ts))
554 734 : return shared_ts_memory_usage(ts->tree.shared);
555 : else
556 111190 : return local_ts_memory_usage(ts->tree.local);
557 : }
558 :
559 : /*
560 : * Return the DSA area where the TidStore lives.
561 : */
562 : dsa_area *
563 26 : TidStoreGetDSA(TidStore *ts)
564 : {
565 : Assert(TidStoreIsShared(ts));
566 :
567 26 : return ts->area;
568 : }
569 :
570 : dsa_pointer
571 24 : TidStoreGetHandle(TidStore *ts)
572 : {
573 : Assert(TidStoreIsShared(ts));
574 :
575 24 : return (dsa_pointer) shared_ts_get_handle(ts->tree.shared);
576 : }
577 :
578 : /* Extract TIDs from the given key-value pair */
579 : static void
580 22428 : tidstore_iter_extract_tids(TidStoreIter *iter, BlockNumber blkno,
581 : BlocktableEntry *page)
582 : {
583 22428 : TidStoreIterResult *result = (&iter->output);
584 : int wordnum;
585 :
586 22428 : result->num_offsets = 0;
587 22428 : result->blkno = blkno;
588 :
589 22428 : if (page->header.nwords == 0)
590 : {
591 : /* we have offsets in the header */
592 13616 : for (int i = 0; i < NUM_FULL_OFFSETS; i++)
593 : {
594 10212 : if (page->header.full_offsets[i] != InvalidOffsetNumber)
595 5766 : result->offsets[result->num_offsets++] = page->header.full_offsets[i];
596 : }
597 : }
598 : else
599 : {
600 70168 : for (wordnum = 0; wordnum < page->header.nwords; wordnum++)
601 : {
602 51144 : bitmapword w = page->words[wordnum];
603 51144 : int off = wordnum * BITS_PER_BITMAPWORD;
604 :
605 : /* Make sure there is enough space to add offsets */
606 51144 : if ((result->num_offsets + BITS_PER_BITMAPWORD) > result->max_offset)
607 : {
608 156 : result->max_offset *= 2;
609 156 : result->offsets = repalloc(result->offsets,
610 156 : sizeof(OffsetNumber) * result->max_offset);
611 : }
612 :
613 2242294 : while (w != 0)
614 : {
615 2191150 : if (w & 1)
616 1347688 : result->offsets[result->num_offsets++] = (OffsetNumber) off;
617 2191150 : off++;
618 2191150 : w >>= 1;
619 : }
620 : }
621 : }
622 22428 : }
|