Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tidstore.c
4 : * TID (ItemPointerData) storage implementation.
5 : *
6 : * TidStore is a in-memory data structure to store TIDs (ItemPointerData).
7 : * Internally it uses a radix tree as the storage for TIDs. The key is the
8 : * BlockNumber and the value is a bitmap of offsets, BlocktableEntry.
9 : *
10 : * TidStore can be shared among parallel worker processes by using
11 : * TidStoreCreateShared(). Other backends can attach to the shared TidStore
12 : * by TidStoreAttach().
13 : *
14 : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
15 : * Portions Copyright (c) 1994, Regents of the University of California
16 : *
17 : * IDENTIFICATION
18 : * src/backend/access/common/tidstore.c
19 : *
20 : *-------------------------------------------------------------------------
21 : */
22 : #include "postgres.h"
23 :
24 : #include "access/tidstore.h"
25 : #include "miscadmin.h"
26 : #include "nodes/bitmapset.h"
27 : #include "storage/lwlock.h"
28 : #include "utils/dsa.h"
29 :
30 :
31 : #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
32 : #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
33 :
34 : /* number of active words for a page: */
35 : #define WORDS_PER_PAGE(n) ((n) / BITS_PER_BITMAPWORD + 1)
36 :
37 : /* number of offsets we can store in the header of a BlocktableEntry */
38 : #define NUM_FULL_OFFSETS ((sizeof(uintptr_t) - sizeof(uint8) - sizeof(int8)) / sizeof(OffsetNumber))
39 :
40 : /*
41 : * This is named similarly to PagetableEntry in tidbitmap.c
42 : * because the two have a similar function.
43 : */
44 : typedef struct BlocktableEntry
45 : {
46 : struct
47 : {
48 : #ifndef WORDS_BIGENDIAN
49 : /*
50 : * We need to position this member to reserve space for the backing
51 : * radix tree to tag the lowest bit when struct 'header' is stored
52 : * inside a pointer or DSA pointer.
53 : */
54 : uint8 flags;
55 :
56 : int8 nwords;
57 : #endif
58 :
59 : /*
60 : * We can store a small number of offsets here to avoid wasting space
61 : * with a sparse bitmap.
62 : */
63 : OffsetNumber full_offsets[NUM_FULL_OFFSETS];
64 :
65 : #ifdef WORDS_BIGENDIAN
66 : int8 nwords;
67 : uint8 flags;
68 : #endif
69 : } header;
70 :
71 : /*
72 : * We don't expect any padding space here, but to be cautious, code
73 : * creating new entries should zero out space up to 'words'.
74 : */
75 :
76 : bitmapword words[FLEXIBLE_ARRAY_MEMBER];
77 : } BlocktableEntry;
78 :
79 : /*
80 : * The type of 'nwords' limits the max number of words in the 'words' array.
81 : * This computes the max offset we can actually store in the bitmap. In
82 : * practice, it's almost always the same as MaxOffsetNumber.
83 : */
84 : #define MAX_OFFSET_IN_BITMAP Min(BITS_PER_BITMAPWORD * PG_INT8_MAX - 1, MaxOffsetNumber)
85 :
86 : #define MaxBlocktableEntrySize \
87 : offsetof(BlocktableEntry, words) + \
88 : (sizeof(bitmapword) * WORDS_PER_PAGE(MAX_OFFSET_IN_BITMAP))
89 :
90 : #define RT_PREFIX local_ts
91 : #define RT_SCOPE static
92 : #define RT_DECLARE
93 : #define RT_DEFINE
94 : #define RT_VALUE_TYPE BlocktableEntry
95 : #define RT_VARLEN_VALUE_SIZE(page) \
96 : (offsetof(BlocktableEntry, words) + \
97 : sizeof(bitmapword) * (page)->header.nwords)
98 : #define RT_RUNTIME_EMBEDDABLE_VALUE
99 : #include "lib/radixtree.h"
100 :
101 : #define RT_PREFIX shared_ts
102 : #define RT_SHMEM
103 : #define RT_SCOPE static
104 : #define RT_DECLARE
105 : #define RT_DEFINE
106 : #define RT_VALUE_TYPE BlocktableEntry
107 : #define RT_VARLEN_VALUE_SIZE(page) \
108 : (offsetof(BlocktableEntry, words) + \
109 : sizeof(bitmapword) * (page)->header.nwords)
110 : #define RT_RUNTIME_EMBEDDABLE_VALUE
111 : #include "lib/radixtree.h"
112 :
113 : /* Per-backend state for a TidStore */
114 : struct TidStore
115 : {
116 : /* MemoryContext where the TidStore is allocated */
117 : MemoryContext context;
118 :
119 : /* MemoryContext that the radix tree uses */
120 : MemoryContext rt_context;
121 :
122 : /* Storage for TIDs. Use either one depending on TidStoreIsShared() */
123 : union
124 : {
125 : local_ts_radix_tree *local;
126 : shared_ts_radix_tree *shared;
127 : } tree;
128 :
129 : /* DSA area for TidStore if using shared memory */
130 : dsa_area *area;
131 : };
132 : #define TidStoreIsShared(ts) ((ts)->area != NULL)
133 :
134 : /* Iterator for TidStore */
135 : struct TidStoreIter
136 : {
137 : TidStore *ts;
138 :
139 : /* iterator of radix tree. Use either one depending on TidStoreIsShared() */
140 : union
141 : {
142 : shared_ts_iter *shared;
143 : local_ts_iter *local;
144 : } tree_iter;
145 :
146 : /* output for the caller */
147 : TidStoreIterResult output;
148 : };
149 :
150 : /*
151 : * Create a TidStore. The TidStore will live in the memory context that is
152 : * CurrentMemoryContext at the time of this call. The TID storage, backed
153 : * by a radix tree, will live in its child memory context, rt_context.
154 : *
155 : * "max_bytes" is not an internally-enforced limit; it is used only as a
156 : * hint to cap the memory block size of the memory context for TID storage.
157 : * This reduces space wastage due to over-allocation. If the caller wants to
158 : * monitor memory usage, it must compare its limit with the value reported
159 : * by TidStoreMemoryUsage().
160 : */
161 : TidStore *
162 98110 : TidStoreCreateLocal(size_t max_bytes, bool insert_only)
163 : {
164 : TidStore *ts;
165 98110 : size_t initBlockSize = ALLOCSET_DEFAULT_INITSIZE;
166 98110 : size_t minContextSize = ALLOCSET_DEFAULT_MINSIZE;
167 98110 : size_t maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
168 :
169 98110 : ts = palloc0(sizeof(TidStore));
170 98110 : ts->context = CurrentMemoryContext;
171 :
172 : /* choose the maxBlockSize to be no larger than 1/16 of max_bytes */
173 196240 : while (16 * maxBlockSize > max_bytes)
174 98130 : maxBlockSize >>= 1;
175 :
176 98110 : if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE)
177 0 : maxBlockSize = ALLOCSET_DEFAULT_INITSIZE;
178 :
179 : /* Create a memory context for the TID storage */
180 98110 : if (insert_only)
181 : {
182 98106 : ts->rt_context = BumpContextCreate(CurrentMemoryContext,
183 : "TID storage",
184 : minContextSize,
185 : initBlockSize,
186 : maxBlockSize);
187 : }
188 : else
189 : {
190 4 : ts->rt_context = AllocSetContextCreate(CurrentMemoryContext,
191 : "TID storage",
192 : minContextSize,
193 : initBlockSize,
194 : maxBlockSize);
195 : }
196 :
197 98110 : ts->tree.local = local_ts_create(ts->rt_context);
198 :
199 98110 : return ts;
200 : }
201 :
202 : /*
203 : * Similar to TidStoreCreateLocal() but create a shared TidStore on a
204 : * DSA area. The TID storage will live in the DSA area, and the memory
205 : * context rt_context will have only meta data of the radix tree.
206 : *
207 : * The returned object is allocated in backend-local memory.
208 : */
209 : TidStore *
210 38 : TidStoreCreateShared(size_t max_bytes, int tranche_id)
211 : {
212 : TidStore *ts;
213 : dsa_area *area;
214 38 : size_t dsa_init_size = DSA_DEFAULT_INIT_SEGMENT_SIZE;
215 38 : size_t dsa_max_size = DSA_MAX_SEGMENT_SIZE;
216 :
217 38 : ts = palloc0(sizeof(TidStore));
218 38 : ts->context = CurrentMemoryContext;
219 :
220 38 : ts->rt_context = AllocSetContextCreate(CurrentMemoryContext,
221 : "TID storage meta data",
222 : ALLOCSET_SMALL_SIZES);
223 :
224 : /*
225 : * Choose the initial and maximum DSA segment sizes to be no longer than
226 : * 1/8 of max_bytes.
227 : */
228 694 : while (8 * dsa_max_size > max_bytes)
229 656 : dsa_max_size >>= 1;
230 :
231 38 : if (dsa_max_size < DSA_MIN_SEGMENT_SIZE)
232 0 : dsa_max_size = DSA_MIN_SEGMENT_SIZE;
233 :
234 38 : if (dsa_init_size > dsa_max_size)
235 2 : dsa_init_size = dsa_max_size;
236 :
237 38 : area = dsa_create_ext(tranche_id, dsa_init_size, dsa_max_size);
238 38 : ts->tree.shared = shared_ts_create(ts->rt_context, area,
239 : tranche_id);
240 38 : ts->area = area;
241 :
242 38 : return ts;
243 : }
244 :
245 : /*
246 : * Attach to the shared TidStore. 'area_handle' is the DSA handle where
247 : * the TidStore is created. 'handle' is the dsa_pointer returned by
248 : * TidStoreGetHandle(). The returned object is allocated in backend-local
249 : * memory using the CurrentMemoryContext.
250 : */
251 : TidStore *
252 38 : TidStoreAttach(dsa_handle area_handle, dsa_pointer handle)
253 : {
254 : TidStore *ts;
255 : dsa_area *area;
256 :
257 : Assert(area_handle != DSA_HANDLE_INVALID);
258 : Assert(DsaPointerIsValid(handle));
259 :
260 : /* create per-backend state */
261 38 : ts = palloc0(sizeof(TidStore));
262 :
263 38 : area = dsa_attach(area_handle);
264 :
265 : /* Find the shared the shared radix tree */
266 38 : ts->tree.shared = shared_ts_attach(area, handle);
267 38 : ts->area = area;
268 :
269 38 : return ts;
270 : }
271 :
272 : /*
273 : * Detach from a TidStore. This also detaches from radix tree and frees
274 : * the backend-local resources.
275 : */
276 : void
277 38 : TidStoreDetach(TidStore *ts)
278 : {
279 : Assert(TidStoreIsShared(ts));
280 :
281 38 : shared_ts_detach(ts->tree.shared);
282 38 : dsa_detach(ts->area);
283 :
284 38 : pfree(ts);
285 38 : }
286 :
287 : /*
288 : * Lock support functions.
289 : *
290 : * We can use the radix tree's lock for shared TidStore as the data we
291 : * need to protect is only the shared radix tree.
292 : */
293 :
294 : void
295 2242 : TidStoreLockExclusive(TidStore *ts)
296 : {
297 2242 : if (TidStoreIsShared(ts))
298 206 : shared_ts_lock_exclusive(ts->tree.shared);
299 2242 : }
300 :
301 : void
302 4585304 : TidStoreLockShare(TidStore *ts)
303 : {
304 4585304 : if (TidStoreIsShared(ts))
305 421684 : shared_ts_lock_share(ts->tree.shared);
306 4585304 : }
307 :
308 : void
309 4587544 : TidStoreUnlock(TidStore *ts)
310 : {
311 4587544 : if (TidStoreIsShared(ts))
312 421890 : shared_ts_unlock(ts->tree.shared);
313 4587544 : }
314 :
315 : /*
316 : * Destroy a TidStore, returning all memory.
317 : *
318 : * Note that the caller must be certain that no other backend will attempt to
319 : * access the TidStore before calling this function. Other backend must
320 : * explicitly call TidStoreDetach() to free up backend-local memory associated
321 : * with the TidStore. The backend that calls TidStoreDestroy() must not call
322 : * TidStoreDetach().
323 : */
324 : void
325 1040 : TidStoreDestroy(TidStore *ts)
326 : {
327 : /* Destroy underlying radix tree */
328 1040 : if (TidStoreIsShared(ts))
329 : {
330 38 : shared_ts_free(ts->tree.shared);
331 :
332 38 : dsa_detach(ts->area);
333 : }
334 : else
335 1002 : local_ts_free(ts->tree.local);
336 :
337 1040 : MemoryContextDelete(ts->rt_context);
338 :
339 1040 : pfree(ts);
340 1040 : }
341 :
342 : /*
343 : * Create or replace an entry for the given block and array of offsets.
344 : *
345 : * NB: This function is designed and optimized for vacuum's heap scanning
346 : * phase, so has some limitations:
347 : *
348 : * - The offset numbers "offsets" must be sorted in ascending order.
349 : * - If the block number already exists, the entry will be replaced --
350 : * there is no way to add or remove offsets from an entry.
351 : */
352 : void
353 28974 : TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets,
354 : int num_offsets)
355 : {
356 : union
357 : {
358 : char data[MaxBlocktableEntrySize];
359 : BlocktableEntry force_align_entry;
360 : } data;
361 28974 : BlocktableEntry *page = (BlocktableEntry *) data.data;
362 : bitmapword word;
363 : int wordnum;
364 : int next_word_threshold;
365 28974 : int idx = 0;
366 :
367 : Assert(num_offsets > 0);
368 :
369 : /* Check if the given offset numbers are ordered */
370 1600064 : for (int i = 1; i < num_offsets; i++)
371 : Assert(offsets[i] > offsets[i - 1]);
372 :
373 28974 : memset(page, 0, offsetof(BlocktableEntry, words));
374 :
375 28974 : if (num_offsets <= NUM_FULL_OFFSETS)
376 : {
377 11626 : for (int i = 0; i < num_offsets; i++)
378 : {
379 7398 : OffsetNumber off = offsets[i];
380 :
381 : /* safety check to ensure we don't overrun bit array bounds */
382 7398 : if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
383 2 : elog(ERROR, "tuple offset out of range: %u", off);
384 :
385 7396 : page->header.full_offsets[i] = off;
386 : }
387 :
388 4228 : page->header.nwords = 0;
389 : }
390 : else
391 : {
392 24744 : for (wordnum = 0, next_word_threshold = BITS_PER_BITMAPWORD;
393 83324 : wordnum <= WORDNUM(offsets[num_offsets - 1]);
394 58580 : wordnum++, next_word_threshold += BITS_PER_BITMAPWORD)
395 : {
396 58580 : word = 0;
397 :
398 1651246 : while (idx < num_offsets)
399 : {
400 1626502 : OffsetNumber off = offsets[idx];
401 :
402 : /* safety check to ensure we don't overrun bit array bounds */
403 1626502 : if (off == InvalidOffsetNumber || off > MAX_OFFSET_IN_BITMAP)
404 0 : elog(ERROR, "tuple offset out of range: %u", off);
405 :
406 1626502 : if (off >= next_word_threshold)
407 33836 : break;
408 :
409 1592666 : word |= ((bitmapword) 1 << BITNUM(off));
410 1592666 : idx++;
411 : }
412 :
413 : /* write out offset bitmap for this wordnum */
414 58580 : page->words[wordnum] = word;
415 : }
416 :
417 24744 : page->header.nwords = wordnum;
418 : Assert(page->header.nwords == WORDS_PER_PAGE(offsets[num_offsets - 1]));
419 : }
420 :
421 28972 : if (TidStoreIsShared(ts))
422 1312 : shared_ts_set(ts->tree.shared, blkno, page);
423 : else
424 27660 : local_ts_set(ts->tree.local, blkno, page);
425 28972 : }
426 :
427 : /* Return true if the given TID is present in the TidStore */
428 : bool
429 9990422 : TidStoreIsMember(TidStore *ts, ItemPointer tid)
430 : {
431 : int wordnum;
432 : int bitnum;
433 : BlocktableEntry *page;
434 9990422 : BlockNumber blk = ItemPointerGetBlockNumber(tid);
435 9990422 : OffsetNumber off = ItemPointerGetOffsetNumber(tid);
436 :
437 9990422 : if (TidStoreIsShared(ts))
438 854934 : page = shared_ts_find(ts->tree.shared, blk);
439 : else
440 9135488 : page = local_ts_find(ts->tree.local, blk);
441 :
442 : /* no entry for the blk */
443 9990422 : if (page == NULL)
444 1683916 : return false;
445 :
446 8306506 : if (page->header.nwords == 0)
447 : {
448 : /* we have offsets in the header */
449 1277826 : for (int i = 0; i < NUM_FULL_OFFSETS; i++)
450 : {
451 963672 : if (page->header.full_offsets[i] == off)
452 13840 : return true;
453 : }
454 314154 : return false;
455 : }
456 : else
457 : {
458 7978512 : wordnum = WORDNUM(off);
459 7978512 : bitnum = BITNUM(off);
460 :
461 : /* no bitmap for the off */
462 7978512 : if (wordnum >= page->header.nwords)
463 3933112 : return false;
464 :
465 4045400 : return (page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0;
466 : }
467 : }
468 :
469 : /*
470 : * Prepare to iterate through a TidStore.
471 : *
472 : * The TidStoreIter struct is created in the caller's memory context, and it
473 : * will be freed in TidStoreEndIterate.
474 : *
475 : * The caller is responsible for locking TidStore until the iteration is
476 : * finished.
477 : */
478 : TidStoreIter *
479 1002 : TidStoreBeginIterate(TidStore *ts)
480 : {
481 : TidStoreIter *iter;
482 :
483 1002 : iter = palloc0(sizeof(TidStoreIter));
484 1002 : iter->ts = ts;
485 :
486 1002 : if (TidStoreIsShared(ts))
487 16 : iter->tree_iter.shared = shared_ts_begin_iterate(ts->tree.shared);
488 : else
489 986 : iter->tree_iter.local = local_ts_begin_iterate(ts->tree.local);
490 :
491 1002 : return iter;
492 : }
493 :
494 :
495 : /*
496 : * Return a result that contains the next block number and that can be used to
497 : * obtain the set of offsets by calling TidStoreGetBlockOffsets(). The result
498 : * is copyable.
499 : */
500 : TidStoreIterResult *
501 25272 : TidStoreIterateNext(TidStoreIter *iter)
502 : {
503 : uint64 key;
504 : BlocktableEntry *page;
505 :
506 25272 : if (TidStoreIsShared(iter->ts))
507 1328 : page = shared_ts_iterate_next(iter->tree_iter.shared, &key);
508 : else
509 23944 : page = local_ts_iterate_next(iter->tree_iter.local, &key);
510 :
511 25272 : if (page == NULL)
512 1002 : return NULL;
513 :
514 24270 : iter->output.blkno = key;
515 24270 : iter->output.internal_page = page;
516 :
517 24270 : return &(iter->output);
518 : }
519 :
520 : /*
521 : * Finish the iteration on TidStore.
522 : *
523 : * The caller is responsible for releasing any locks.
524 : */
525 : void
526 1002 : TidStoreEndIterate(TidStoreIter *iter)
527 : {
528 1002 : if (TidStoreIsShared(iter->ts))
529 16 : shared_ts_end_iterate(iter->tree_iter.shared);
530 : else
531 986 : local_ts_end_iterate(iter->tree_iter.local);
532 :
533 1002 : pfree(iter);
534 1002 : }
535 :
536 : /*
537 : * Return the memory usage of TidStore.
538 : */
539 : size_t
540 437292 : TidStoreMemoryUsage(TidStore *ts)
541 : {
542 437292 : if (TidStoreIsShared(ts))
543 2414 : return shared_ts_memory_usage(ts->tree.shared);
544 : else
545 434878 : return local_ts_memory_usage(ts->tree.local);
546 : }
547 :
548 : /*
549 : * Return the DSA area where the TidStore lives.
550 : */
551 : dsa_area *
552 38 : TidStoreGetDSA(TidStore *ts)
553 : {
554 : Assert(TidStoreIsShared(ts));
555 :
556 38 : return ts->area;
557 : }
558 :
559 : dsa_pointer
560 36 : TidStoreGetHandle(TidStore *ts)
561 : {
562 : Assert(TidStoreIsShared(ts));
563 :
564 36 : return (dsa_pointer) shared_ts_get_handle(ts->tree.shared);
565 : }
566 :
567 : /*
568 : * Given a TidStoreIterResult returned by TidStoreIterateNext(), extract the
569 : * offset numbers. Returns the number of offsets filled in, if <=
570 : * max_offsets. Otherwise, fills in as much as it can in the given space, and
571 : * returns the size of the buffer that would be needed.
572 : */
573 : int
574 24270 : TidStoreGetBlockOffsets(TidStoreIterResult *result,
575 : OffsetNumber *offsets,
576 : int max_offsets)
577 : {
578 24270 : BlocktableEntry *page = result->internal_page;
579 24270 : int num_offsets = 0;
580 : int wordnum;
581 :
582 24270 : if (page->header.nwords == 0)
583 : {
584 : /* we have offsets in the header */
585 16768 : for (int i = 0; i < NUM_FULL_OFFSETS; i++)
586 : {
587 12576 : if (page->header.full_offsets[i] != InvalidOffsetNumber)
588 : {
589 7328 : if (num_offsets < max_offsets)
590 7328 : offsets[num_offsets] = page->header.full_offsets[i];
591 7328 : num_offsets++;
592 : }
593 : }
594 : }
595 : else
596 : {
597 73952 : for (wordnum = 0; wordnum < page->header.nwords; wordnum++)
598 : {
599 53874 : bitmapword w = page->words[wordnum];
600 53874 : int off = wordnum * BITS_PER_BITMAPWORD;
601 :
602 2376852 : while (w != 0)
603 : {
604 2322978 : if (w & 1)
605 : {
606 1474970 : if (num_offsets < max_offsets)
607 1474970 : offsets[num_offsets] = (OffsetNumber) off;
608 1474970 : num_offsets++;
609 : }
610 2322978 : off++;
611 2322978 : w >>= 1;
612 : }
613 : }
614 : }
615 :
616 24270 : return num_offsets;
617 : }
|