Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tidbitmap.c
4 : * PostgreSQL tuple-id (TID) bitmap package
5 : *
6 : * This module provides bitmap data structures that are spiritually
7 : * similar to Bitmapsets, but are specially adapted to store sets of
8 : * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9 : * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10 : * Also, since we wish to be able to store very large tuple sets in
11 : * memory with this data structure, we support "lossy" storage, in which
12 : * we no longer remember individual tuple offsets on a page but only the
13 : * fact that a particular page needs to be visited.
14 : *
15 : * The "lossy" storage uses one bit per disk page, so at the standard 8K
16 : * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17 : * of memory. People pushing around tables of that size should have a
18 : * couple of Mb to spare, so we don't worry about providing a second level
19 : * of lossiness. In theory we could fall back to page ranges at some
20 : * point, but for now that seems useless complexity.
21 : *
22 : * We also support the notion of candidate matches, or rechecking. This
23 : * means we know that a search need visit only some tuples on a page,
24 : * but we are not certain that all of those tuples are real matches.
25 : * So the eventual heap scan must recheck the quals for these tuples only,
26 : * rather than rechecking the quals for all tuples on the page as in the
27 : * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28 : * into a bitmap, and it can also happen internally when we AND a lossy
29 : * and a non-lossy page.
30 : *
31 : *
32 : * Copyright (c) 2003-2024, PostgreSQL Global Development Group
33 : *
34 : * IDENTIFICATION
35 : * src/backend/nodes/tidbitmap.c
36 : *
37 : *-------------------------------------------------------------------------
38 : */
39 : #include "postgres.h"
40 :
41 : #include <limits.h>
42 :
43 : #include "access/htup_details.h"
44 : #include "common/hashfn.h"
45 : #include "common/int.h"
46 : #include "nodes/bitmapset.h"
47 : #include "nodes/tidbitmap.h"
48 : #include "storage/lwlock.h"
49 : #include "utils/dsa.h"
50 :
51 : /*
52 : * The maximum number of tuples per page is not large (typically 256 with
53 : * 8K pages, or 1024 with 32K pages). So there's not much point in making
54 : * the per-page bitmaps variable size. We just legislate that the size
55 : * is this:
56 : */
57 : #define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage
58 :
59 : /*
60 : * When we have to switch over to lossy storage, we use a data structure
61 : * with one bit per page, where all pages having the same number DIV
62 : * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
63 : * and has the bit set for a given page, there must not be a per-page entry
64 : * for that page in the page table.
65 : *
66 : * We actually store both exact pages and lossy chunks in the same hash
67 : * table, using identical data structures. (This is because the memory
 68 : * management for hashtables doesn't easily/efficiently allow space to be
 69 : * transferred from one hashtable to another.) Therefore it's best
70 : * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
71 : * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
72 : * avoid expensive integer remainder operations. So, define it like this:
73 : */
74 : #define PAGES_PER_CHUNK (BLCKSZ / 32)
75 :
76 : /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
77 :
78 : #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
79 : #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
80 :
81 : /* number of active words for an exact page: */
82 : #define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
83 : /* number of active words for a lossy chunk: */
84 : #define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
85 :
86 : /*
87 : * The hashtable entries are represented by this data structure. For
88 : * an exact page, blockno is the page number and bit k of the bitmap
89 : * represents tuple offset k+1. For a lossy chunk, blockno is the first
90 : * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
91 : * bit k represents page blockno+k. Note that it is not possible to
92 : * have exact storage for the first page of a chunk if we are using
93 : * lossy storage for any page in the chunk's range, since the same
94 : * hashtable entry has to serve both purposes.
95 : *
96 : * recheck is used only on exact pages --- it indicates that although
97 : * only the stated tuples need be checked, the full index qual condition
98 : * must be checked for each (ie, these are candidate matches).
99 : */
100 : typedef struct PagetableEntry
101 : {
102 : BlockNumber blockno; /* page number (hashtable key) */
103 : char status; /* hash entry status */
104 : bool ischunk; /* T = lossy storage, F = exact */
105 : bool recheck; /* should the tuples be rechecked? */
106 : bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
107 : } PagetableEntry;
108 :
109 : /*
110 : * Holds array of pagetable entries.
111 : */
112 : typedef struct PTEntryArray
113 : {
114 : pg_atomic_uint32 refcount; /* no. of iterator attached */
115 : PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
116 : } PTEntryArray;
117 :
118 : /*
119 : * We want to avoid the overhead of creating the hashtable, which is
120 : * comparatively large, when not necessary. Particularly when we are using a
121 : * bitmap scan on the inside of a nestloop join: a bitmap may well live only
122 : * long enough to accumulate one entry in such cases. We therefore avoid
123 : * creating an actual hashtable until we need two pagetable entries. When
124 : * just one pagetable entry is needed, we store it in a fixed field of
125 : * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
126 : * shrinks down to zero or one page again. So, status can be TBM_HASH even
127 : * when nentries is zero or one.)
128 : */
129 : typedef enum
130 : {
131 : TBM_EMPTY, /* no hashtable, nentries == 0 */
132 : TBM_ONE_PAGE, /* entry1 contains the single entry */
133 : TBM_HASH, /* pagetable is valid, entry1 is not */
134 : } TBMStatus;
135 :
136 : /*
137 : * Current iterating state of the TBM.
138 : */
139 : typedef enum
140 : {
141 : TBM_NOT_ITERATING, /* not yet converted to page and chunk array */
142 : TBM_ITERATING_PRIVATE, /* converted to local page and chunk array */
143 : TBM_ITERATING_SHARED, /* converted to shared page and chunk array */
144 : } TBMIteratingState;
145 :
146 : /*
147 : * Here is the representation for a whole TIDBitMap:
148 : */
149 : struct TIDBitmap
150 : {
151 : NodeTag type; /* to make it a valid Node */
152 : MemoryContext mcxt; /* memory context containing me */
153 : TBMStatus status; /* see codes above */
154 : struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */
155 : int nentries; /* number of entries in pagetable */
156 : int maxentries; /* limit on same to meet maxbytes */
157 : int npages; /* number of exact entries in pagetable */
158 : int nchunks; /* number of lossy entries in pagetable */
159 : TBMIteratingState iterating; /* tbm_begin_iterate called? */
160 : uint32 lossify_start; /* offset to start lossifying hashtable at */
161 : PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
162 : /* these are valid when iterating is true: */
163 : PagetableEntry **spages; /* sorted exact-page list, or NULL */
164 : PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
165 : dsa_pointer dsapagetable; /* dsa_pointer to the element array */
166 : dsa_pointer dsapagetableold; /* dsa_pointer to the old element array */
167 : dsa_pointer ptpages; /* dsa_pointer to the page array */
168 : dsa_pointer ptchunks; /* dsa_pointer to the chunk array */
169 : dsa_area *dsa; /* reference to per-query dsa area */
170 : };
171 :
172 : /*
173 : * When iterating over a bitmap in sorted order, a TBMIterator is used to
174 : * track our progress. There can be several iterators scanning the same
175 : * bitmap concurrently. Note that the bitmap becomes read-only as soon as
176 : * any iterator is created.
177 : */
178 : struct TBMIterator
179 : {
180 : TIDBitmap *tbm; /* TIDBitmap we're iterating over */
181 : int spageptr; /* next spages index */
182 : int schunkptr; /* next schunks index */
183 : int schunkbit; /* next bit to check in current schunk */
184 : TBMIterateResult output; /* MUST BE LAST (because variable-size) */
185 : };
186 :
187 : /*
188 : * Holds the shared members of the iterator so that multiple processes
189 : * can jointly iterate.
190 : */
191 : typedef struct TBMSharedIteratorState
192 : {
193 : int nentries; /* number of entries in pagetable */
194 : int maxentries; /* limit on same to meet maxbytes */
195 : int npages; /* number of exact entries in pagetable */
196 : int nchunks; /* number of lossy entries in pagetable */
197 : dsa_pointer pagetable; /* dsa pointers to head of pagetable data */
198 : dsa_pointer spages; /* dsa pointer to page array */
199 : dsa_pointer schunks; /* dsa pointer to chunk array */
200 : LWLock lock; /* lock to protect below members */
201 : int spageptr; /* next spages index */
202 : int schunkptr; /* next schunks index */
203 : int schunkbit; /* next bit to check in current schunk */
204 : } TBMSharedIteratorState;
205 :
206 : /*
207 : * pagetable iteration array.
208 : */
209 : typedef struct PTIterationArray
210 : {
211 : pg_atomic_uint32 refcount; /* no. of iterator attached */
212 : int index[FLEXIBLE_ARRAY_MEMBER]; /* index array */
213 : } PTIterationArray;
214 :
215 : /*
216 : * same as TBMIterator, but it is used for joint iteration, therefore this
217 : * also holds a reference to the shared state.
218 : */
219 : struct TBMSharedIterator
220 : {
221 : TBMSharedIteratorState *state; /* shared state */
222 : PTEntryArray *ptbase; /* pagetable element array */
223 : PTIterationArray *ptpages; /* sorted exact page index list */
224 : PTIterationArray *ptchunks; /* sorted lossy page index list */
225 : TBMIterateResult output; /* MUST BE LAST (because variable-size) */
226 : };
227 :
228 : /* Local function prototypes */
229 : static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
230 : static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
231 : const TIDBitmap *b);
232 : static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
233 : BlockNumber pageno);
234 : static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
235 : static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
236 : static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
237 : static void tbm_lossify(TIDBitmap *tbm);
238 : static int tbm_comparator(const void *left, const void *right);
239 : static int tbm_shared_comparator(const void *left, const void *right,
240 : void *arg);
241 :
242 : /* define hashtable mapping block numbers to PagetableEntry's */
243 : #define SH_USE_NONDEFAULT_ALLOCATOR
244 : #define SH_PREFIX pagetable
245 : #define SH_ELEMENT_TYPE PagetableEntry
246 : #define SH_KEY_TYPE BlockNumber
247 : #define SH_KEY blockno
248 : #define SH_HASH_KEY(tb, key) murmurhash32(key)
249 : #define SH_EQUAL(tb, a, b) a == b
250 : #define SH_SCOPE static inline
251 : #define SH_DEFINE
252 : #define SH_DECLARE
253 : #include "lib/simplehash.h"
254 :
255 :
256 : /*
257 : * tbm_create - create an initially-empty bitmap
258 : *
259 : * The bitmap will live in the memory context that is CurrentMemoryContext
260 : * at the time of this call. It will be limited to (approximately) maxbytes
261 : * total memory consumption. If the DSA passed to this function is not NULL
262 : * then the memory for storing elements of the underlying page table will
263 : * be allocated from the DSA.
264 : */
265 : TIDBitmap *
266 17900 : tbm_create(long maxbytes, dsa_area *dsa)
267 : {
268 : TIDBitmap *tbm;
269 :
270 : /* Create the TIDBitmap struct and zero all its fields */
271 17900 : tbm = makeNode(TIDBitmap);
272 :
273 17900 : tbm->mcxt = CurrentMemoryContext;
274 17900 : tbm->status = TBM_EMPTY;
275 :
276 17900 : tbm->maxentries = (int) tbm_calculate_entries(maxbytes);
277 17900 : tbm->lossify_start = 0;
278 17900 : tbm->dsa = dsa;
279 17900 : tbm->dsapagetable = InvalidDsaPointer;
280 17900 : tbm->dsapagetableold = InvalidDsaPointer;
281 17900 : tbm->ptpages = InvalidDsaPointer;
282 17900 : tbm->ptchunks = InvalidDsaPointer;
283 :
284 17900 : return tbm;
285 : }
286 :
287 : /*
288 : * Actually create the hashtable. Since this is a moderately expensive
289 : * proposition, we don't do it until we have to.
290 : */
291 : static void
292 8154 : tbm_create_pagetable(TIDBitmap *tbm)
293 : {
294 : Assert(tbm->status != TBM_HASH);
295 : Assert(tbm->pagetable == NULL);
296 :
297 8154 : tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
298 :
299 : /* If entry1 is valid, push it into the hashtable */
300 8154 : if (tbm->status == TBM_ONE_PAGE)
301 : {
302 : PagetableEntry *page;
303 : bool found;
304 : char oldstatus;
305 :
306 5286 : page = pagetable_insert(tbm->pagetable,
307 : tbm->entry1.blockno,
308 : &found);
309 : Assert(!found);
310 5286 : oldstatus = page->status;
311 5286 : memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
312 5286 : page->status = oldstatus;
313 : }
314 :
315 8154 : tbm->status = TBM_HASH;
316 8154 : }
317 :
318 : /*
319 : * tbm_free - free a TIDBitmap
320 : */
321 : void
322 17834 : tbm_free(TIDBitmap *tbm)
323 : {
324 17834 : if (tbm->pagetable)
325 8154 : pagetable_destroy(tbm->pagetable);
326 17834 : if (tbm->spages)
327 5128 : pfree(tbm->spages);
328 17834 : if (tbm->schunks)
329 2880 : pfree(tbm->schunks);
330 17834 : pfree(tbm);
331 17834 : }
332 :
333 : /*
334 : * tbm_free_shared_area - free shared state
335 : *
 336 : * Free the shared iterator state; also free the shared pagetable and
 337 : * iterator arrays if they are no longer referenced by any shared iterator,
 338 : * i.e. their refcount has dropped to zero.
339 : */
340 : void
341 108 : tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
342 : {
343 108 : TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
344 : PTEntryArray *ptbase;
345 : PTIterationArray *ptpages;
346 : PTIterationArray *ptchunks;
347 :
348 108 : if (DsaPointerIsValid(istate->pagetable))
349 : {
350 108 : ptbase = dsa_get_address(dsa, istate->pagetable);
351 108 : if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
352 54 : dsa_free(dsa, istate->pagetable);
353 : }
354 108 : if (DsaPointerIsValid(istate->spages))
355 : {
356 108 : ptpages = dsa_get_address(dsa, istate->spages);
357 108 : if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
358 54 : dsa_free(dsa, istate->spages);
359 : }
360 108 : if (DsaPointerIsValid(istate->schunks))
361 : {
362 0 : ptchunks = dsa_get_address(dsa, istate->schunks);
363 0 : if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
364 0 : dsa_free(dsa, istate->schunks);
365 : }
366 :
367 108 : dsa_free(dsa, dp);
368 108 : }
369 :
370 : /*
371 : * tbm_add_tuples - add some tuple IDs to a TIDBitmap
372 : *
373 : * If recheck is true, then the recheck flag will be set in the
374 : * TBMIterateResult when any of these tuples are reported out.
375 : */
376 : void
377 6404262 : tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
378 : bool recheck)
379 : {
380 6404262 : BlockNumber currblk = InvalidBlockNumber;
381 6404262 : PagetableEntry *page = NULL; /* only valid when currblk is valid */
382 : int i;
383 :
384 : Assert(tbm->iterating == TBM_NOT_ITERATING);
385 14989982 : for (i = 0; i < ntids; i++)
386 : {
387 8585720 : BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
388 8585720 : OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
389 : int wordnum,
390 : bitnum;
391 :
392 : /* safety check to ensure we don't overrun bit array bounds */
393 8585720 : if (off < 1 || off > MAX_TUPLES_PER_PAGE)
394 0 : elog(ERROR, "tuple offset out of range: %u", off);
395 :
396 : /*
397 : * Look up target page unless we already did. This saves cycles when
398 : * the input includes consecutive tuples on the same page, which is
399 : * common enough to justify an extra test here.
400 : */
401 8585720 : if (blk != currblk)
402 : {
403 7180972 : if (tbm_page_is_lossy(tbm, blk))
404 2856 : page = NULL; /* remember page is lossy */
405 : else
406 7178116 : page = tbm_get_pageentry(tbm, blk);
407 7180972 : currblk = blk;
408 : }
409 :
410 8585720 : if (page == NULL)
411 2856 : continue; /* whole page is already marked */
412 :
413 8582864 : if (page->ischunk)
414 : {
415 : /* The page is a lossy chunk header, set bit for itself */
416 0 : wordnum = bitnum = 0;
417 : }
418 : else
419 : {
420 : /* Page is exact, so set bit for individual tuple */
421 8582864 : wordnum = WORDNUM(off - 1);
422 8582864 : bitnum = BITNUM(off - 1);
423 : }
424 8582864 : page->words[wordnum] |= ((bitmapword) 1 << bitnum);
425 8582864 : page->recheck |= recheck;
426 :
427 8582864 : if (tbm->nentries > tbm->maxentries)
428 : {
429 24 : tbm_lossify(tbm);
430 : /* Page could have been converted to lossy, so force new lookup */
431 24 : currblk = InvalidBlockNumber;
432 : }
433 : }
434 6404262 : }
435 :
436 : /*
437 : * tbm_add_page - add a whole page to a TIDBitmap
438 : *
439 : * This causes the whole page to be reported (with the recheck flag)
440 : * when the TIDBitmap is scanned.
441 : */
442 : void
443 147980 : tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
444 : {
445 : /* Enter the page in the bitmap, or mark it lossy if already present */
446 147980 : tbm_mark_page_lossy(tbm, pageno);
447 : /* If we went over the memory limit, lossify some more pages */
448 147980 : if (tbm->nentries > tbm->maxentries)
449 0 : tbm_lossify(tbm);
450 147980 : }
451 :
452 : /*
453 : * tbm_union - set union
454 : *
455 : * a is modified in-place, b is not changed
456 : */
457 : void
458 0 : tbm_union(TIDBitmap *a, const TIDBitmap *b)
459 : {
460 : Assert(!a->iterating);
461 : /* Nothing to do if b is empty */
462 0 : if (b->nentries == 0)
463 0 : return;
464 : /* Scan through chunks and pages in b, merge into a */
465 0 : if (b->status == TBM_ONE_PAGE)
466 0 : tbm_union_page(a, &b->entry1);
467 : else
468 : {
469 : pagetable_iterator i;
470 : PagetableEntry *bpage;
471 :
472 : Assert(b->status == TBM_HASH);
473 0 : pagetable_start_iterate(b->pagetable, &i);
474 0 : while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
475 0 : tbm_union_page(a, bpage);
476 : }
477 : }
478 :
479 : /* Process one page of b during a union op */
480 : static void
481 0 : tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
482 : {
483 : PagetableEntry *apage;
484 : int wordnum;
485 :
486 0 : if (bpage->ischunk)
487 : {
488 : /* Scan b's chunk, mark each indicated page lossy in a */
489 0 : for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
490 : {
491 0 : bitmapword w = bpage->words[wordnum];
492 :
493 0 : if (w != 0)
494 : {
495 : BlockNumber pg;
496 :
497 0 : pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
498 0 : while (w != 0)
499 : {
500 0 : if (w & 1)
501 0 : tbm_mark_page_lossy(a, pg);
502 0 : pg++;
503 0 : w >>= 1;
504 : }
505 : }
506 : }
507 : }
508 0 : else if (tbm_page_is_lossy(a, bpage->blockno))
509 : {
510 : /* page is already lossy in a, nothing to do */
511 0 : return;
512 : }
513 : else
514 : {
515 0 : apage = tbm_get_pageentry(a, bpage->blockno);
516 0 : if (apage->ischunk)
517 : {
518 : /* The page is a lossy chunk header, set bit for itself */
519 0 : apage->words[0] |= ((bitmapword) 1 << 0);
520 : }
521 : else
522 : {
523 : /* Both pages are exact, merge at the bit level */
524 0 : for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
525 0 : apage->words[wordnum] |= bpage->words[wordnum];
526 0 : apage->recheck |= bpage->recheck;
527 : }
528 : }
529 :
530 0 : if (a->nentries > a->maxentries)
531 0 : tbm_lossify(a);
532 : }
533 :
534 : /*
535 : * tbm_intersect - set intersection
536 : *
537 : * a is modified in-place, b is not changed
538 : */
539 : void
540 80 : tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
541 : {
542 : Assert(!a->iterating);
543 : /* Nothing to do if a is empty */
544 80 : if (a->nentries == 0)
545 0 : return;
546 : /* Scan through chunks and pages in a, try to match to b */
547 80 : if (a->status == TBM_ONE_PAGE)
548 : {
549 52 : if (tbm_intersect_page(a, &a->entry1, b))
550 : {
551 : /* Page is now empty, remove it from a */
552 : Assert(!a->entry1.ischunk);
553 0 : a->npages--;
554 0 : a->nentries--;
555 : Assert(a->nentries == 0);
556 0 : a->status = TBM_EMPTY;
557 : }
558 : }
559 : else
560 : {
561 : pagetable_iterator i;
562 : PagetableEntry *apage;
563 :
564 : Assert(a->status == TBM_HASH);
565 28 : pagetable_start_iterate(a->pagetable, &i);
566 4834 : while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
567 : {
568 4806 : if (tbm_intersect_page(a, apage, b))
569 : {
570 : /* Page or chunk is now empty, remove it from a */
571 4626 : if (apage->ischunk)
572 0 : a->nchunks--;
573 : else
574 4626 : a->npages--;
575 4626 : a->nentries--;
576 4626 : if (!pagetable_delete(a->pagetable, apage->blockno))
577 0 : elog(ERROR, "hash table corrupted");
578 : }
579 : }
580 : }
581 : }
582 :
583 : /*
584 : * Process one page of a during an intersection op
585 : *
586 : * Returns true if apage is now empty and should be deleted from a
587 : */
588 : static bool
589 4858 : tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
590 : {
591 : const PagetableEntry *bpage;
592 : int wordnum;
593 :
594 4858 : if (apage->ischunk)
595 : {
596 : /* Scan each bit in chunk, try to clear */
597 30 : bool candelete = true;
598 :
599 150 : for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
600 : {
601 120 : bitmapword w = apage->words[wordnum];
602 :
603 120 : if (w != 0)
604 : {
605 108 : bitmapword neww = w;
606 : BlockNumber pg;
607 : int bitnum;
608 :
609 108 : pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
610 108 : bitnum = 0;
611 6606 : while (w != 0)
612 : {
613 6498 : if (w & 1)
614 : {
615 3360 : if (!tbm_page_is_lossy(b, pg) &&
616 252 : tbm_find_pageentry(b, pg) == NULL)
617 : {
618 : /* Page is not in b at all, lose lossy bit */
619 0 : neww &= ~((bitmapword) 1 << bitnum);
620 : }
621 : }
622 6498 : pg++;
623 6498 : bitnum++;
624 6498 : w >>= 1;
625 : }
626 108 : apage->words[wordnum] = neww;
627 108 : if (neww != 0)
628 108 : candelete = false;
629 : }
630 : }
631 30 : return candelete;
632 : }
633 4828 : else if (tbm_page_is_lossy(b, apage->blockno))
634 : {
635 : /*
636 : * Some of the tuples in 'a' might not satisfy the quals for 'b', but
637 : * because the page 'b' is lossy, we don't know which ones. Therefore
638 : * we mark 'a' as requiring rechecks, to indicate that at most those
639 : * tuples set in 'a' are matches.
640 : */
641 0 : apage->recheck = true;
642 0 : return false;
643 : }
644 : else
645 : {
646 4828 : bool candelete = true;
647 :
648 4828 : bpage = tbm_find_pageentry(b, apage->blockno);
649 4828 : if (bpage != NULL)
650 : {
651 : /* Both pages are exact, merge at the bit level */
652 : Assert(!bpage->ischunk);
653 25824 : for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
654 : {
655 21520 : apage->words[wordnum] &= bpage->words[wordnum];
656 21520 : if (apage->words[wordnum] != 0)
657 208 : candelete = false;
658 : }
659 4304 : apage->recheck |= bpage->recheck;
660 : }
661 : /* If there is no matching b page, we can just delete the a page */
662 4828 : return candelete;
663 : }
664 : }
665 :
666 : /*
667 : * tbm_is_empty - is a TIDBitmap completely empty?
668 : */
669 : bool
670 694 : tbm_is_empty(const TIDBitmap *tbm)
671 : {
672 694 : return (tbm->nentries == 0);
673 : }
674 :
675 : /*
676 : * tbm_begin_iterate - prepare to iterate through a TIDBitmap
677 : *
678 : * The TBMIterator struct is created in the caller's memory context.
679 : * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
680 : * okay to just allow the memory context to be released, too. It is caller's
681 : * responsibility not to touch the TBMIterator anymore once the TIDBitmap
682 : * is freed.
683 : *
684 : * NB: after this is called, it is no longer allowed to modify the contents
685 : * of the bitmap. However, you can call this multiple times to scan the
686 : * contents repeatedly, including parallel scans.
687 : */
688 : TBMIterator *
689 34848 : tbm_begin_iterate(TIDBitmap *tbm)
690 : {
691 : TBMIterator *iterator;
692 :
693 : Assert(tbm->iterating != TBM_ITERATING_SHARED);
694 :
695 : /*
696 : * Create the TBMIterator struct, with enough trailing space to serve the
697 : * needs of the TBMIterateResult sub-struct.
698 : */
699 34848 : iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
700 : MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
701 34848 : iterator->tbm = tbm;
702 :
703 : /*
704 : * Initialize iteration pointers.
705 : */
706 34848 : iterator->spageptr = 0;
707 34848 : iterator->schunkptr = 0;
708 34848 : iterator->schunkbit = 0;
709 :
710 : /*
711 : * If we have a hashtable, create and fill the sorted page lists, unless
712 : * we already did that for a previous iterator. Note that the lists are
713 : * attached to the bitmap not the iterator, so they can be used by more
714 : * than one iterator.
715 : */
716 34848 : if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
717 : {
718 : pagetable_iterator i;
719 : PagetableEntry *page;
720 : int npages;
721 : int nchunks;
722 :
723 8002 : if (!tbm->spages && tbm->npages > 0)
724 5128 : tbm->spages = (PagetableEntry **)
725 5128 : MemoryContextAlloc(tbm->mcxt,
726 5128 : tbm->npages * sizeof(PagetableEntry *));
727 8002 : if (!tbm->schunks && tbm->nchunks > 0)
728 2880 : tbm->schunks = (PagetableEntry **)
729 2880 : MemoryContextAlloc(tbm->mcxt,
730 2880 : tbm->nchunks * sizeof(PagetableEntry *));
731 :
732 8002 : npages = nchunks = 0;
733 8002 : pagetable_start_iterate(tbm->pagetable, &i);
734 215258 : while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
735 : {
736 207256 : if (page->ischunk)
737 2922 : tbm->schunks[nchunks++] = page;
738 : else
739 204334 : tbm->spages[npages++] = page;
740 : }
741 : Assert(npages == tbm->npages);
742 : Assert(nchunks == tbm->nchunks);
743 8002 : if (npages > 1)
744 5124 : qsort(tbm->spages, npages, sizeof(PagetableEntry *),
745 : tbm_comparator);
746 8002 : if (nchunks > 1)
747 12 : qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
748 : tbm_comparator);
749 : }
750 :
751 34848 : tbm->iterating = TBM_ITERATING_PRIVATE;
752 :
753 34848 : return iterator;
754 : }
755 :
756 : /*
757 : * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
758 : *
759 : * The necessary shared state will be allocated from the DSA passed to
760 : * tbm_create, so that multiple processes can attach to it and iterate jointly.
761 : *
762 : * This will convert the pagetable hash into page and chunk array of the index
763 : * into pagetable array.
764 : */
765 : dsa_pointer
766 144 : tbm_prepare_shared_iterate(TIDBitmap *tbm)
767 : {
768 : dsa_pointer dp;
769 : TBMSharedIteratorState *istate;
770 144 : PTEntryArray *ptbase = NULL;
771 144 : PTIterationArray *ptpages = NULL;
772 144 : PTIterationArray *ptchunks = NULL;
773 :
774 : Assert(tbm->dsa != NULL);
775 : Assert(tbm->iterating != TBM_ITERATING_PRIVATE);
776 :
777 : /*
778 : * Allocate TBMSharedIteratorState from DSA to hold the shared members and
779 : * lock, this will also be used by multiple worker for shared iterate.
780 : */
781 144 : dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
782 144 : istate = dsa_get_address(tbm->dsa, dp);
783 :
784 : /*
785 : * If we're not already iterating, create and fill the sorted page lists.
786 : * (If we are, the sorted page lists are already stored in the TIDBitmap,
787 : * and we can just reuse them.)
788 : */
789 144 : if (tbm->iterating == TBM_NOT_ITERATING)
790 : {
791 : pagetable_iterator i;
792 : PagetableEntry *page;
793 : int idx;
794 : int npages;
795 : int nchunks;
796 :
797 : /*
798 : * Allocate the page and chunk array memory from the DSA to share
799 : * across multiple processes.
800 : */
801 72 : if (tbm->npages)
802 : {
803 72 : tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
804 : tbm->npages * sizeof(int));
805 72 : ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
806 72 : pg_atomic_init_u32(&ptpages->refcount, 0);
807 : }
808 72 : if (tbm->nchunks)
809 : {
810 6 : tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
811 : tbm->nchunks * sizeof(int));
812 6 : ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
813 6 : pg_atomic_init_u32(&ptchunks->refcount, 0);
814 : }
815 :
816 : /*
817 : * If TBM status is TBM_HASH then iterate over the pagetable and
818 : * convert it to page and chunk arrays. But if it's in the
819 : * TBM_ONE_PAGE mode then directly allocate the space for one entry
820 : * from the DSA.
821 : */
822 72 : npages = nchunks = 0;
823 72 : if (tbm->status == TBM_HASH)
824 : {
825 72 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
826 :
827 72 : pagetable_start_iterate(tbm->pagetable, &i);
828 27102 : while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
829 : {
830 27030 : idx = page - ptbase->ptentry;
831 27030 : if (page->ischunk)
832 24 : ptchunks->index[nchunks++] = idx;
833 : else
834 27006 : ptpages->index[npages++] = idx;
835 : }
836 :
837 : Assert(npages == tbm->npages);
838 : Assert(nchunks == tbm->nchunks);
839 : }
840 0 : else if (tbm->status == TBM_ONE_PAGE)
841 : {
842 : /*
843 : * In one page mode allocate the space for one pagetable entry,
844 : * initialize it, and directly store its index (i.e. 0) in the
845 : * page array.
846 : */
847 0 : tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
848 : sizeof(PagetableEntry));
849 0 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
850 0 : memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
851 0 : ptpages->index[0] = 0;
852 : }
853 :
854 72 : if (ptbase != NULL)
855 72 : pg_atomic_init_u32(&ptbase->refcount, 0);
856 72 : if (npages > 1)
857 72 : qsort_arg(ptpages->index, npages, sizeof(int),
858 72 : tbm_shared_comparator, ptbase->ptentry);
859 72 : if (nchunks > 1)
860 6 : qsort_arg(ptchunks->index, nchunks, sizeof(int),
861 6 : tbm_shared_comparator, ptbase->ptentry);
862 : }
863 :
864 : /*
865 : * Store the TBM members in the shared state so that we can share them
866 : * across multiple processes.
867 : */
868 144 : istate->nentries = tbm->nentries;
869 144 : istate->maxentries = tbm->maxentries;
870 144 : istate->npages = tbm->npages;
871 144 : istate->nchunks = tbm->nchunks;
872 144 : istate->pagetable = tbm->dsapagetable;
873 144 : istate->spages = tbm->ptpages;
874 144 : istate->schunks = tbm->ptchunks;
875 :
876 144 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
877 144 : ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
878 144 : ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
879 :
880 : /*
881 : * For every shared iterator, referring to pagetable and iterator array,
882 : * increase the refcount by 1 so that while freeing the shared iterator we
883 : * don't free pagetable and iterator array until its refcount becomes 0.
884 : */
885 144 : if (ptbase != NULL)
886 144 : pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
887 144 : if (ptpages != NULL)
888 144 : pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
889 144 : if (ptchunks != NULL)
890 12 : pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
891 :
892 : /* Initialize the iterator lock */
893 144 : LWLockInitialize(&istate->lock, LWTRANCHE_SHARED_TIDBITMAP);
894 :
895 : /* Initialize the shared iterator state */
896 144 : istate->schunkbit = 0;
897 144 : istate->schunkptr = 0;
898 144 : istate->spageptr = 0;
899 :
900 144 : tbm->iterating = TBM_ITERATING_SHARED;
901 :
902 144 : return dp;
903 : }
904 :
905 : /*
906 : * tbm_extract_page_tuple - extract the tuple offsets from a page
907 : *
908 : * The extracted offsets are stored into TBMIterateResult.
909 : */
910 : static inline int
911 467698 : tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
912 : {
913 : int wordnum;
914 467698 : int ntuples = 0;
915 :
916 2806188 : for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
917 : {
918 2338490 : bitmapword w = page->words[wordnum];
919 :
920 2338490 : if (w != 0)
921 : {
922 1101474 : int off = wordnum * BITS_PER_BITMAPWORD + 1;
923 :
924 47826650 : while (w != 0)
925 : {
926 46725176 : if (w & 1)
927 11338780 : output->offsets[ntuples++] = (OffsetNumber) off;
928 46725176 : off++;
929 46725176 : w >>= 1;
930 : }
931 : }
932 : }
933 :
934 467698 : return ntuples;
935 : }
936 :
937 : /*
938 : * tbm_advance_schunkbit - Advance the schunkbit
939 : */
940 : static inline void
941 332680 : tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
942 : {
943 332680 : int schunkbit = *schunkbitp;
944 :
945 1526448 : while (schunkbit < PAGES_PER_CHUNK)
946 : {
947 1520556 : int wordnum = WORDNUM(schunkbit);
948 1520556 : int bitnum = BITNUM(schunkbit);
949 :
950 1520556 : if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
951 326788 : break;
952 1193768 : schunkbit++;
953 : }
954 :
955 332680 : *schunkbitp = schunkbit;
956 332680 : }
957 :
/*
 * tbm_iterate - scan through next page of a TIDBitmap
 *
 * Returns a TBMIterateResult representing one page, or NULL if there are
 * no more pages to scan.  Pages are guaranteed to be delivered in numerical
 * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
 * remember the exact tuples to look at on this page --- the caller must
 * examine all tuples on the page and check if they meet the intended
 * condition.  If result->recheck is true, only the indicated tuples need
 * be examined, but the condition must be rechecked anyway.  (For ease of
 * testing, recheck is always set true when ntuples < 0.)
 *
 * Note: the result points into the iterator itself, so it is overwritten
 * by the next tbm_iterate call on the same iterator.
 */
TBMIterateResult *
tbm_iterate(TBMIterator *iterator)
{
	TIDBitmap  *tbm = iterator->tbm;
	TBMIterateResult *output = &(iterator->output);

	Assert(tbm->iterating == TBM_ITERATING_PRIVATE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		int			schunkbit = iterator->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			/* found a set bit in this chunk; remember it and stop */
			iterator->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		iterator->schunkptr++;
		iterator->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		BlockNumber chunk_blockno;

		chunk_blockno = chunk->blockno + iterator->schunkbit;
		if (iterator->spageptr >= tbm->npages ||
			chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			/* bump schunkbit so the next call resumes within this chunk */
			iterator->schunkbit++;
			return output;
		}
	}

	if (iterator->spageptr < tbm->npages)
	{
		PagetableEntry *page;
		int			ntuples;

		/* In TBM_ONE_PAGE state, we don't allocate an spages[] array */
		if (tbm->status == TBM_ONE_PAGE)
			page = &tbm->entry1;
		else
			page = tbm->spages[iterator->spageptr];

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		iterator->spageptr++;
		return output;
	}

	/* Nothing more in the bitmap */
	return NULL;
}
1043 :
/*
 * tbm_shared_iterate - scan through next page of a TIDBitmap
 *
 * As above, but this will iterate using an iterator which is shared
 * across multiple processes.  We need to acquire the iterator LWLock,
 * before accessing the shared members.
 *
 * The output struct is backend-private, so it remains valid after the
 * lock is released (until the next call on this iterator).
 */
TBMIterateResult *
tbm_shared_iterate(TBMSharedIterator *iterator)
{
	TBMIterateResult *output = &iterator->output;
	TBMSharedIteratorState *istate = iterator->state;
	PagetableEntry *ptbase = NULL;
	int		   *idxpages = NULL;
	int		   *idxchunks = NULL;

	/* Resolve backend-local base pointers before taking the lock */
	if (iterator->ptbase != NULL)
		ptbase = iterator->ptbase->ptentry;
	if (iterator->ptpages != NULL)
		idxpages = iterator->ptpages->index;
	if (iterator->ptchunks != NULL)
		idxchunks = iterator->ptchunks->index;

	/* Acquire the LWLock before accessing the shared members */
	LWLockAcquire(&istate->lock, LW_EXCLUSIVE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		int			schunkbit = istate->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			istate->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		istate->schunkptr++;
		istate->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		BlockNumber chunk_blockno;

		chunk_blockno = chunk->blockno + istate->schunkbit;

		if (istate->spageptr >= istate->npages ||
			chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			istate->schunkbit++;

			/* done with shared state; release before returning */
			LWLockRelease(&istate->lock);
			return output;
		}
	}

	if (istate->spageptr < istate->npages)
	{
		PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
		int			ntuples;

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		istate->spageptr++;

		LWLockRelease(&istate->lock);

		return output;
	}

	LWLockRelease(&istate->lock);

	/* Nothing more in the bitmap */
	return NULL;
}
1137 :
1138 : /*
1139 : * tbm_end_iterate - finish an iteration over a TIDBitmap
1140 : *
1141 : * Currently this is just a pfree, but it might do more someday. (For
1142 : * instance, it could be useful to count open iterators and allow the
1143 : * bitmap to return to read/write status when there are no more iterators.)
1144 : */
1145 : void
1146 34770 : tbm_end_iterate(TBMIterator *iterator)
1147 : {
1148 34770 : pfree(iterator);
1149 34770 : }
1150 :
1151 : /*
1152 : * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
1153 : *
1154 : * This doesn't free any of the shared state associated with the iterator,
1155 : * just our backend-private state.
1156 : */
1157 : void
1158 676 : tbm_end_shared_iterate(TBMSharedIterator *iterator)
1159 : {
1160 676 : pfree(iterator);
1161 676 : }
1162 :
1163 : /*
1164 : * tbm_find_pageentry - find a PagetableEntry for the pageno
1165 : *
1166 : * Returns NULL if there is no non-lossy entry for the pageno.
1167 : */
1168 : static const PagetableEntry *
1169 5080 : tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
1170 : {
1171 : const PagetableEntry *page;
1172 :
1173 5080 : if (tbm->nentries == 0) /* in case pagetable doesn't exist */
1174 0 : return NULL;
1175 :
1176 5080 : if (tbm->status == TBM_ONE_PAGE)
1177 : {
1178 0 : page = &tbm->entry1;
1179 0 : if (page->blockno != pageno)
1180 0 : return NULL;
1181 : Assert(!page->ischunk);
1182 0 : return page;
1183 : }
1184 :
1185 5080 : page = pagetable_lookup(tbm->pagetable, pageno);
1186 5080 : if (page == NULL)
1187 524 : return NULL;
1188 4556 : if (page->ischunk)
1189 0 : return NULL; /* don't want a lossy chunk header */
1190 4556 : return page;
1191 : }
1192 :
/*
 * tbm_get_pageentry - find or create a PagetableEntry for the pageno
 *
 * If new, the entry is marked as an exact (non-chunk) entry.
 *
 * This may cause the table to exceed the desired memory size.  It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
static PagetableEntry *
tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	bool		found;

	if (tbm->status == TBM_EMPTY)
	{
		/* Use the fixed slot */
		page = &tbm->entry1;
		found = false;
		tbm->status = TBM_ONE_PAGE;
	}
	else
	{
		if (tbm->status == TBM_ONE_PAGE)
		{
			page = &tbm->entry1;
			if (page->blockno == pageno)
				return page;
			/* Time to switch from one page to a hashtable */
			tbm_create_pagetable(tbm);
		}

		/* Look up or create an entry */
		page = pagetable_insert(tbm->pagetable, pageno, &found);
	}

	/* Initialize it if not present before */
	if (!found)
	{
		/*
		 * Preserve page->status across the zeroing --- presumably this is
		 * the hashtable's per-entry bookkeeping byte, which must survive
		 * our reset of the rest of the entry.  (TODO confirm against the
		 * pagetable_insert implementation.)
		 */
		char		oldstatus = page->status;

		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = pageno;
		/* must count it too */
		tbm->nentries++;
		tbm->npages++;
	}

	return page;
}
1244 :
1245 : /*
1246 : * tbm_page_is_lossy - is the page marked as lossily stored?
1247 : */
1248 : static bool
1249 7188908 : tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
1250 : {
1251 : PagetableEntry *page;
1252 : BlockNumber chunk_pageno;
1253 : int bitno;
1254 :
1255 : /* we can skip the lookup if there are no lossy chunks */
1256 7188908 : if (tbm->nchunks == 0)
1257 7068278 : return false;
1258 : Assert(tbm->status == TBM_HASH);
1259 :
1260 120630 : bitno = pageno % PAGES_PER_CHUNK;
1261 120630 : chunk_pageno = pageno - bitno;
1262 :
1263 120630 : page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1264 :
1265 120630 : if (page != NULL && page->ischunk)
1266 : {
1267 12432 : int wordnum = WORDNUM(bitno);
1268 12432 : int bitnum = BITNUM(bitno);
1269 :
1270 12432 : if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1271 5712 : return true;
1272 : }
1273 114918 : return false;
1274 : }
1275 :
/*
 * tbm_mark_page_lossy - mark the page number as lossily stored
 *
 * This may cause the table to exceed the desired memory size.  It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
static void
tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	bool		found;
	BlockNumber chunk_pageno;
	int			bitno;
	int			wordnum;
	int			bitnum;

	/* We force the bitmap into hashtable mode whenever it's lossy */
	if (tbm->status != TBM_HASH)
		tbm_create_pagetable(tbm);

	/* Find the chunk-header page covering pageno, and our bit within it */
	bitno = pageno % PAGES_PER_CHUNK;
	chunk_pageno = pageno - bitno;

	/*
	 * Remove any extant non-lossy entry for the page.  If the page is its own
	 * chunk header, however, we skip this and handle the case below.
	 */
	if (bitno != 0)
	{
		if (pagetable_delete(tbm->pagetable, pageno))
		{
			/* It was present, so adjust counts */
			tbm->nentries--;
			tbm->npages--;		/* assume it must have been non-lossy */
		}
	}

	/* Look up or create entry for chunk-header page */
	page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);

	/* Initialize it if not present before */
	if (!found)
	{
		/*
		 * Preserve page->status across the zeroing --- presumably the
		 * hashtable's per-entry state byte; TODO confirm against the
		 * pagetable implementation.
		 */
		char		oldstatus = page->status;

		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* must count it too */
		tbm->nentries++;
		tbm->nchunks++;
	}
	else if (!page->ischunk)
	{
		char		oldstatus = page->status;

		/* chunk header page was formerly non-lossy, make it lossy */
		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* we assume it had some tuple bit(s) set, so mark it lossy */
		page->words[0] = ((bitmapword) 1 << 0);
		/* adjust counts: one exact page became the lossy chunk header */
		tbm->nchunks++;
		tbm->npages--;
	}

	/* Now set the original target page's bit */
	wordnum = WORDNUM(bitno);
	bitnum = BITNUM(bitno);
	page->words[wordnum] |= ((bitmapword) 1 << bitnum);
}
1350 :
/*
 * tbm_lossify - lose some information to get back under the memory limit
 *
 * Converts exact per-page entries into lossy chunk bits until nentries
 * drops to maxentries/2, or no further reduction is possible.
 */
static void
tbm_lossify(TIDBitmap *tbm)
{
	pagetable_iterator i;
	PagetableEntry *page;

	/*
	 * XXX Really stupid implementation: this just lossifies pages in
	 * essentially random order.  We should be paying some attention to the
	 * number of bits set in each page, instead.
	 *
	 * Since we are called as soon as nentries exceeds maxentries, we should
	 * push nentries down to significantly less than maxentries, or else we'll
	 * just end up doing this again very soon.  We shoot for maxentries/2.
	 */
	Assert(tbm->iterating == TBM_NOT_ITERATING);
	Assert(tbm->status == TBM_HASH);

	/* Resume scanning where the previous lossify round stopped */
	pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
	while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
	{
		if (page->ischunk)
			continue;			/* already a chunk header */

		/*
		 * If the page would become a chunk header, we won't save anything by
		 * converting it to lossy, so skip it.
		 */
		if ((page->blockno % PAGES_PER_CHUNK) == 0)
			continue;

		/* This does the dirty work ... */
		tbm_mark_page_lossy(tbm, page->blockno);

		if (tbm->nentries <= tbm->maxentries / 2)
		{
			/*
			 * We have made enough room. Remember where to start lossifying
			 * next round, so we evenly iterate over the hashtable.
			 */
			tbm->lossify_start = i.cur;
			break;
		}

		/*
		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
		 * hashtable and may have deleted the non-lossy chunk.  We can
		 * continue the same hash table scan, since failure to visit one
		 * element or visiting the newly inserted element, isn't fatal.
		 */
	}

	/*
	 * With a big bitmap and small work_mem, it's possible that we cannot get
	 * under maxentries.  Again, if that happens, we'd end up uselessly
	 * calling tbm_lossify over and over.  To prevent this from becoming a
	 * performance sink, force maxentries up to at least double the current
	 * number of entries.  (In essence, we're admitting inability to fit
	 * within work_mem when we do this.)  Note that this test will not fire if
	 * we broke out of the loop early; and if we didn't, the current number of
	 * entries is simply not reducible any further.
	 */
	if (tbm->nentries > tbm->maxentries / 2)
		tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
}
1419 :
1420 : /*
1421 : * qsort comparator to handle PagetableEntry pointers.
1422 : */
1423 : static int
1424 1369668 : tbm_comparator(const void *left, const void *right)
1425 : {
1426 1369668 : BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1427 1369668 : BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1428 :
1429 1369668 : return pg_cmp_u32(l, r);
1430 : }
1431 :
1432 : /*
1433 : * As above, but this will get index into PagetableEntry array. Therefore,
1434 : * it needs to get actual PagetableEntry using the index before comparing the
1435 : * blockno.
1436 : */
1437 : static int
1438 238290 : tbm_shared_comparator(const void *left, const void *right, void *arg)
1439 : {
1440 238290 : PagetableEntry *base = (PagetableEntry *) arg;
1441 238290 : PagetableEntry *lpage = &base[*(int *) left];
1442 238290 : PagetableEntry *rpage = &base[*(int *) right];
1443 :
1444 238290 : if (lpage->blockno < rpage->blockno)
1445 108966 : return -1;
1446 129324 : else if (lpage->blockno > rpage->blockno)
1447 129324 : return 1;
1448 0 : return 0;
1449 : }
1450 :
1451 : /*
1452 : * tbm_attach_shared_iterate
1453 : *
1454 : * Allocate a backend-private iterator and attach the shared iterator state
1455 : * to it so that multiple processed can iterate jointly.
1456 : *
1457 : * We also converts the DSA pointers to local pointers and store them into
1458 : * our private iterator.
1459 : */
1460 : TBMSharedIterator *
1461 676 : tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
1462 : {
1463 : TBMSharedIterator *iterator;
1464 : TBMSharedIteratorState *istate;
1465 :
1466 : /*
1467 : * Create the TBMSharedIterator struct, with enough trailing space to
1468 : * serve the needs of the TBMIterateResult sub-struct.
1469 : */
1470 676 : iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
1471 : MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
1472 :
1473 676 : istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
1474 :
1475 676 : iterator->state = istate;
1476 :
1477 676 : iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
1478 :
1479 676 : if (istate->npages)
1480 676 : iterator->ptpages = dsa_get_address(dsa, istate->spages);
1481 676 : if (istate->nchunks)
1482 60 : iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
1483 :
1484 676 : return iterator;
1485 : }
1486 :
1487 : /*
1488 : * pagetable_allocate
1489 : *
1490 : * Callback function for allocating the memory for hashtable elements.
1491 : * Allocate memory for hashtable elements, using DSA if available.
1492 : */
1493 : static inline void *
1494 8510 : pagetable_allocate(pagetable_hash *pagetable, Size size)
1495 : {
1496 8510 : TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1497 : PTEntryArray *ptbase;
1498 :
1499 8510 : if (tbm->dsa == NULL)
1500 8354 : return MemoryContextAllocExtended(pagetable->ctx, size,
1501 : MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
1502 :
1503 : /*
1504 : * Save the dsapagetable reference in dsapagetableold before allocating
1505 : * new memory so that pagetable_free can free the old entry.
1506 : */
1507 156 : tbm->dsapagetableold = tbm->dsapagetable;
1508 156 : tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
1509 : sizeof(PTEntryArray) + size,
1510 : DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
1511 156 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
1512 :
1513 156 : return ptbase->ptentry;
1514 : }
1515 :
1516 : /*
1517 : * pagetable_free
1518 : *
1519 : * Callback function for freeing hash table elements.
1520 : */
1521 : static inline void
1522 8510 : pagetable_free(pagetable_hash *pagetable, void *pointer)
1523 : {
1524 8510 : TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1525 :
1526 : /* pfree the input pointer if DSA is not available */
1527 8510 : if (tbm->dsa == NULL)
1528 8354 : pfree(pointer);
1529 156 : else if (DsaPointerIsValid(tbm->dsapagetableold))
1530 : {
1531 84 : dsa_free(tbm->dsa, tbm->dsapagetableold);
1532 84 : tbm->dsapagetableold = InvalidDsaPointer;
1533 : }
1534 8510 : }
1535 :
1536 : /*
1537 : * tbm_calculate_entries
1538 : *
1539 : * Estimate number of hashtable entries we can have within maxbytes.
1540 : */
1541 : long
1542 573672 : tbm_calculate_entries(double maxbytes)
1543 : {
1544 : long nbuckets;
1545 :
1546 : /*
1547 : * Estimate number of hashtable entries we can have within maxbytes. This
1548 : * estimates the hash cost as sizeof(PagetableEntry), which is good enough
1549 : * for our purpose. Also count an extra Pointer per entry for the arrays
1550 : * created during iteration readout.
1551 : */
1552 573672 : nbuckets = maxbytes /
1553 : (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
1554 573672 : nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
1555 573672 : nbuckets = Max(nbuckets, 16); /* sanity limit */
1556 :
1557 573672 : return nbuckets;
1558 : }
|