Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tidbitmap.c
4 : * PostgreSQL tuple-id (TID) bitmap package
5 : *
6 : * This module provides bitmap data structures that are spiritually
7 : * similar to Bitmapsets, but are specially adapted to store sets of
8 : * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9 : * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10 : * Also, since we wish to be able to store very large tuple sets in
11 : * memory with this data structure, we support "lossy" storage, in which
12 : * we no longer remember individual tuple offsets on a page but only the
13 : * fact that a particular page needs to be visited.
14 : *
15 : * The "lossy" storage uses one bit per disk page, so at the standard 8K
16 : * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17 : * of memory. People pushing around tables of that size should have a
18 : * couple of Mb to spare, so we don't worry about providing a second level
19 : * of lossiness. In theory we could fall back to page ranges at some
20 : * point, but for now that seems useless complexity.
21 : *
22 : * We also support the notion of candidate matches, or rechecking. This
23 : * means we know that a search need visit only some tuples on a page,
24 : * but we are not certain that all of those tuples are real matches.
25 : * So the eventual heap scan must recheck the quals for these tuples only,
26 : * rather than rechecking the quals for all tuples on the page as in the
27 : * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28 : * into a bitmap, and it can also happen internally when we AND a lossy
29 : * and a non-lossy page.
30 : *
31 : *
32 : * Copyright (c) 2003-2025, PostgreSQL Global Development Group
33 : *
34 : * IDENTIFICATION
35 : * src/backend/nodes/tidbitmap.c
36 : *
37 : *-------------------------------------------------------------------------
38 : */
39 : #include "postgres.h"
40 :
41 : #include <limits.h>
42 :
43 : #include "access/htup_details.h"
44 : #include "common/hashfn.h"
45 : #include "common/int.h"
46 : #include "nodes/bitmapset.h"
47 : #include "nodes/tidbitmap.h"
48 : #include "storage/lwlock.h"
49 : #include "utils/dsa.h"
50 :
51 : /*
52 : * The maximum number of tuples per page is not large (typically 256 with
53 : * 8K pages, or 1024 with 32K pages). So there's not much point in making
54 : * the per-page bitmaps variable size. We just legislate that the size
55 : * is this:
56 : */
57 : #define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage
58 :
59 : /*
60 : * When we have to switch over to lossy storage, we use a data structure
61 : * with one bit per page, where all pages having the same number DIV
62 : * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
63 : * and has the bit set for a given page, there must not be a per-page entry
64 : * for that page in the page table.
65 : *
66 : * We actually store both exact pages and lossy chunks in the same hash
67 : * table, using identical data structures. (This is because the memory
68 : * management for hashtables doesn't easily/efficiently allow space to be
69 : * transferred easily from one hashtable to another.) Therefore it's best
70 : * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
71 : * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
72 : * avoid expensive integer remainder operations. So, define it like this:
73 : */
74 : #define PAGES_PER_CHUNK (BLCKSZ / 32)
75 :
76 : /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
77 :
78 : #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
79 : #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
80 :
81 : /* number of active words for an exact page: */
82 : #define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
83 : /* number of active words for a lossy chunk: */
84 : #define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
85 :
/*
 * The hashtable entries are represented by this data structure.  For
 * an exact page, blockno is the page number and bit k of the bitmap
 * represents tuple offset k+1.  For a lossy chunk, blockno is the first
 * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
 * bit k represents page blockno+k.  Note that it is not possible to
 * have exact storage for the first page of a chunk if we are using
 * lossy storage for any page in the chunk's range, since the same
 * hashtable entry has to serve both purposes.
 *
 * recheck is used only on exact pages --- it indicates that although
 * only the stated tuples need be checked, the full index qual condition
 * must be checked for each (ie, these are candidate matches).
 */
typedef struct PagetableEntry
{
	BlockNumber blockno;		/* page number (hashtable key) */
	char		status;			/* hash entry status (owned by simplehash) */
	bool		ischunk;		/* T = lossy storage, F = exact */
	bool		recheck;		/* should the tuples be rechecked? */
	/* one bit array sized to serve both the exact-page and chunk layouts */
	bitmapword	words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
} PagetableEntry;
108 :
/*
 * Holds array of pagetable entries, shared via DSA for parallel bitmap scans.
 */
typedef struct PTEntryArray
{
	pg_atomic_uint32 refcount;	/* number of iterators attached */
	PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];	/* the entries proper */
} PTEntryArray;
117 :
/*
 * We want to avoid the overhead of creating the hashtable, which is
 * comparatively large, when not necessary.  Particularly when we are using a
 * bitmap scan on the inside of a nestloop join: a bitmap may well live only
 * long enough to accumulate one entry in such cases.  We therefore avoid
 * creating an actual hashtable until we need two pagetable entries.  When
 * just one pagetable entry is needed, we store it in a fixed field of
 * TIDBitMap.  (NOTE: we don't get rid of the hashtable if the bitmap later
 * shrinks down to zero or one page again.  So, status can be TBM_HASH even
 * when nentries is zero or one.)
 */
typedef enum
{
	TBM_EMPTY,					/* no hashtable, nentries == 0 */
	TBM_ONE_PAGE,				/* entry1 contains the single entry */
	TBM_HASH,					/* pagetable is valid, entry1 is not */
} TBMStatus;
135 :
/*
 * Current iterating state of the TBM.  Once a bitmap leaves
 * TBM_NOT_ITERATING its contents must no longer be modified.
 */
typedef enum
{
	TBM_NOT_ITERATING,			/* not yet converted to page and chunk array */
	TBM_ITERATING_PRIVATE,		/* converted to local page and chunk array */
	TBM_ITERATING_SHARED,		/* converted to shared page and chunk array */
} TBMIteratingState;
145 :
/*
 * Here is the representation for a whole TIDBitMap:
 */
struct TIDBitmap
{
	NodeTag		type;			/* to make it a valid Node */
	MemoryContext mcxt;			/* memory context containing me */
	TBMStatus	status;			/* see codes above */
	struct pagetable_hash *pagetable;	/* hash table of PagetableEntry's */
	int			nentries;		/* number of entries in pagetable */
	int			maxentries;		/* limit on same to meet maxbytes */
	int			npages;			/* number of exact entries in pagetable */
	int			nchunks;		/* number of lossy entries in pagetable */
	TBMIteratingState iterating;	/* see TBMIteratingState above */
	uint32		lossify_start;	/* offset to start lossifying hashtable at */
	PagetableEntry entry1;		/* used when status == TBM_ONE_PAGE */
	/* these are valid when iterating is true: */
	PagetableEntry **spages;	/* sorted exact-page list, or NULL */
	PagetableEntry **schunks;	/* sorted lossy-chunk list, or NULL */
	dsa_pointer dsapagetable;	/* dsa_pointer to the element array */
	dsa_pointer dsapagetableold;	/* dsa_pointer to the old element array */
	dsa_pointer ptpages;		/* dsa_pointer to the page array */
	dsa_pointer ptchunks;		/* dsa_pointer to the chunk array */
	dsa_area   *dsa;			/* reference to per-query dsa area */
};
171 :
/*
 * When iterating over a backend-local bitmap in sorted order, a
 * TBMPrivateIterator is used to track our progress.  There can be several
 * iterators scanning the same bitmap concurrently.  Note that the bitmap
 * becomes read-only as soon as any iterator is created.
 */
struct TBMPrivateIterator
{
	TIDBitmap  *tbm;			/* TIDBitmap we're iterating over */
	int			spageptr;		/* next spages index */
	int			schunkptr;		/* next schunks index */
	int			schunkbit;		/* next bit to check in current schunk */
	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
};
186 :
/*
 * Holds the shared members of the iterator so that multiple processes
 * can jointly iterate.  Lives in DSA memory; see tbm_prepare_shared_iterate.
 */
typedef struct TBMSharedIteratorState
{
	int			nentries;		/* number of entries in pagetable */
	int			maxentries;		/* limit on same to meet maxbytes */
	int			npages;			/* number of exact entries in pagetable */
	int			nchunks;		/* number of lossy entries in pagetable */
	dsa_pointer pagetable;		/* dsa pointers to head of pagetable data */
	dsa_pointer spages;			/* dsa pointer to page array */
	dsa_pointer schunks;		/* dsa pointer to chunk array */
	LWLock		lock;			/* lock to protect the following members */
	int			spageptr;		/* next spages index */
	int			schunkptr;		/* next schunks index */
	int			schunkbit;		/* next bit to check in current schunk */
} TBMSharedIteratorState;
205 :
/*
 * pagetable iteration array: sorted indexes into a PTEntryArray's
 * ptentry[] array, shared across the cooperating processes.
 */
typedef struct PTIterationArray
{
	pg_atomic_uint32 refcount;	/* number of iterators attached */
	int			index[FLEXIBLE_ARRAY_MEMBER];	/* indexes into ptentry[] */
} PTIterationArray;
214 :
/*
 * same as TBMPrivateIterator, but it is used for joint iteration, therefore
 * this also holds a reference to the shared state.
 */
struct TBMSharedIterator
{
	TBMSharedIteratorState *state;	/* shared state */
	PTEntryArray *ptbase;		/* pagetable element array */
	PTIterationArray *ptpages;	/* sorted exact page index list */
	PTIterationArray *ptchunks; /* sorted lossy page index list */
	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
};
227 :
228 : /* Local function prototypes */
229 : static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
230 : static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
231 : const TIDBitmap *b);
232 : static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
233 : BlockNumber pageno);
234 : static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
235 : static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
236 : static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
237 : static void tbm_lossify(TIDBitmap *tbm);
238 : static int tbm_comparator(const void *left, const void *right);
239 : static int tbm_shared_comparator(const void *left, const void *right,
240 : void *arg);
241 :
242 : /* define hashtable mapping block numbers to PagetableEntry's */
243 : #define SH_USE_NONDEFAULT_ALLOCATOR
244 : #define SH_PREFIX pagetable
245 : #define SH_ELEMENT_TYPE PagetableEntry
246 : #define SH_KEY_TYPE BlockNumber
247 : #define SH_KEY blockno
248 : #define SH_HASH_KEY(tb, key) murmurhash32(key)
249 : #define SH_EQUAL(tb, a, b) a == b
250 : #define SH_SCOPE static inline
251 : #define SH_DEFINE
252 : #define SH_DECLARE
253 : #include "lib/simplehash.h"
254 :
255 :
256 : /*
257 : * tbm_create - create an initially-empty bitmap
258 : *
259 : * The bitmap will live in the memory context that is CurrentMemoryContext
260 : * at the time of this call. It will be limited to (approximately) maxbytes
261 : * total memory consumption. If the DSA passed to this function is not NULL
262 : * then the memory for storing elements of the underlying page table will
263 : * be allocated from the DSA.
264 : */
265 : TIDBitmap *
266 19956 : tbm_create(long maxbytes, dsa_area *dsa)
267 : {
268 : TIDBitmap *tbm;
269 :
270 : /* Create the TIDBitmap struct and zero all its fields */
271 19956 : tbm = makeNode(TIDBitmap);
272 :
273 19956 : tbm->mcxt = CurrentMemoryContext;
274 19956 : tbm->status = TBM_EMPTY;
275 :
276 19956 : tbm->maxentries = (int) tbm_calculate_entries(maxbytes);
277 19956 : tbm->lossify_start = 0;
278 19956 : tbm->dsa = dsa;
279 19956 : tbm->dsapagetable = InvalidDsaPointer;
280 19956 : tbm->dsapagetableold = InvalidDsaPointer;
281 19956 : tbm->ptpages = InvalidDsaPointer;
282 19956 : tbm->ptchunks = InvalidDsaPointer;
283 :
284 19956 : return tbm;
285 : }
286 :
/*
 * Actually create the hashtable.  Since this is a moderately expensive
 * proposition, we don't do it until we have to.
 */
static void
tbm_create_pagetable(TIDBitmap *tbm)
{
	Assert(tbm->status != TBM_HASH);
	Assert(tbm->pagetable == NULL);

	tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);

	/* If entry1 is valid, push it into the hashtable */
	if (tbm->status == TBM_ONE_PAGE)
	{
		PagetableEntry *page;
		bool		found;
		char		oldstatus;

		page = pagetable_insert(tbm->pagetable,
								tbm->entry1.blockno,
								&found);
		Assert(!found);

		/*
		 * Copy the fixed entry wholesale over the new hash slot, but
		 * preserve the slot's status byte, which pagetable_insert just set
		 * and which belongs to the hashtable rather than to our payload.
		 */
		oldstatus = page->status;
		memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
		page->status = oldstatus;
	}

	tbm->status = TBM_HASH;
}
317 :
318 : /*
319 : * tbm_free - free a TIDBitmap
320 : */
321 : void
322 19842 : tbm_free(TIDBitmap *tbm)
323 : {
324 19842 : if (tbm->pagetable)
325 8304 : pagetable_destroy(tbm->pagetable);
326 19842 : if (tbm->spages)
327 5254 : pfree(tbm->spages);
328 19842 : if (tbm->schunks)
329 2880 : pfree(tbm->schunks);
330 19842 : pfree(tbm);
331 19842 : }
332 :
333 : /*
334 : * tbm_free_shared_area - free shared state
335 : *
336 : * Free shared iterator state, Also free shared pagetable and iterator arrays
337 : * memory if they are not referred by any of the shared iterator i.e recount
338 : * is becomes 0.
339 : */
340 : void
341 108 : tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
342 : {
343 108 : TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
344 : PTEntryArray *ptbase;
345 : PTIterationArray *ptpages;
346 : PTIterationArray *ptchunks;
347 :
348 108 : if (DsaPointerIsValid(istate->pagetable))
349 : {
350 108 : ptbase = dsa_get_address(dsa, istate->pagetable);
351 108 : if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
352 54 : dsa_free(dsa, istate->pagetable);
353 : }
354 108 : if (DsaPointerIsValid(istate->spages))
355 : {
356 108 : ptpages = dsa_get_address(dsa, istate->spages);
357 108 : if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
358 54 : dsa_free(dsa, istate->spages);
359 : }
360 108 : if (DsaPointerIsValid(istate->schunks))
361 : {
362 0 : ptchunks = dsa_get_address(dsa, istate->schunks);
363 0 : if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
364 0 : dsa_free(dsa, istate->schunks);
365 : }
366 :
367 108 : dsa_free(dsa, dp);
368 108 : }
369 :
/*
 * tbm_add_tuples - add some tuple IDs to a TIDBitmap
 *
 * If recheck is true, then the recheck flag will be set in the
 * TBMIterateResult when any of these tuples are reported out.
 */
void
tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
			   bool recheck)
{
	BlockNumber currblk = InvalidBlockNumber;
	PagetableEntry *page = NULL;	/* only valid when currblk is valid */
	int			i;

	/* the bitmap must not be modified once iteration has begun */
	Assert(tbm->iterating == TBM_NOT_ITERATING);
	for (i = 0; i < ntids; i++)
	{
		BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
		OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
		int			wordnum,
					bitnum;

		/* safety check to ensure we don't overrun bit array bounds */
		if (off < 1 || off > MAX_TUPLES_PER_PAGE)
			elog(ERROR, "tuple offset out of range: %u", off);

		/*
		 * Look up target page unless we already did.  This saves cycles when
		 * the input includes consecutive tuples on the same page, which is
		 * common enough to justify an extra test here.
		 */
		if (blk != currblk)
		{
			if (tbm_page_is_lossy(tbm, blk))
				page = NULL;	/* remember page is lossy */
			else
				page = tbm_get_pageentry(tbm, blk);
			currblk = blk;
		}

		if (page == NULL)
			continue;			/* whole page is already marked */

		if (page->ischunk)
		{
			/* The page is a lossy chunk header, set bit for itself */
			wordnum = bitnum = 0;
		}
		else
		{
			/* Page is exact, so set bit for individual tuple */
			wordnum = WORDNUM(off - 1);
			bitnum = BITNUM(off - 1);
		}
		page->words[wordnum] |= ((bitmapword) 1 << bitnum);
		page->recheck |= recheck;

		if (tbm->nentries > tbm->maxentries)
		{
			tbm_lossify(tbm);
			/* Page could have been converted to lossy, so force new lookup */
			currblk = InvalidBlockNumber;
		}
	}
}
435 :
436 : /*
437 : * tbm_add_page - add a whole page to a TIDBitmap
438 : *
439 : * This causes the whole page to be reported (with the recheck flag)
440 : * when the TIDBitmap is scanned.
441 : */
442 : void
443 147980 : tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
444 : {
445 : /* Enter the page in the bitmap, or mark it lossy if already present */
446 147980 : tbm_mark_page_lossy(tbm, pageno);
447 : /* If we went over the memory limit, lossify some more pages */
448 147980 : if (tbm->nentries > tbm->maxentries)
449 0 : tbm_lossify(tbm);
450 147980 : }
451 :
452 : /*
453 : * tbm_union - set union
454 : *
455 : * a is modified in-place, b is not changed
456 : */
457 : void
458 0 : tbm_union(TIDBitmap *a, const TIDBitmap *b)
459 : {
460 : Assert(!a->iterating);
461 : /* Nothing to do if b is empty */
462 0 : if (b->nentries == 0)
463 0 : return;
464 : /* Scan through chunks and pages in b, merge into a */
465 0 : if (b->status == TBM_ONE_PAGE)
466 0 : tbm_union_page(a, &b->entry1);
467 : else
468 : {
469 : pagetable_iterator i;
470 : PagetableEntry *bpage;
471 :
472 : Assert(b->status == TBM_HASH);
473 0 : pagetable_start_iterate(b->pagetable, &i);
474 0 : while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
475 0 : tbm_union_page(a, bpage);
476 : }
477 : }
478 :
/* Process one page of b during a union op */
static void
tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
{
	PagetableEntry *apage;
	int			wordnum;

	if (bpage->ischunk)
	{
		/* Scan b's chunk, mark each indicated page lossy in a */
		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
		{
			bitmapword	w = bpage->words[wordnum];

			if (w != 0)
			{
				BlockNumber pg;

				/* first page number covered by this word */
				pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
				while (w != 0)
				{
					if (w & 1)
						tbm_mark_page_lossy(a, pg);
					pg++;
					w >>= 1;
				}
			}
		}
	}
	else if (tbm_page_is_lossy(a, bpage->blockno))
	{
		/* page is already lossy in a, nothing to do */
		return;
	}
	else
	{
		apage = tbm_get_pageentry(a, bpage->blockno);
		if (apage->ischunk)
		{
			/* The page is a lossy chunk header, set bit for itself */
			apage->words[0] |= ((bitmapword) 1 << 0);
		}
		else
		{
			/* Both pages are exact, merge at the bit level */
			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
				apage->words[wordnum] |= bpage->words[wordnum];
			/* a candidate match in either input remains a candidate match */
			apage->recheck |= bpage->recheck;
		}
	}

	/* adding entries may have put us over the memory budget */
	if (a->nentries > a->maxentries)
		tbm_lossify(a);
}
533 :
/*
 * tbm_intersect - set intersection
 *
 * a is modified in-place, b is not changed
 */
void
tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
{
	Assert(!a->iterating);
	/* Nothing to do if a is empty */
	if (a->nentries == 0)
		return;
	/* Scan through chunks and pages in a, try to match to b */
	if (a->status == TBM_ONE_PAGE)
	{
		if (tbm_intersect_page(a, &a->entry1, b))
		{
			/* Page is now empty, remove it from a */
			Assert(!a->entry1.ischunk);
			a->npages--;
			a->nentries--;
			Assert(a->nentries == 0);
			a->status = TBM_EMPTY;
		}
	}
	else
	{
		pagetable_iterator i;
		PagetableEntry *apage;

		Assert(a->status == TBM_HASH);

		/*
		 * NOTE(review): this deletes entries from a's hashtable while
		 * iterating over it — presumably simplehash permits deleting the
		 * just-returned entry mid-scan; confirm against simplehash.h.
		 */
		pagetable_start_iterate(a->pagetable, &i);
		while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
		{
			if (tbm_intersect_page(a, apage, b))
			{
				/* Page or chunk is now empty, remove it from a */
				if (apage->ischunk)
					a->nchunks--;
				else
					a->npages--;
				a->nentries--;
				if (!pagetable_delete(a->pagetable, apage->blockno))
					elog(ERROR, "hash table corrupted");
			}
		}
	}
}
582 :
/*
 * Process one page of a during an intersection op
 *
 * Returns true if apage is now empty and should be deleted from a
 */
static bool
tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
{
	const PagetableEntry *bpage;
	int			wordnum;

	if (apage->ischunk)
	{
		/* Scan each bit in chunk, try to clear */
		bool		candelete = true;

		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
		{
			bitmapword	w = apage->words[wordnum];

			if (w != 0)
			{
				bitmapword	neww = w;
				BlockNumber pg;
				int			bitnum;

				/* first page number covered by this word */
				pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
				bitnum = 0;
				while (w != 0)
				{
					if (w & 1)
					{
						/* a page survives if b has it lossy OR exact */
						if (!tbm_page_is_lossy(b, pg) &&
							tbm_find_pageentry(b, pg) == NULL)
						{
							/* Page is not in b at all, lose lossy bit */
							neww &= ~((bitmapword) 1 << bitnum);
						}
					}
					pg++;
					bitnum++;
					w >>= 1;
				}
				apage->words[wordnum] = neww;
				if (neww != 0)
					candelete = false;
			}
		}
		return candelete;
	}
	else if (tbm_page_is_lossy(b, apage->blockno))
	{
		/*
		 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
		 * because the page 'b' is lossy, we don't know which ones. Therefore
		 * we mark 'a' as requiring rechecks, to indicate that at most those
		 * tuples set in 'a' are matches.
		 */
		apage->recheck = true;
		return false;
	}
	else
	{
		bool		candelete = true;

		bpage = tbm_find_pageentry(b, apage->blockno);
		if (bpage != NULL)
		{
			/* Both pages are exact, merge at the bit level */
			Assert(!bpage->ischunk);
			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
			{
				apage->words[wordnum] &= bpage->words[wordnum];
				if (apage->words[wordnum] != 0)
					candelete = false;
			}
			apage->recheck |= bpage->recheck;
		}
		/* If there is no matching b page, we can just delete the a page */
		return candelete;
	}
}
665 :
666 : /*
667 : * tbm_is_empty - is a TIDBitmap completely empty?
668 : */
669 : bool
670 742 : tbm_is_empty(const TIDBitmap *tbm)
671 : {
672 742 : return (tbm->nentries == 0);
673 : }
674 :
/*
 * tbm_begin_private_iterate - prepare to iterate through a TIDBitmap
 *
 * The TBMPrivateIterator struct is created in the caller's memory context.
 * For a clean shutdown of the iteration, call tbm_end_private_iterate; but
 * it's okay to just allow the memory context to be released, too.  It is
 * caller's responsibility not to touch the TBMPrivateIterator anymore once
 * the TIDBitmap is freed.
 *
 * NB: after this is called, it is no longer allowed to modify the contents
 * of the bitmap.  However, you can call this multiple times to scan the
 * contents repeatedly, including parallel scans.
 */
TBMPrivateIterator *
tbm_begin_private_iterate(TIDBitmap *tbm)
{
	TBMPrivateIterator *iterator;

	Assert(tbm->iterating != TBM_ITERATING_SHARED);

	/*
	 * Create the TBMPrivateIterator struct, with enough trailing space to
	 * serve the needs of the TBMIterateResult sub-struct.
	 */
	iterator = (TBMPrivateIterator *) palloc(sizeof(TBMPrivateIterator) +
											 MAX_TUPLES_PER_PAGE *
											 sizeof(OffsetNumber));
	iterator->tbm = tbm;

	/*
	 * Initialize iteration pointers.
	 */
	iterator->spageptr = 0;
	iterator->schunkptr = 0;
	iterator->schunkbit = 0;

	/*
	 * If we have a hashtable, create and fill the sorted page lists, unless
	 * we already did that for a previous iterator.  Note that the lists are
	 * attached to the bitmap not the iterator, so they can be used by more
	 * than one iterator.
	 */
	if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
	{
		pagetable_iterator i;
		PagetableEntry *page;
		int			npages;
		int			nchunks;

		/* the arrays may survive from a prior, fully-consumed iteration */
		if (!tbm->spages && tbm->npages > 0)
			tbm->spages = (PagetableEntry **)
				MemoryContextAlloc(tbm->mcxt,
								   tbm->npages * sizeof(PagetableEntry *));
		if (!tbm->schunks && tbm->nchunks > 0)
			tbm->schunks = (PagetableEntry **)
				MemoryContextAlloc(tbm->mcxt,
								   tbm->nchunks * sizeof(PagetableEntry *));

		/* partition hashtable entries into exact pages vs. lossy chunks */
		npages = nchunks = 0;
		pagetable_start_iterate(tbm->pagetable, &i);
		while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
		{
			if (page->ischunk)
				tbm->schunks[nchunks++] = page;
			else
				tbm->spages[npages++] = page;
		}
		Assert(npages == tbm->npages);
		Assert(nchunks == tbm->nchunks);

		/* sort both lists by block number so pages come out in order */
		if (npages > 1)
			qsort(tbm->spages, npages, sizeof(PagetableEntry *),
				  tbm_comparator);
		if (nchunks > 1)
			qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
				  tbm_comparator);
	}

	tbm->iterating = TBM_ITERATING_PRIVATE;

	return iterator;
}
756 :
/*
 * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
 *
 * The necessary shared state will be allocated from the DSA passed to
 * tbm_create, so that multiple processes can attach to it and iterate jointly.
 *
 * This will convert the pagetable hash into page and chunk array of the index
 * into pagetable array.
 */
dsa_pointer
tbm_prepare_shared_iterate(TIDBitmap *tbm)
{
	dsa_pointer dp;
	TBMSharedIteratorState *istate;
	PTEntryArray *ptbase = NULL;
	PTIterationArray *ptpages = NULL;
	PTIterationArray *ptchunks = NULL;

	Assert(tbm->dsa != NULL);
	Assert(tbm->iterating != TBM_ITERATING_PRIVATE);

	/*
	 * Allocate TBMSharedIteratorState from DSA to hold the shared members and
	 * lock, this will also be used by multiple worker for shared iterate.
	 */
	dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
	istate = dsa_get_address(tbm->dsa, dp);

	/*
	 * If we're not already iterating, create and fill the sorted page lists.
	 * (If we are, the sorted page lists are already stored in the TIDBitmap,
	 * and we can just reuse them.)
	 */
	if (tbm->iterating == TBM_NOT_ITERATING)
	{
		pagetable_iterator i;
		PagetableEntry *page;
		int			idx;
		int			npages;
		int			nchunks;

		/*
		 * Allocate the page and chunk array memory from the DSA to share
		 * across multiple processes.
		 */
		if (tbm->npages)
		{
			tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
										tbm->npages * sizeof(int));
			ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
			pg_atomic_init_u32(&ptpages->refcount, 0);
		}
		if (tbm->nchunks)
		{
			tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
										 tbm->nchunks * sizeof(int));
			ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
			pg_atomic_init_u32(&ptchunks->refcount, 0);
		}

		/*
		 * If TBM status is TBM_HASH then iterate over the pagetable and
		 * convert it to page and chunk arrays.  But if it's in the
		 * TBM_ONE_PAGE mode then directly allocate the space for one entry
		 * from the DSA.
		 */
		npages = nchunks = 0;
		if (tbm->status == TBM_HASH)
		{
			/* the elements already live in DSA; record their array indexes */
			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

			pagetable_start_iterate(tbm->pagetable, &i);
			while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
			{
				idx = page - ptbase->ptentry;
				if (page->ischunk)
					ptchunks->index[nchunks++] = idx;
				else
					ptpages->index[npages++] = idx;
			}

			Assert(npages == tbm->npages);
			Assert(nchunks == tbm->nchunks);
		}
		else if (tbm->status == TBM_ONE_PAGE)
		{
			/*
			 * In one page mode allocate the space for one pagetable entry,
			 * initialize it, and directly store its index (i.e. 0) in the
			 * page array.
			 */
			tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
											 sizeof(PagetableEntry));
			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
			memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
			ptpages->index[0] = 0;
		}

		if (ptbase != NULL)
			pg_atomic_init_u32(&ptbase->refcount, 0);
		/* sort the index arrays by block number, like the private path does */
		if (npages > 1)
			qsort_arg(ptpages->index, npages, sizeof(int),
					  tbm_shared_comparator, ptbase->ptentry);
		if (nchunks > 1)
			qsort_arg(ptchunks->index, nchunks, sizeof(int),
					  tbm_shared_comparator, ptbase->ptentry);
	}

	/*
	 * Store the TBM members in the shared state so that we can share them
	 * across multiple processes.
	 */
	istate->nentries = tbm->nentries;
	istate->maxentries = tbm->maxentries;
	istate->npages = tbm->npages;
	istate->nchunks = tbm->nchunks;
	istate->pagetable = tbm->dsapagetable;
	istate->spages = tbm->ptpages;
	istate->schunks = tbm->ptchunks;

	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
	ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
	ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);

	/*
	 * For every shared iterator referring to pagetable and iterator array,
	 * increase the refcount by 1 so that while freeing the shared iterator we
	 * don't free pagetable and iterator array until its refcount becomes 0.
	 */
	if (ptbase != NULL)
		pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
	if (ptpages != NULL)
		pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
	if (ptchunks != NULL)
		pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);

	/* Initialize the iterator lock */
	LWLockInitialize(&istate->lock, LWTRANCHE_SHARED_TIDBITMAP);

	/* Initialize the shared iterator state */
	istate->schunkbit = 0;
	istate->schunkptr = 0;
	istate->spageptr = 0;

	tbm->iterating = TBM_ITERATING_SHARED;

	return dp;
}
905 :
906 : /*
907 : * tbm_extract_page_tuple - extract the tuple offsets from a page
908 : *
909 : * The extracted offsets are stored into TBMIterateResult.
910 : */
911 : static inline int
912 474554 : tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
913 : {
914 : int wordnum;
915 474554 : int ntuples = 0;
916 :
917 2847324 : for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
918 : {
919 2372770 : bitmapword w = page->words[wordnum];
920 :
921 2372770 : if (w != 0)
922 : {
923 1108486 : int off = wordnum * BITS_PER_BITMAPWORD + 1;
924 :
925 48015272 : while (w != 0)
926 : {
927 46906786 : if (w & 1)
928 11467238 : output->offsets[ntuples++] = (OffsetNumber) off;
929 46906786 : off++;
930 46906786 : w >>= 1;
931 : }
932 : }
933 : }
934 :
935 474554 : return ntuples;
936 : }
937 :
938 : /*
939 : * tbm_advance_schunkbit - Advance the schunkbit
940 : */
941 : static inline void
942 333616 : tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
943 : {
944 333616 : int schunkbit = *schunkbitp;
945 :
946 1530444 : while (schunkbit < PAGES_PER_CHUNK)
947 : {
948 1524540 : int wordnum = WORDNUM(schunkbit);
949 1524540 : int bitnum = BITNUM(schunkbit);
950 :
951 1524540 : if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
952 327712 : break;
953 1196828 : schunkbit++;
954 : }
955 :
956 333616 : *schunkbitp = schunkbit;
957 333616 : }
958 :
959 : /*
960 : * tbm_private_iterate - scan through next page of a TIDBitmap
961 : *
962 : * Returns a TBMIterateResult representing one page, or NULL if there are
963 : * no more pages to scan. Pages are guaranteed to be delivered in numerical
964 : * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to
965 : * remember the exact tuples to look at on this page --- the caller must
966 : * examine all tuples on the page and check if they meet the intended
967 : * condition. If result->recheck is true, only the indicated tuples need
968 : * be examined, but the condition must be rechecked anyway. (For ease of
969 : * testing, recheck is always set true when ntuples < 0.)
970 : */
TBMIterateResult *
tbm_private_iterate(TBMPrivateIterator *iterator)
{
	TIDBitmap  *tbm = iterator->tbm;
	TBMIterateResult *output = &(iterator->output);

	Assert(tbm->iterating == TBM_ITERATING_PRIVATE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		int			schunkbit = iterator->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			iterator->schunkbit = schunkbit;
			break;
		}
		/* this chunk is exhausted; advance to next chunk */
		iterator->schunkptr++;
		iterator->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		BlockNumber chunk_blockno;

		/* chunk header's blockno plus the bit index gives the page number */
		chunk_blockno = chunk->blockno + iterator->schunkbit;
		if (iterator->spageptr >= tbm->npages ||
			chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			iterator->schunkbit++;
			return output;
		}
	}

	if (iterator->spageptr < tbm->npages)
	{
		PagetableEntry *page;
		int			ntuples;

		/* In TBM_ONE_PAGE state, we don't allocate an spages[] array */
		if (tbm->status == TBM_ONE_PAGE)
			page = &tbm->entry1;
		else
			page = tbm->spages[iterator->spageptr];

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		iterator->spageptr++;
		return output;
	}

	/* Nothing more in the bitmap */
	return NULL;
}
1044 :
1045 : /*
1046 : * tbm_shared_iterate - scan through next page of a TIDBitmap
1047 : *
1048 : * As above, but this will iterate using an iterator which is shared
1049 : * across multiple processes. We need to acquire the iterator LWLock,
1050 : * before accessing the shared members.
1051 : */
TBMIterateResult *
tbm_shared_iterate(TBMSharedIterator *iterator)
{
	TBMIterateResult *output = &iterator->output;
	TBMSharedIteratorState *istate = iterator->state;
	PagetableEntry *ptbase = NULL;
	int		   *idxpages = NULL;
	int		   *idxchunks = NULL;

	/*
	 * These base pointers were localized in tbm_attach_shared_iterate and
	 * live in backend-private memory, so no lock is needed to read them.
	 */
	if (iterator->ptbase != NULL)
		ptbase = iterator->ptbase->ptentry;
	if (iterator->ptpages != NULL)
		idxpages = iterator->ptpages->index;
	if (iterator->ptchunks != NULL)
		idxchunks = iterator->ptchunks->index;

	/* Acquire the LWLock before accessing the shared members */
	LWLockAcquire(&istate->lock, LW_EXCLUSIVE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		int			schunkbit = istate->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			istate->schunkbit = schunkbit;
			break;
		}
		/* this chunk is exhausted; advance to next chunk */
		istate->schunkptr++;
		istate->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		BlockNumber chunk_blockno;

		/* chunk header's blockno plus the bit index gives the page number */
		chunk_blockno = chunk->blockno + istate->schunkbit;

		if (istate->spageptr >= istate->npages ||
			chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			istate->schunkbit++;

			LWLockRelease(&istate->lock);
			return output;
		}
	}

	if (istate->spageptr < istate->npages)
	{
		PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
		int			ntuples;

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		istate->spageptr++;

		LWLockRelease(&istate->lock);

		return output;
	}

	LWLockRelease(&istate->lock);

	/* Nothing more in the bitmap */
	return NULL;
}
1138 :
1139 : /*
1140 : * tbm_end_private_iterate - finish an iteration over a TIDBitmap
1141 : *
1142 : * Currently this is just a pfree, but it might do more someday. (For
1143 : * instance, it could be useful to count open iterators and allow the
1144 : * bitmap to return to read/write status when there are no more iterators.)
1145 : */
void
tbm_end_private_iterate(TBMPrivateIterator *iterator)
{
	/* Free the backend-local iterator; the TIDBitmap itself is untouched */
	pfree(iterator);
}
1151 :
1152 : /*
1153 : * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
1154 : *
1155 : * This doesn't free any of the shared state associated with the iterator,
1156 : * just our backend-private state.
1157 : */
void
tbm_end_shared_iterate(TBMSharedIterator *iterator)
{
	/* Free only our backend-private struct; the shared DSA state remains */
	pfree(iterator);
}
1163 :
1164 : /*
1165 : * tbm_find_pageentry - find a PagetableEntry for the pageno
1166 : *
1167 : * Returns NULL if there is no non-lossy entry for the pageno.
1168 : */
1169 : static const PagetableEntry *
1170 7266 : tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
1171 : {
1172 : const PagetableEntry *page;
1173 :
1174 7266 : if (tbm->nentries == 0) /* in case pagetable doesn't exist */
1175 0 : return NULL;
1176 :
1177 7266 : if (tbm->status == TBM_ONE_PAGE)
1178 : {
1179 0 : page = &tbm->entry1;
1180 0 : if (page->blockno != pageno)
1181 0 : return NULL;
1182 : Assert(!page->ischunk);
1183 0 : return page;
1184 : }
1185 :
1186 7266 : page = pagetable_lookup(tbm->pagetable, pageno);
1187 7266 : if (page == NULL)
1188 1962 : return NULL;
1189 5304 : if (page->ischunk)
1190 0 : return NULL; /* don't want a lossy chunk header */
1191 5304 : return page;
1192 : }
1193 :
1194 : /*
1195 : * tbm_get_pageentry - find or create a PagetableEntry for the pageno
1196 : *
1197 : * If new, the entry is marked as an exact (non-chunk) entry.
1198 : *
1199 : * This may cause the table to exceed the desired memory size. It is
1200 : * up to the caller to call tbm_lossify() at the next safe point if so.
1201 : */
1202 : static PagetableEntry *
1203 7249452 : tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
1204 : {
1205 : PagetableEntry *page;
1206 : bool found;
1207 :
1208 7249452 : if (tbm->status == TBM_EMPTY)
1209 : {
1210 : /* Use the fixed slot */
1211 12792 : page = &tbm->entry1;
1212 12792 : found = false;
1213 12792 : tbm->status = TBM_ONE_PAGE;
1214 : }
1215 : else
1216 : {
1217 7236660 : if (tbm->status == TBM_ONE_PAGE)
1218 : {
1219 89024 : page = &tbm->entry1;
1220 89024 : if (page->blockno == pageno)
1221 83588 : return page;
1222 : /* Time to switch from one page to a hashtable */
1223 5436 : tbm_create_pagetable(tbm);
1224 : }
1225 :
1226 : /* Look up or create an entry */
1227 7153072 : page = pagetable_insert(tbm->pagetable, pageno, &found);
1228 : }
1229 :
1230 : /* Initialize it if not present before */
1231 7165864 : if (!found)
1232 : {
1233 267670 : char oldstatus = page->status;
1234 :
1235 1873690 : MemSet(page, 0, sizeof(PagetableEntry));
1236 267670 : page->status = oldstatus;
1237 267670 : page->blockno = pageno;
1238 : /* must count it too */
1239 267670 : tbm->nentries++;
1240 267670 : tbm->npages++;
1241 : }
1242 :
1243 7165864 : return page;
1244 : }
1245 :
1246 : /*
1247 : * tbm_page_is_lossy - is the page marked as lossily stored?
1248 : */
1249 : static bool
1250 7262796 : tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
1251 : {
1252 : PagetableEntry *page;
1253 : BlockNumber chunk_pageno;
1254 : int bitno;
1255 :
1256 : /* we can skip the lookup if there are no lossy chunks */
1257 7262796 : if (tbm->nchunks == 0)
1258 7141932 : return false;
1259 : Assert(tbm->status == TBM_HASH);
1260 :
1261 120864 : bitno = pageno % PAGES_PER_CHUNK;
1262 120864 : chunk_pageno = pageno - bitno;
1263 :
1264 120864 : page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1265 :
1266 120864 : if (page != NULL && page->ischunk)
1267 : {
1268 15600 : int wordnum = WORDNUM(bitno);
1269 15600 : int bitnum = BITNUM(bitno);
1270 :
1271 15600 : if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1272 6078 : return true;
1273 : }
1274 114786 : return false;
1275 : }
1276 :
1277 : /*
1278 : * tbm_mark_page_lossy - mark the page number as lossily stored
1279 : *
1280 : * This may cause the table to exceed the desired memory size. It is
1281 : * up to the caller to call tbm_lossify() at the next safe point if so.
1282 : */
static void
tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	bool		found;
	BlockNumber chunk_pageno;
	int			bitno;
	int			wordnum;
	int			bitnum;

	/* We force the bitmap into hashtable mode whenever it's lossy */
	if (tbm->status != TBM_HASH)
		tbm_create_pagetable(tbm);

	/* Map the page onto its chunk header and its bit within the chunk */
	bitno = pageno % PAGES_PER_CHUNK;
	chunk_pageno = pageno - bitno;

	/*
	 * Remove any extant non-lossy entry for the page.  If the page is its own
	 * chunk header, however, we skip this and handle the case below.
	 */
	if (bitno != 0)
	{
		if (pagetable_delete(tbm->pagetable, pageno))
		{
			/* It was present, so adjust counts */
			tbm->nentries--;
			tbm->npages--;		/* assume it must have been non-lossy */
		}
	}

	/* Look up or create entry for chunk-header page */
	page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);

	/* Initialize it if not present before */
	if (!found)
	{
		/* Zero the entry, but preserve the hashtable's own status byte */
		char		oldstatus = page->status;

		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* must count it too */
		tbm->nentries++;
		tbm->nchunks++;
	}
	else if (!page->ischunk)
	{
		char		oldstatus = page->status;

		/* chunk header page was formerly non-lossy, make it lossy */
		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* we assume it had some tuple bit(s) set, so mark it lossy */
		page->words[0] = ((bitmapword) 1 << 0);
		/* adjust counts: nentries is unchanged (exact entry became a chunk) */
		tbm->nchunks++;
		tbm->npages--;
	}

	/* Now set the original target page's bit */
	wordnum = WORDNUM(bitno);
	bitnum = BITNUM(bitno);
	page->words[wordnum] |= ((bitmapword) 1 << bitnum);
}
1351 :
1352 : /*
1353 : * tbm_lossify - lose some information to get back under the memory limit
1354 : */
static void
tbm_lossify(TIDBitmap *tbm)
{
	pagetable_iterator i;
	PagetableEntry *page;

	/*
	 * XXX Really stupid implementation: this just lossifies pages in
	 * essentially random order.  We should be paying some attention to the
	 * number of bits set in each page, instead.
	 *
	 * Since we are called as soon as nentries exceeds maxentries, we should
	 * push nentries down to significantly less than maxentries, or else we'll
	 * just end up doing this again very soon.  We shoot for maxentries/2.
	 */

	/* Lossification must not run while a readout is in progress */
	Assert(tbm->iterating == TBM_NOT_ITERATING);
	Assert(tbm->status == TBM_HASH);

	pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
	while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
	{
		if (page->ischunk)
			continue;			/* already a chunk header */

		/*
		 * If the page would become a chunk header, we won't save anything by
		 * converting it to lossy, so skip it.
		 */
		if ((page->blockno % PAGES_PER_CHUNK) == 0)
			continue;

		/* This does the dirty work ... */
		tbm_mark_page_lossy(tbm, page->blockno);

		if (tbm->nentries <= tbm->maxentries / 2)
		{
			/*
			 * We have made enough room.  Remember where to start lossifying
			 * next round, so we evenly iterate over the hashtable.
			 */
			tbm->lossify_start = i.cur;
			break;
		}

		/*
		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
		 * hashtable and may have deleted the non-lossy chunk.  We can
		 * continue the same hash table scan, since failure to visit one
		 * element or visiting the newly inserted element, isn't fatal.
		 */
	}

	/*
	 * With a big bitmap and small work_mem, it's possible that we cannot get
	 * under maxentries.  Again, if that happens, we'd end up uselessly
	 * calling tbm_lossify over and over.  To prevent this from becoming a
	 * performance sink, force maxentries up to at least double the current
	 * number of entries.  (In essence, we're admitting inability to fit
	 * within work_mem when we do this.)  Note that this test will not fire if
	 * we broke out of the loop early; and if we didn't, the current number of
	 * entries is simply not reducible any further.
	 */
	if (tbm->nentries > tbm->maxentries / 2)
		tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
}
1420 :
1421 : /*
1422 : * qsort comparator to handle PagetableEntry pointers.
1423 : */
1424 : static int
1425 1384418 : tbm_comparator(const void *left, const void *right)
1426 : {
1427 1384418 : BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1428 1384418 : BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1429 :
1430 1384418 : return pg_cmp_u32(l, r);
1431 : }
1432 :
1433 : /*
1434 : * As above, but this will get index into PagetableEntry array. Therefore,
1435 : * it needs to get actual PagetableEntry using the index before comparing the
1436 : * blockno.
1437 : */
1438 : static int
1439 238290 : tbm_shared_comparator(const void *left, const void *right, void *arg)
1440 : {
1441 238290 : PagetableEntry *base = (PagetableEntry *) arg;
1442 238290 : PagetableEntry *lpage = &base[*(int *) left];
1443 238290 : PagetableEntry *rpage = &base[*(int *) right];
1444 :
1445 238290 : if (lpage->blockno < rpage->blockno)
1446 108966 : return -1;
1447 129324 : else if (lpage->blockno > rpage->blockno)
1448 129324 : return 1;
1449 0 : return 0;
1450 : }
1451 :
1452 : /*
1453 : * tbm_attach_shared_iterate
1454 : *
 * Allocate a backend-private iterator and attach the shared iterator state
 * to it so that multiple processes can iterate jointly.
 *
 * We also convert the DSA pointers to local pointers and store them into
 * our private iterator.
1460 : */
TBMSharedIterator *
tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
{
	TBMSharedIterator *iterator;
	TBMSharedIteratorState *istate;

	/*
	 * Create the TBMSharedIterator struct, with enough trailing space to
	 * serve the needs of the TBMIterateResult sub-struct.
	 */
	iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
											 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));

	istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);

	iterator->state = istate;

	/* Localize the DSA pointers once, so iteration need not translate them */
	iterator->ptbase = dsa_get_address(dsa, istate->pagetable);

	/* The page and chunk index arrays only exist when nonempty */
	if (istate->npages)
		iterator->ptpages = dsa_get_address(dsa, istate->spages);
	if (istate->nchunks)
		iterator->ptchunks = dsa_get_address(dsa, istate->schunks);

	return iterator;
}
1487 :
1488 : /*
1489 : * pagetable_allocate
1490 : *
1491 : * Callback function for allocating the memory for hashtable elements.
1492 : * Allocate memory for hashtable elements, using DSA if available.
1493 : */
1494 : static inline void *
1495 8670 : pagetable_allocate(pagetable_hash *pagetable, Size size)
1496 : {
1497 8670 : TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1498 : PTEntryArray *ptbase;
1499 :
1500 8670 : if (tbm->dsa == NULL)
1501 8514 : return MemoryContextAllocExtended(pagetable->ctx, size,
1502 : MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
1503 :
1504 : /*
1505 : * Save the dsapagetable reference in dsapagetableold before allocating
1506 : * new memory so that pagetable_free can free the old entry.
1507 : */
1508 156 : tbm->dsapagetableold = tbm->dsapagetable;
1509 156 : tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
1510 : sizeof(PTEntryArray) + size,
1511 : DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
1512 156 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
1513 :
1514 156 : return ptbase->ptentry;
1515 : }
1516 :
1517 : /*
1518 : * pagetable_free
1519 : *
1520 : * Callback function for freeing hash table elements.
1521 : */
1522 : static inline void
1523 8670 : pagetable_free(pagetable_hash *pagetable, void *pointer)
1524 : {
1525 8670 : TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1526 :
1527 : /* pfree the input pointer if DSA is not available */
1528 8670 : if (tbm->dsa == NULL)
1529 8514 : pfree(pointer);
1530 156 : else if (DsaPointerIsValid(tbm->dsapagetableold))
1531 : {
1532 84 : dsa_free(tbm->dsa, tbm->dsapagetableold);
1533 84 : tbm->dsapagetableold = InvalidDsaPointer;
1534 : }
1535 8670 : }
1536 :
1537 : /*
1538 : * tbm_calculate_entries
1539 : *
1540 : * Estimate number of hashtable entries we can have within maxbytes.
1541 : */
1542 : long
1543 620600 : tbm_calculate_entries(double maxbytes)
1544 : {
1545 : long nbuckets;
1546 :
1547 : /*
1548 : * Estimate number of hashtable entries we can have within maxbytes. This
1549 : * estimates the hash cost as sizeof(PagetableEntry), which is good enough
1550 : * for our purpose. Also count an extra Pointer per entry for the arrays
1551 : * created during iteration readout.
1552 : */
1553 620600 : nbuckets = maxbytes /
1554 : (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
1555 620600 : nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
1556 620600 : nbuckets = Max(nbuckets, 16); /* sanity limit */
1557 :
1558 620600 : return nbuckets;
1559 : }
1560 :
1561 : /*
1562 : * Create a shared or private bitmap iterator and start iteration.
1563 : *
1564 : * `tbm` is only used to create the private iterator and dsa and dsp are only
1565 : * used to create the shared iterator.
1566 : *
1567 : * Before invoking tbm_begin_iterate() to create a shared iterator, one
1568 : * process must already have invoked tbm_prepare_shared_iterate() to create
1569 : * and set up the TBMSharedIteratorState.
1570 : */
1571 : TBMIterator
1572 39176 : tbm_begin_iterate(TIDBitmap *tbm, dsa_area *dsa, dsa_pointer dsp)
1573 : {
1574 39176 : TBMIterator iterator = {0};
1575 :
1576 : /* Allocate a private iterator and attach the shared state to it */
1577 39176 : if (DsaPointerIsValid(dsp))
1578 : {
1579 684 : iterator.shared = true;
1580 684 : iterator.i.shared_iterator = tbm_attach_shared_iterate(dsa, dsp);
1581 : }
1582 : else
1583 : {
1584 38492 : iterator.shared = false;
1585 38492 : iterator.i.private_iterator = tbm_begin_private_iterate(tbm);
1586 : }
1587 :
1588 39176 : return iterator;
1589 : }
1590 :
1591 : /*
1592 : * Clean up shared or private bitmap iterator.
1593 : */
1594 : void
1595 39056 : tbm_end_iterate(TBMIterator *iterator)
1596 : {
1597 : Assert(iterator && !tbm_exhausted(iterator));
1598 :
1599 39056 : if (iterator->shared)
1600 684 : tbm_end_shared_iterate(iterator->i.shared_iterator);
1601 : else
1602 38372 : tbm_end_private_iterate(iterator->i.private_iterator);
1603 :
1604 39056 : *iterator = (TBMIterator)
1605 : {
1606 : 0
1607 : };
1608 39056 : }
1609 :
1610 : /*
1611 : * Get the next TBMIterateResult from the shared or private bitmap iterator.
1612 : */
1613 : TBMIterateResult *
1614 823006 : tbm_iterate(TBMIterator *iterator)
1615 : {
1616 : Assert(iterator);
1617 :
1618 823006 : if (iterator->shared)
1619 60900 : return tbm_shared_iterate(iterator->i.shared_iterator);
1620 : else
1621 762106 : return tbm_private_iterate(iterator->i.private_iterator);
1622 : }
|