Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tidbitmap.c
4 : * PostgreSQL tuple-id (TID) bitmap package
5 : *
6 : * This module provides bitmap data structures that are spiritually
7 : * similar to Bitmapsets, but are specially adapted to store sets of
8 : * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9 : * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10 : * Also, since we wish to be able to store very large tuple sets in
11 : * memory with this data structure, we support "lossy" storage, in which
12 : * we no longer remember individual tuple offsets on a page but only the
13 : * fact that a particular page needs to be visited.
14 : *
15 : * The "lossy" storage uses one bit per disk page, so at the standard 8K
16 : * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17 : * of memory. People pushing around tables of that size should have a
18 : * couple of Mb to spare, so we don't worry about providing a second level
19 : * of lossiness. In theory we could fall back to page ranges at some
20 : * point, but for now that seems useless complexity.
21 : *
22 : * We also support the notion of candidate matches, or rechecking. This
23 : * means we know that a search need visit only some tuples on a page,
24 : * but we are not certain that all of those tuples are real matches.
25 : * So the eventual heap scan must recheck the quals for these tuples only,
26 : * rather than rechecking the quals for all tuples on the page as in the
27 : * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28 : * into a bitmap, and it can also happen internally when we AND a lossy
29 : * and a non-lossy page.
30 : *
31 : *
32 : * Copyright (c) 2003-2025, PostgreSQL Global Development Group
33 : *
34 : * IDENTIFICATION
35 : * src/backend/nodes/tidbitmap.c
36 : *
37 : *-------------------------------------------------------------------------
38 : */
39 : #include "postgres.h"
40 :
41 : #include <limits.h>
42 :
43 : #include "access/htup_details.h"
44 : #include "common/hashfn.h"
45 : #include "common/int.h"
46 : #include "nodes/bitmapset.h"
47 : #include "nodes/tidbitmap.h"
48 : #include "storage/lwlock.h"
49 : #include "utils/dsa.h"
50 :
51 : /*
52 : * When we have to switch over to lossy storage, we use a data structure
53 : * with one bit per page, where all pages having the same number DIV
54 : * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
55 : * and has the bit set for a given page, there must not be a per-page entry
56 : * for that page in the page table.
57 : *
58 : * We actually store both exact pages and lossy chunks in the same hash
59 : * table, using identical data structures. (This is because the memory
60 : * management for hashtables doesn't easily/efficiently allow space to be
61 : * transferred easily from one hashtable to another.) Therefore it's best
62 : * if PAGES_PER_CHUNK is the same as TBM_MAX_TUPLES_PER_PAGE, or at least not
63 : * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
64 : * avoid expensive integer remainder operations. So, define it like this:
65 : */
66 : #define PAGES_PER_CHUNK (BLCKSZ / 32)
67 :
68 : /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
69 :
70 : #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
71 : #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
72 :
73 : /* number of active words for an exact page: */
74 : #define WORDS_PER_PAGE ((TBM_MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
75 : /* number of active words for a lossy chunk: */
76 : #define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
77 :
78 : /*
79 : * The hashtable entries are represented by this data structure. For
80 : * an exact page, blockno is the page number and bit k of the bitmap
81 : * represents tuple offset k+1. For a lossy chunk, blockno is the first
82 : * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
83 : * bit k represents page blockno+k. Note that it is not possible to
84 : * have exact storage for the first page of a chunk if we are using
85 : * lossy storage for any page in the chunk's range, since the same
86 : * hashtable entry has to serve both purposes.
87 : *
88 : * recheck is used only on exact pages --- it indicates that although
89 : * only the stated tuples need be checked, the full index qual condition
90 : * must be checked for each (ie, these are candidate matches).
91 : */
typedef struct PagetableEntry
{
	BlockNumber blockno;		/* page number (hashtable key) */
	char		status;			/* hash entry status (simplehash bookkeeping) */
	bool		ischunk;		/* T = lossy storage, F = exact */
	bool		recheck;		/* should the tuples be rechecked? */
	/* sized to serve as either an exact tuple bitmap or a lossy page bitmap */
	bitmapword	words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
} PagetableEntry;
100 :
101 : /*
102 : * Holds array of pagetable entries.
103 : */
typedef struct PTEntryArray
{
	pg_atomic_uint32 refcount;	/* no. of shared iterators attached; the
								 * array is freed when this drops to zero */
	PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];	/* the entries proper */
} PTEntryArray;
109 :
110 : /*
111 : * We want to avoid the overhead of creating the hashtable, which is
112 : * comparatively large, when not necessary. Particularly when we are using a
113 : * bitmap scan on the inside of a nestloop join: a bitmap may well live only
114 : * long enough to accumulate one entry in such cases. We therefore avoid
115 : * creating an actual hashtable until we need two pagetable entries. When
116 : * just one pagetable entry is needed, we store it in a fixed field of
117 : * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
118 : * shrinks down to zero or one page again. So, status can be TBM_HASH even
119 : * when nentries is zero or one.)
120 : */
typedef enum
{
	TBM_EMPTY,					/* no hashtable, nentries == 0 */
	TBM_ONE_PAGE,				/* entry1 contains the single entry */
	TBM_HASH,					/* pagetable is valid, entry1 is not */
} TBMStatus;
127 :
128 : /*
129 : * Current iterating state of the TBM.
130 : */
typedef enum
{
	TBM_NOT_ITERATING,			/* not yet converted to page and chunk array */
	TBM_ITERATING_PRIVATE,		/* converted to local page and chunk array */
	TBM_ITERATING_SHARED,		/* converted to shared page and chunk array */
} TBMIteratingState;
137 :
138 : /*
139 : * Here is the representation for a whole TIDBitMap:
140 : */
struct TIDBitmap
{
	NodeTag		type;			/* to make it a valid Node */
	MemoryContext mcxt;			/* memory context containing me */
	TBMStatus	status;			/* see codes above */
	struct pagetable_hash *pagetable;	/* hash table of PagetableEntry's */
	int			nentries;		/* number of entries in pagetable */
	int			maxentries;		/* limit on same to meet maxbytes */
	int			npages;			/* number of exact entries in pagetable */
	int			nchunks;		/* number of lossy entries in pagetable */
	TBMIteratingState iterating;	/* set by tbm_begin_private_iterate or
									 * tbm_prepare_shared_iterate */
	uint32		lossify_start;	/* offset to start lossifying hashtable at */
	PagetableEntry entry1;		/* used when status == TBM_ONE_PAGE */
	/* these are valid when iterating is true: */
	PagetableEntry **spages;	/* sorted exact-page list, or NULL */
	PagetableEntry **schunks;	/* sorted lossy-chunk list, or NULL */
	dsa_pointer dsapagetable;	/* dsa_pointer to the element array */
	dsa_pointer dsapagetableold;	/* dsa_pointer to the old element array */
	dsa_pointer ptpages;		/* dsa_pointer to the page array */
	dsa_pointer ptchunks;		/* dsa_pointer to the chunk array */
	dsa_area   *dsa;			/* reference to per-query dsa area */
};
163 :
164 : /*
165 : * When iterating over a backend-local bitmap in sorted order, a
166 : * TBMPrivateIterator is used to track our progress. There can be several
167 : * iterators scanning the same bitmap concurrently. Note that the bitmap
168 : * becomes read-only as soon as any iterator is created.
169 : */
struct TBMPrivateIterator
{
	TIDBitmap  *tbm;			/* TIDBitmap we're iterating over */
	int			spageptr;		/* next spages index */
	int			schunkptr;		/* next schunks index */
	int			schunkbit;		/* next bit to check in current schunk */
};
177 :
178 : /*
179 : * Holds the shared members of the iterator so that multiple processes
180 : * can jointly iterate.
181 : */
typedef struct TBMSharedIteratorState
{
	int			nentries;		/* number of entries in pagetable */
	int			maxentries;		/* limit on same to meet maxbytes */
	int			npages;			/* number of exact entries in pagetable */
	int			nchunks;		/* number of lossy entries in pagetable */
	dsa_pointer pagetable;		/* dsa pointers to head of pagetable data */
	dsa_pointer spages;			/* dsa pointer to page array */
	dsa_pointer schunks;		/* dsa pointer to chunk array */
	LWLock		lock;			/* protects the iteration position fields
								 * (spageptr, schunkptr, schunkbit) below */
	int			spageptr;		/* next spages index */
	int			schunkptr;		/* next schunks index */
	int			schunkbit;		/* next bit to check in current schunk */
} TBMSharedIteratorState;
196 :
197 : /*
198 : * pagetable iteration array.
199 : */
typedef struct PTIterationArray
{
	pg_atomic_uint32 refcount;	/* no. of shared iterators attached */
	int			index[FLEXIBLE_ARRAY_MEMBER];	/* indexes into the shared
												 * PTEntryArray, in sorted
												 * block-number order */
} PTIterationArray;
205 :
206 : /*
207 : * same as TBMPrivateIterator, but it is used for joint iteration, therefore
208 : * this also holds a reference to the shared state.
209 : */
struct TBMSharedIterator
{
	TBMSharedIteratorState *state;	/* shared state */
	PTEntryArray *ptbase;		/* pagetable element array */
	PTIterationArray *ptpages;	/* sorted exact page index list */
	PTIterationArray *ptchunks; /* sorted lossy page index list */
};
217 :
218 : /* Local function prototypes */
219 : static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
220 : static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
221 : const TIDBitmap *b);
222 : static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
223 : BlockNumber pageno);
224 : static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
225 : static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
226 : static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
227 : static void tbm_lossify(TIDBitmap *tbm);
228 : static int tbm_comparator(const void *left, const void *right);
229 : static int tbm_shared_comparator(const void *left, const void *right,
230 : void *arg);
231 :
232 : /* define hashtable mapping block numbers to PagetableEntry's */
233 : #define SH_USE_NONDEFAULT_ALLOCATOR
234 : #define SH_PREFIX pagetable
235 : #define SH_ELEMENT_TYPE PagetableEntry
236 : #define SH_KEY_TYPE BlockNumber
237 : #define SH_KEY blockno
238 : #define SH_HASH_KEY(tb, key) murmurhash32(key)
239 : #define SH_EQUAL(tb, a, b) a == b
240 : #define SH_SCOPE static inline
241 : #define SH_DEFINE
242 : #define SH_DECLARE
243 : #include "lib/simplehash.h"
244 :
245 :
246 : /*
247 : * tbm_create - create an initially-empty bitmap
248 : *
249 : * The bitmap will live in the memory context that is CurrentMemoryContext
250 : * at the time of this call. It will be limited to (approximately) maxbytes
251 : * total memory consumption. If the DSA passed to this function is not NULL
252 : * then the memory for storing elements of the underlying page table will
253 : * be allocated from the DSA.
254 : */
255 : TIDBitmap *
256 21342 : tbm_create(Size maxbytes, dsa_area *dsa)
257 : {
258 : TIDBitmap *tbm;
259 :
260 : /* Create the TIDBitmap struct and zero all its fields */
261 21342 : tbm = makeNode(TIDBitmap);
262 :
263 21342 : tbm->mcxt = CurrentMemoryContext;
264 21342 : tbm->status = TBM_EMPTY;
265 :
266 21342 : tbm->maxentries = tbm_calculate_entries(maxbytes);
267 21342 : tbm->lossify_start = 0;
268 21342 : tbm->dsa = dsa;
269 21342 : tbm->dsapagetable = InvalidDsaPointer;
270 21342 : tbm->dsapagetableold = InvalidDsaPointer;
271 21342 : tbm->ptpages = InvalidDsaPointer;
272 21342 : tbm->ptchunks = InvalidDsaPointer;
273 :
274 21342 : return tbm;
275 : }
276 :
277 : /*
278 : * Actually create the hashtable. Since this is a moderately expensive
279 : * proposition, we don't do it until we have to.
280 : */
static void
tbm_create_pagetable(TIDBitmap *tbm)
{
	Assert(tbm->status != TBM_HASH);
	Assert(tbm->pagetable == NULL);

	/* 128 is only the initial size hint; simplehash grows as needed */
	tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);

	/* If entry1 is valid, push it into the hashtable */
	if (tbm->status == TBM_ONE_PAGE)
	{
		PagetableEntry *page;
		bool		found;
		char		oldstatus;

		page = pagetable_insert(tbm->pagetable,
								tbm->entry1.blockno,
								&found);
		Assert(!found);

		/*
		 * Copy entry1's contents into the new hash slot, but preserve the
		 * slot's simplehash status byte, which the memcpy would otherwise
		 * clobber.
		 */
		oldstatus = page->status;
		memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
		page->status = oldstatus;
	}

	tbm->status = TBM_HASH;
}
307 :
308 : /*
309 : * tbm_free - free a TIDBitmap
310 : */
311 : void
312 21228 : tbm_free(TIDBitmap *tbm)
313 : {
314 21228 : if (tbm->pagetable)
315 9124 : pagetable_destroy(tbm->pagetable);
316 21228 : if (tbm->spages)
317 5980 : pfree(tbm->spages);
318 21228 : if (tbm->schunks)
319 2886 : pfree(tbm->schunks);
320 21228 : pfree(tbm);
321 21228 : }
322 :
/*
 * tbm_free_shared_area - free shared state
 *
 * Free the shared iterator state.  Also free the shared pagetable and
 * iterator arrays, but only once they are no longer referenced by any
 * shared iterator, i.e. once their refcount drops to zero.
 */
void
tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
{
	TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
	PTEntryArray *ptbase;
	PTIterationArray *ptpages;
	PTIterationArray *ptchunks;

	if (DsaPointerIsValid(istate->pagetable))
	{
		ptbase = dsa_get_address(dsa, istate->pagetable);
		/* drop our reference; last one out frees the array */
		if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
			dsa_free(dsa, istate->pagetable);
	}
	if (DsaPointerIsValid(istate->spages))
	{
		ptpages = dsa_get_address(dsa, istate->spages);
		if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
			dsa_free(dsa, istate->spages);
	}
	if (DsaPointerIsValid(istate->schunks))
	{
		ptchunks = dsa_get_address(dsa, istate->schunks);
		if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
			dsa_free(dsa, istate->schunks);
	}

	/* finally release the iterator state itself */
	dsa_free(dsa, dp);
}
359 :
360 : /*
361 : * tbm_add_tuples - add some tuple IDs to a TIDBitmap
362 : *
363 : * If recheck is true, then the recheck flag will be set in the
364 : * TBMIterateResult when any of these tuples are reported out.
365 : */
void
tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
			   bool recheck)
{
	BlockNumber currblk = InvalidBlockNumber;
	PagetableEntry *page = NULL;	/* only valid when currblk is valid */
	int			i;

	/* the bitmap must not be modified once iteration has begun */
	Assert(tbm->iterating == TBM_NOT_ITERATING);
	for (i = 0; i < ntids; i++)
	{
		BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
		OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
		int			wordnum,
					bitnum;

		/* safety check to ensure we don't overrun bit array bounds */
		if (off < 1 || off > TBM_MAX_TUPLES_PER_PAGE)
			elog(ERROR, "tuple offset out of range: %u", off);

		/*
		 * Look up target page unless we already did. This saves cycles when
		 * the input includes consecutive tuples on the same page, which is
		 * common enough to justify an extra test here.
		 */
		if (blk != currblk)
		{
			if (tbm_page_is_lossy(tbm, blk))
				page = NULL;	/* remember page is lossy */
			else
				page = tbm_get_pageentry(tbm, blk);
			currblk = blk;
		}

		if (page == NULL)
			continue;			/* whole page is already marked */

		if (page->ischunk)
		{
			/* The page is a lossy chunk header, set bit for itself */
			wordnum = bitnum = 0;
		}
		else
		{
			/* Page is exact, so set bit for individual tuple */
			wordnum = WORDNUM(off - 1);
			bitnum = BITNUM(off - 1);
		}
		page->words[wordnum] |= ((bitmapword) 1 << bitnum);
		/* a recheck request applies to the whole page entry */
		page->recheck |= recheck;

		if (tbm->nentries > tbm->maxentries)
		{
			tbm_lossify(tbm);
			/* Page could have been converted to lossy, so force new lookup */
			currblk = InvalidBlockNumber;
		}
	}
}
425 :
426 : /*
427 : * tbm_add_page - add a whole page to a TIDBitmap
428 : *
429 : * This causes the whole page to be reported (with the recheck flag)
430 : * when the TIDBitmap is scanned.
431 : */
432 : void
433 147990 : tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
434 : {
435 : /* Enter the page in the bitmap, or mark it lossy if already present */
436 147990 : tbm_mark_page_lossy(tbm, pageno);
437 : /* If we went over the memory limit, lossify some more pages */
438 147990 : if (tbm->nentries > tbm->maxentries)
439 0 : tbm_lossify(tbm);
440 147990 : }
441 :
442 : /*
443 : * tbm_union - set union
444 : *
445 : * a is modified in-place, b is not changed
446 : */
447 : void
448 0 : tbm_union(TIDBitmap *a, const TIDBitmap *b)
449 : {
450 : Assert(!a->iterating);
451 : /* Nothing to do if b is empty */
452 0 : if (b->nentries == 0)
453 0 : return;
454 : /* Scan through chunks and pages in b, merge into a */
455 0 : if (b->status == TBM_ONE_PAGE)
456 0 : tbm_union_page(a, &b->entry1);
457 : else
458 : {
459 : pagetable_iterator i;
460 : PagetableEntry *bpage;
461 :
462 : Assert(b->status == TBM_HASH);
463 0 : pagetable_start_iterate(b->pagetable, &i);
464 0 : while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
465 0 : tbm_union_page(a, bpage);
466 : }
467 : }
468 :
469 : /* Process one page of b during a union op */
static void
tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
{
	PagetableEntry *apage;
	int			wordnum;

	if (bpage->ischunk)
	{
		/* Scan b's chunk, mark each indicated page lossy in a */
		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
		{
			bitmapword	w = bpage->words[wordnum];

			if (w != 0)
			{
				BlockNumber pg;

				/* first page number represented by this word */
				pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
				/* walk the word's bits from LSB upward, one page per bit */
				while (w != 0)
				{
					if (w & 1)
						tbm_mark_page_lossy(a, pg);
					pg++;
					w >>= 1;
				}
			}
		}
	}
	else if (tbm_page_is_lossy(a, bpage->blockno))
	{
		/* page is already lossy in a, nothing to do */
		return;
	}
	else
	{
		apage = tbm_get_pageentry(a, bpage->blockno);
		if (apage->ischunk)
		{
			/* The page is a lossy chunk header, set bit for itself */
			apage->words[0] |= ((bitmapword) 1 << 0);
		}
		else
		{
			/* Both pages are exact, merge at the bit level */
			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
				apage->words[wordnum] |= bpage->words[wordnum];
			/* a recheck requirement on either input carries over */
			apage->recheck |= bpage->recheck;
		}
	}

	/* tbm_get_pageentry may have grown the table past the memory budget */
	if (a->nentries > a->maxentries)
		tbm_lossify(a);
}
523 :
524 : /*
525 : * tbm_intersect - set intersection
526 : *
527 : * a is modified in-place, b is not changed
528 : */
529 : void
530 198 : tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
531 : {
532 : Assert(!a->iterating);
533 : /* Nothing to do if a is empty */
534 198 : if (a->nentries == 0)
535 0 : return;
536 : /* Scan through chunks and pages in a, try to match to b */
537 198 : if (a->status == TBM_ONE_PAGE)
538 : {
539 120 : if (tbm_intersect_page(a, &a->entry1, b))
540 : {
541 : /* Page is now empty, remove it from a */
542 : Assert(!a->entry1.ischunk);
543 0 : a->npages--;
544 0 : a->nentries--;
545 : Assert(a->nentries == 0);
546 0 : a->status = TBM_EMPTY;
547 : }
548 : }
549 : else
550 : {
551 : pagetable_iterator i;
552 : PagetableEntry *apage;
553 :
554 : Assert(a->status == TBM_HASH);
555 78 : pagetable_start_iterate(a->pagetable, &i);
556 11260 : while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
557 : {
558 11104 : if (tbm_intersect_page(a, apage, b))
559 : {
560 : /* Page or chunk is now empty, remove it from a */
561 10532 : if (apage->ischunk)
562 0 : a->nchunks--;
563 : else
564 10532 : a->npages--;
565 10532 : a->nentries--;
566 10532 : if (!pagetable_delete(a->pagetable, apage->blockno))
567 0 : elog(ERROR, "hash table corrupted");
568 : }
569 : }
570 : }
571 : }
572 :
573 : /*
574 : * Process one page of a during an intersection op
575 : *
576 : * Returns true if apage is now empty and should be deleted from a
577 : */
static bool
tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
{
	const PagetableEntry *bpage;
	int			wordnum;

	if (apage->ischunk)
	{
		/* Scan each bit in chunk, try to clear */
		bool		candelete = true;

		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
		{
			bitmapword	w = apage->words[wordnum];

			if (w != 0)
			{
				bitmapword	neww = w;
				BlockNumber pg;
				int			bitnum;

				/* first page number represented by this word */
				pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
				bitnum = 0;
				while (w != 0)
				{
					if (w & 1)
					{
						/* a page survives if b has it lossily or exactly */
						if (!tbm_page_is_lossy(b, pg) &&
							tbm_find_pageentry(b, pg) == NULL)
						{
							/* Page is not in b at all, lose lossy bit */
							neww &= ~((bitmapword) 1 << bitnum);
						}
					}
					pg++;
					bitnum++;
					w >>= 1;
				}
				apage->words[wordnum] = neww;
				if (neww != 0)
					candelete = false;
			}
		}
		return candelete;
	}
	else if (tbm_page_is_lossy(b, apage->blockno))
	{
		/*
		 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
		 * because the page 'b' is lossy, we don't know which ones. Therefore
		 * we mark 'a' as requiring rechecks, to indicate that at most those
		 * tuples set in 'a' are matches.
		 */
		apage->recheck = true;
		return false;
	}
	else
	{
		bool		candelete = true;

		bpage = tbm_find_pageentry(b, apage->blockno);
		if (bpage != NULL)
		{
			/* Both pages are exact, merge at the bit level */
			Assert(!bpage->ischunk);
			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
			{
				apage->words[wordnum] &= bpage->words[wordnum];
				if (apage->words[wordnum] != 0)
					candelete = false;
			}
			apage->recheck |= bpage->recheck;
		}
		/* If there is no matching b page, we can just delete the a page */
		return candelete;
	}
}
655 :
656 : /*
657 : * tbm_is_empty - is a TIDBitmap completely empty?
658 : */
659 : bool
660 1276 : tbm_is_empty(const TIDBitmap *tbm)
661 : {
662 1276 : return (tbm->nentries == 0);
663 : }
664 :
665 : /*
666 : * tbm_begin_private_iterate - prepare to iterate through a TIDBitmap
667 : *
668 : * The TBMPrivateIterator struct is created in the caller's memory context.
669 : * For a clean shutdown of the iteration, call tbm_end_private_iterate; but
670 : * it's okay to just allow the memory context to be released, too. It is
671 : * caller's responsibility not to touch the TBMPrivateIterator anymore once
672 : * the TIDBitmap is freed.
673 : *
674 : * NB: after this is called, it is no longer allowed to modify the contents
675 : * of the bitmap. However, you can call this multiple times to scan the
676 : * contents repeatedly, including parallel scans.
677 : */
TBMPrivateIterator *
tbm_begin_private_iterate(TIDBitmap *tbm)
{
	TBMPrivateIterator *iterator;

	Assert(tbm->iterating != TBM_ITERATING_SHARED);

	/*
	 * Create the TBMPrivateIterator struct in the caller's memory context.
	 */
	iterator = (TBMPrivateIterator *) palloc(sizeof(TBMPrivateIterator));
	iterator->tbm = tbm;

	/*
	 * Initialize iteration pointers.
	 */
	iterator->spageptr = 0;
	iterator->schunkptr = 0;
	iterator->schunkbit = 0;

	/*
	 * If we have a hashtable, create and fill the sorted page lists, unless
	 * we already did that for a previous iterator. Note that the lists are
	 * attached to the bitmap not the iterator, so they can be used by more
	 * than one iterator.
	 */
	if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
	{
		pagetable_iterator i;
		PagetableEntry *page;
		int			npages;
		int			nchunks;

		/* the arrays persist on the bitmap and are reused by later calls */
		if (!tbm->spages && tbm->npages > 0)
			tbm->spages = (PagetableEntry **)
				MemoryContextAlloc(tbm->mcxt,
								   tbm->npages * sizeof(PagetableEntry *));
		if (!tbm->schunks && tbm->nchunks > 0)
			tbm->schunks = (PagetableEntry **)
				MemoryContextAlloc(tbm->mcxt,
								   tbm->nchunks * sizeof(PagetableEntry *));

		/* partition the hash entries into exact pages and lossy chunks */
		npages = nchunks = 0;
		pagetable_start_iterate(tbm->pagetable, &i);
		while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
		{
			if (page->ischunk)
				tbm->schunks[nchunks++] = page;
			else
				tbm->spages[npages++] = page;
		}
		Assert(npages == tbm->npages);
		Assert(nchunks == tbm->nchunks);
		/* sort by block number so iteration delivers pages in order */
		if (npages > 1)
			qsort(tbm->spages, npages, sizeof(PagetableEntry *),
				  tbm_comparator);
		if (nchunks > 1)
			qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
				  tbm_comparator);
	}

	tbm->iterating = TBM_ITERATING_PRIVATE;

	return iterator;
}
744 :
745 : /*
746 : * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
747 : *
748 : * The necessary shared state will be allocated from the DSA passed to
749 : * tbm_create, so that multiple processes can attach to it and iterate jointly.
750 : *
751 : * This will convert the pagetable hash into page and chunk array of the index
752 : * into pagetable array.
753 : */
dsa_pointer
tbm_prepare_shared_iterate(TIDBitmap *tbm)
{
	dsa_pointer dp;
	TBMSharedIteratorState *istate;
	PTEntryArray *ptbase = NULL;
	PTIterationArray *ptpages = NULL;
	PTIterationArray *ptchunks = NULL;

	Assert(tbm->dsa != NULL);
	Assert(tbm->iterating != TBM_ITERATING_PRIVATE);

	/*
	 * Allocate a TBMSharedIteratorState from the DSA to hold the shared
	 * members and the lock; this is used by multiple workers for shared
	 * iteration.
	 */
	dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
	istate = dsa_get_address(tbm->dsa, dp);

	/*
	 * If we're not already iterating, create and fill the sorted page lists.
	 * (If we are, the sorted page lists are already stored in the TIDBitmap,
	 * and we can just reuse them.)
	 */
	if (tbm->iterating == TBM_NOT_ITERATING)
	{
		pagetable_iterator i;
		PagetableEntry *page;
		int			idx;
		int			npages;
		int			nchunks;

		/*
		 * Allocate the page and chunk array memory from the DSA to share
		 * across multiple processes.
		 */
		if (tbm->npages)
		{
			tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
										tbm->npages * sizeof(int));
			ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
			pg_atomic_init_u32(&ptpages->refcount, 0);
		}
		if (tbm->nchunks)
		{
			tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
										 tbm->nchunks * sizeof(int));
			ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
			pg_atomic_init_u32(&ptchunks->refcount, 0);
		}

		/*
		 * If TBM status is TBM_HASH then iterate over the pagetable and
		 * convert it to page and chunk arrays. But if it's in the
		 * TBM_ONE_PAGE mode then directly allocate the space for one entry
		 * from the DSA.
		 */
		npages = nchunks = 0;
		if (tbm->status == TBM_HASH)
		{
			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

			pagetable_start_iterate(tbm->pagetable, &i);
			while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
			{
				/* record the entry's array index, not a raw pointer, since
				 * other processes map the array at different addresses */
				idx = page - ptbase->ptentry;
				if (page->ischunk)
					ptchunks->index[nchunks++] = idx;
				else
					ptpages->index[npages++] = idx;
			}

			Assert(npages == tbm->npages);
			Assert(nchunks == tbm->nchunks);
		}
		else if (tbm->status == TBM_ONE_PAGE)
		{
			/*
			 * In one page mode allocate the space for one pagetable entry,
			 * initialize it, and directly store its index (i.e. 0) in the
			 * page array.
			 */
			tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
											 sizeof(PagetableEntry));
			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
			memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
			ptpages->index[0] = 0;
		}

		if (ptbase != NULL)
			pg_atomic_init_u32(&ptbase->refcount, 0);
		/* sort the index arrays by block number for in-order delivery */
		if (npages > 1)
			qsort_arg(ptpages->index, npages, sizeof(int),
					  tbm_shared_comparator, ptbase->ptentry);
		if (nchunks > 1)
			qsort_arg(ptchunks->index, nchunks, sizeof(int),
					  tbm_shared_comparator, ptbase->ptentry);
	}

	/*
	 * Store the TBM members in the shared state so that we can share them
	 * across multiple processes.
	 */
	istate->nentries = tbm->nentries;
	istate->maxentries = tbm->maxentries;
	istate->npages = tbm->npages;
	istate->nchunks = tbm->nchunks;
	istate->pagetable = tbm->dsapagetable;
	istate->spages = tbm->ptpages;
	istate->schunks = tbm->ptchunks;

	/* re-fetch addresses; valid even on repeated calls when already shared */
	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
	ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
	ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);

	/*
	 * For every shared iterator referring to pagetable and iterator array,
	 * increase the refcount by 1 so that while freeing the shared iterator we
	 * don't free pagetable and iterator array until its refcount becomes 0.
	 */
	if (ptbase != NULL)
		pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
	if (ptpages != NULL)
		pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
	if (ptchunks != NULL)
		pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);

	/* Initialize the iterator lock */
	LWLockInitialize(&istate->lock, LWTRANCHE_SHARED_TIDBITMAP);

	/* Initialize the shared iterator state */
	istate->schunkbit = 0;
	istate->schunkptr = 0;
	istate->spageptr = 0;

	tbm->iterating = TBM_ITERATING_SHARED;

	return dp;
}
893 :
894 : /*
895 : * tbm_extract_page_tuple - extract the tuple offsets from a page
896 : *
897 : * Returns the number of offsets it filled in if <= max_offsets. Otherwise,
898 : * fills in as many offsets as fit and returns the total number of offsets in
899 : * the page.
900 : */
901 : int
902 245934 : tbm_extract_page_tuple(TBMIterateResult *iteritem,
903 : OffsetNumber *offsets,
904 : uint32 max_offsets)
905 : {
906 245934 : PagetableEntry *page = iteritem->internal_page;
907 : int wordnum;
908 245934 : int ntuples = 0;
909 :
910 1475604 : for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
911 : {
912 1229670 : bitmapword w = page->words[wordnum];
913 :
914 1229670 : if (w != 0)
915 : {
916 567510 : int off = wordnum * BITS_PER_BITMAPWORD + 1;
917 :
918 24444170 : while (w != 0)
919 : {
920 23876660 : if (w & 1)
921 : {
922 5970670 : if (ntuples < max_offsets)
923 5970670 : offsets[ntuples] = (OffsetNumber) off;
924 5970670 : ntuples++;
925 : }
926 23876660 : off++;
927 23876660 : w >>= 1;
928 : }
929 : }
930 : }
931 :
932 245934 : return ntuples;
933 : }
934 :
935 : /*
936 : * tbm_advance_schunkbit - Advance the schunkbit
937 : */
938 : static inline void
939 169542 : tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
940 : {
941 169542 : int schunkbit = *schunkbitp;
942 :
943 770988 : while (schunkbit < PAGES_PER_CHUNK)
944 : {
945 768012 : int wordnum = WORDNUM(schunkbit);
946 768012 : int bitnum = BITNUM(schunkbit);
947 :
948 768012 : if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
949 166566 : break;
950 601446 : schunkbit++;
951 : }
952 :
953 169542 : *schunkbitp = schunkbit;
954 169542 : }
955 :
/*
 * tbm_private_iterate - scan through next page of a TIDBitmap
 *
 * Caller must pass in a TBMIterateResult to be filled.
 *
 * Pages are guaranteed to be delivered in numerical order.
 *
 * Returns false when there are no more pages to scan and true otherwise. When
 * there are no more pages to scan, tbmres->blockno is set to
 * InvalidBlockNumber.
 *
 * If lossy is true, then the bitmap is "lossy" and failed to remember
 * the exact tuples to look at on this page --- the caller must examine all
 * tuples on the page and check if they meet the intended condition. If lossy
 * is false, the caller must later extract the tuple offsets from the page
 * pointed to by internal_page with tbm_extract_page_tuple.
 *
 * If tbmres->recheck is true, only the indicated tuples need be examined, but
 * the condition must be rechecked anyway. (For ease of testing, recheck is
 * always set true when lossy is true.)
 */
bool
tbm_private_iterate(TBMPrivateIterator *iterator, TBMIterateResult *tbmres)
{
	TIDBitmap  *tbm = iterator->tbm;

	Assert(tbm->iterating == TBM_ITERATING_PRIVATE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		int			schunkbit = iterator->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			/* Found a set bit in this chunk; remember position and stop */
			iterator->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		iterator->schunkptr++;
		iterator->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		BlockNumber chunk_blockno;

		/* Block number represented by the current bit of this chunk */
		chunk_blockno = chunk->blockno + iterator->schunkbit;
		if (iterator->spageptr >= tbm->npages ||
			chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
		{
			/* Return a lossy page indicator from the chunk */
			tbmres->blockno = chunk_blockno;
			tbmres->lossy = true;
			tbmres->recheck = true;
			tbmres->internal_page = NULL;
			iterator->schunkbit++;
			return true;
		}
	}

	if (iterator->spageptr < tbm->npages)
	{
		PagetableEntry *page;

		/* In TBM_ONE_PAGE state, we don't allocate an spages[] array */
		if (tbm->status == TBM_ONE_PAGE)
			page = &tbm->entry1;
		else
			page = tbm->spages[iterator->spageptr];

		/* Return the exact (non-lossy) page entry */
		tbmres->internal_page = page;
		tbmres->blockno = page->blockno;
		tbmres->lossy = false;
		tbmres->recheck = page->recheck;
		iterator->spageptr++;
		return true;
	}

	/* Nothing more in the bitmap */
	tbmres->blockno = InvalidBlockNumber;
	return false;
}
1049 :
/*
 * tbm_shared_iterate - scan through next page of a TIDBitmap
 *
 * As above, but this will iterate using an iterator which is shared
 * across multiple processes. We need to acquire the iterator LWLock,
 * before accessing the shared members.
 */
bool
tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult *tbmres)
{
	TBMSharedIteratorState *istate = iterator->state;
	PagetableEntry *ptbase = NULL;
	int		   *idxpages = NULL;
	int		   *idxchunks = NULL;

	/*
	 * Resolve backend-local pointers to the shared page table and the sorted
	 * index arrays; any of these may be absent (NULL) for small bitmaps.
	 */
	if (iterator->ptbase != NULL)
		ptbase = iterator->ptbase->ptentry;
	if (iterator->ptpages != NULL)
		idxpages = iterator->ptpages->index;
	if (iterator->ptchunks != NULL)
		idxchunks = iterator->ptchunks->index;

	/* Acquire the LWLock before accessing the shared members */
	LWLockAcquire(&istate->lock, LW_EXCLUSIVE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		int			schunkbit = istate->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			/* Found a set bit in this chunk; remember position and stop */
			istate->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		istate->schunkptr++;
		istate->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		BlockNumber chunk_blockno;

		chunk_blockno = chunk->blockno + istate->schunkbit;

		if (istate->spageptr >= istate->npages ||
			chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
		{
			/* Return a lossy page indicator from the chunk */
			tbmres->blockno = chunk_blockno;
			tbmres->lossy = true;
			tbmres->recheck = true;
			tbmres->internal_page = NULL;
			istate->schunkbit++;

			LWLockRelease(&istate->lock);
			return true;
		}
	}

	if (istate->spageptr < istate->npages)
	{
		PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];

		/* Return the exact (non-lossy) page entry */
		tbmres->internal_page = page;
		tbmres->blockno = page->blockno;
		tbmres->lossy = false;
		tbmres->recheck = page->recheck;
		istate->spageptr++;

		LWLockRelease(&istate->lock);

		return true;
	}

	LWLockRelease(&istate->lock);

	/* Nothing more in the bitmap */
	tbmres->blockno = InvalidBlockNumber;
	return false;
}
1142 :
/*
 * tbm_end_private_iterate - finish an iteration over a TIDBitmap
 *
 * Currently this is just a pfree, but it might do more someday. (For
 * instance, it could be useful to count open iterators and allow the
 * bitmap to return to read/write status when there are no more iterators.)
 *
 * The iterator must not be used again after this call.
 */
void
tbm_end_private_iterate(TBMPrivateIterator *iterator)
{
	pfree(iterator);
}
1155 :
/*
 * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
 *
 * This doesn't free any of the shared state associated with the iterator,
 * just our backend-private state.  The iterator must not be used again
 * after this call.
 */
void
tbm_end_shared_iterate(TBMSharedIterator *iterator)
{
	pfree(iterator);
}
1167 :
1168 : /*
1169 : * tbm_find_pageentry - find a PagetableEntry for the pageno
1170 : *
1171 : * Returns NULL if there is no non-lossy entry for the pageno.
1172 : */
1173 : static const PagetableEntry *
1174 11668 : tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
1175 : {
1176 : const PagetableEntry *page;
1177 :
1178 11668 : if (tbm->nentries == 0) /* in case pagetable doesn't exist */
1179 0 : return NULL;
1180 :
1181 11668 : if (tbm->status == TBM_ONE_PAGE)
1182 : {
1183 0 : page = &tbm->entry1;
1184 0 : if (page->blockno != pageno)
1185 0 : return NULL;
1186 : Assert(!page->ischunk);
1187 0 : return page;
1188 : }
1189 :
1190 11668 : page = pagetable_lookup(tbm->pagetable, pageno);
1191 11668 : if (page == NULL)
1192 1974 : return NULL;
1193 9694 : if (page->ischunk)
1194 0 : return NULL; /* don't want a lossy chunk header */
1195 9694 : return page;
1196 : }
1197 :
/*
 * tbm_get_pageentry - find or create a PagetableEntry for the pageno
 *
 * If new, the entry is marked as an exact (non-chunk) entry.
 *
 * This may cause the table to exceed the desired memory size. It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
static PagetableEntry *
tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	bool		found;

	if (tbm->status == TBM_EMPTY)
	{
		/* Use the fixed slot */
		page = &tbm->entry1;
		found = false;
		tbm->status = TBM_ONE_PAGE;
	}
	else
	{
		if (tbm->status == TBM_ONE_PAGE)
		{
			page = &tbm->entry1;
			if (page->blockno == pageno)
				return page;
			/* Time to switch from one page to a hashtable */
			tbm_create_pagetable(tbm);
		}

		/* Look up or create an entry */
		page = pagetable_insert(tbm->pagetable, pageno, &found);
	}

	/* Initialize it if not present before */
	if (!found)
	{
		/*
		 * Preserve the entry's status byte across the reset: it is
		 * maintained by the hash table implementation, not by us.
		 */
		char		oldstatus = page->status;

		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = pageno;
		/* must count it too */
		tbm->nentries++;
		tbm->npages++;
	}

	return page;
}
1249 :
1250 : /*
1251 : * tbm_page_is_lossy - is the page marked as lossily stored?
1252 : */
1253 : static bool
1254 7391406 : tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
1255 : {
1256 : PagetableEntry *page;
1257 : BlockNumber chunk_pageno;
1258 : int bitno;
1259 :
1260 : /* we can skip the lookup if there are no lossy chunks */
1261 7391406 : if (tbm->nchunks == 0)
1262 7261140 : return false;
1263 : Assert(tbm->status == TBM_HASH);
1264 :
1265 130266 : bitno = pageno % PAGES_PER_CHUNK;
1266 130266 : chunk_pageno = pageno - bitno;
1267 :
1268 130266 : page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1269 :
1270 130266 : if (page != NULL && page->ischunk)
1271 : {
1272 19134 : int wordnum = WORDNUM(bitno);
1273 19134 : int bitnum = BITNUM(bitno);
1274 :
1275 19134 : if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1276 8568 : return true;
1277 : }
1278 121698 : return false;
1279 : }
1280 :
/*
 * tbm_mark_page_lossy - mark the page number as lossily stored
 *
 * This may cause the table to exceed the desired memory size. It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
static void
tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	bool		found;
	BlockNumber chunk_pageno;
	int			bitno;
	int			wordnum;
	int			bitnum;

	/* We force the bitmap into hashtable mode whenever it's lossy */
	if (tbm->status != TBM_HASH)
		tbm_create_pagetable(tbm);

	/* Compute the chunk-header page and this page's bit within it */
	bitno = pageno % PAGES_PER_CHUNK;
	chunk_pageno = pageno - bitno;

	/*
	 * Remove any extant non-lossy entry for the page. If the page is its own
	 * chunk header, however, we skip this and handle the case below.
	 */
	if (bitno != 0)
	{
		if (pagetable_delete(tbm->pagetable, pageno))
		{
			/* It was present, so adjust counts */
			tbm->nentries--;
			tbm->npages--;		/* assume it must have been non-lossy */
		}
	}

	/* Look up or create entry for chunk-header page */
	page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);

	/* Initialize it if not present before */
	if (!found)
	{
		/* status byte belongs to the hash table; preserve it over MemSet */
		char		oldstatus = page->status;

		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* must count it too */
		tbm->nentries++;
		tbm->nchunks++;
	}
	else if (!page->ischunk)
	{
		char		oldstatus = page->status;

		/* chunk header page was formerly non-lossy, make it lossy */
		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* we assume it had some tuple bit(s) set, so mark it lossy */
		page->words[0] = ((bitmapword) 1 << 0);
		/* adjust counts */
		tbm->nchunks++;
		tbm->npages--;
	}

	/* Now set the original target page's bit */
	wordnum = WORDNUM(bitno);
	bitnum = BITNUM(bitno);
	page->words[wordnum] |= ((bitmapword) 1 << bitnum);
}
1355 :
/*
 * tbm_lossify - lose some information to get back under the memory limit
 */
static void
tbm_lossify(TIDBitmap *tbm)
{
	pagetable_iterator i;
	PagetableEntry *page;

	/*
	 * XXX Really stupid implementation: this just lossifies pages in
	 * essentially random order. We should be paying some attention to the
	 * number of bits set in each page, instead.
	 *
	 * Since we are called as soon as nentries exceeds maxentries, we should
	 * push nentries down to significantly less than maxentries, or else we'll
	 * just end up doing this again very soon. We shoot for maxentries/2.
	 */
	Assert(tbm->iterating == TBM_NOT_ITERATING);
	Assert(tbm->status == TBM_HASH);

	/* Resume scanning where the previous lossify round left off */
	pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
	while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
	{
		if (page->ischunk)
			continue;			/* already a chunk header */

		/*
		 * If the page would become a chunk header, we won't save anything by
		 * converting it to lossy, so skip it.
		 */
		if ((page->blockno % PAGES_PER_CHUNK) == 0)
			continue;

		/* This does the dirty work ... */
		tbm_mark_page_lossy(tbm, page->blockno);

		if (tbm->nentries <= tbm->maxentries / 2)
		{
			/*
			 * We have made enough room. Remember where to start lossifying
			 * next round, so we evenly iterate over the hashtable.
			 */
			tbm->lossify_start = i.cur;
			break;
		}

		/*
		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
		 * hashtable and may have deleted the non-lossy chunk. We can
		 * continue the same hash table scan, since failure to visit one
		 * element or visiting the newly inserted element, isn't fatal.
		 */
	}

	/*
	 * With a big bitmap and small work_mem, it's possible that we cannot get
	 * under maxentries. Again, if that happens, we'd end up uselessly
	 * calling tbm_lossify over and over. To prevent this from becoming a
	 * performance sink, force maxentries up to at least double the current
	 * number of entries. (In essence, we're admitting inability to fit
	 * within work_mem when we do this.) Note that this test will not fire if
	 * we broke out of the loop early; and if we didn't, the current number of
	 * entries is simply not reducible any further.
	 */
	if (tbm->nentries > tbm->maxentries / 2)
		tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
}
1424 :
1425 : /*
1426 : * qsort comparator to handle PagetableEntry pointers.
1427 : */
1428 : static int
1429 1409520 : tbm_comparator(const void *left, const void *right)
1430 : {
1431 1409520 : BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1432 1409520 : BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1433 :
1434 1409520 : return pg_cmp_u32(l, r);
1435 : }
1436 :
1437 : /*
1438 : * As above, but this will get index into PagetableEntry array. Therefore,
1439 : * it needs to get actual PagetableEntry using the index before comparing the
1440 : * blockno.
1441 : */
1442 : static int
1443 238290 : tbm_shared_comparator(const void *left, const void *right, void *arg)
1444 : {
1445 238290 : PagetableEntry *base = (PagetableEntry *) arg;
1446 238290 : PagetableEntry *lpage = &base[*(int *) left];
1447 238290 : PagetableEntry *rpage = &base[*(int *) right];
1448 :
1449 238290 : if (lpage->blockno < rpage->blockno)
1450 108966 : return -1;
1451 129324 : else if (lpage->blockno > rpage->blockno)
1452 129324 : return 1;
1453 0 : return 0;
1454 : }
1455 :
/*
 * tbm_attach_shared_iterate
 *
 * Allocate a backend-private iterator and attach the shared iterator state
 * to it so that multiple processes can iterate jointly.
 *
 * We also convert the DSA pointers to local pointers and store them into
 * our private iterator.
 */
TBMSharedIterator *
tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
{
	TBMSharedIterator *iterator;
	TBMSharedIteratorState *istate;

	/*
	 * Create the backend-private TBMSharedIterator struct.  (Note: the
	 * allocation is exactly sizeof(TBMSharedIterator); there is no trailing
	 * space here, despite what an older comment claimed.)
	 */
	iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator));

	istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);

	iterator->state = istate;

	iterator->ptbase = dsa_get_address(dsa, istate->pagetable);

	/* Only resolve the index arrays that actually exist for this bitmap */
	if (istate->npages)
		iterator->ptpages = dsa_get_address(dsa, istate->spages);
	if (istate->nchunks)
		iterator->ptchunks = dsa_get_address(dsa, istate->schunks);

	return iterator;
}
1490 :
/*
 * pagetable_allocate
 *
 * Callback function for allocating the memory for hashtable elements.
 * Allocate memory for hashtable elements, using DSA if available.
 */
static inline void *
pagetable_allocate(pagetable_hash *pagetable, Size size)
{
	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
	PTEntryArray *ptbase;

	/* Without a DSA area, use ordinary backend-local memory */
	if (tbm->dsa == NULL)
		return MemoryContextAllocExtended(pagetable->ctx, size,
										  MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);

	/*
	 * Save the dsapagetable reference in dsapagetableold before allocating
	 * new memory so that pagetable_free can free the old entry.
	 */
	tbm->dsapagetableold = tbm->dsapagetable;
	tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
											  sizeof(PTEntryArray) + size,
											  DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
	/* Hand the hash table the entry array inside the PTEntryArray header */
	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

	return ptbase->ptentry;
}
1519 :
1520 : /*
1521 : * pagetable_free
1522 : *
1523 : * Callback function for freeing hash table elements.
1524 : */
1525 : static inline void
1526 9526 : pagetable_free(pagetable_hash *pagetable, void *pointer)
1527 : {
1528 9526 : TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1529 :
1530 : /* pfree the input pointer if DSA is not available */
1531 9526 : if (tbm->dsa == NULL)
1532 9370 : pfree(pointer);
1533 156 : else if (DsaPointerIsValid(tbm->dsapagetableold))
1534 : {
1535 84 : dsa_free(tbm->dsa, tbm->dsapagetableold);
1536 84 : tbm->dsapagetableold = InvalidDsaPointer;
1537 : }
1538 9526 : }
1539 :
1540 : /*
1541 : * tbm_calculate_entries
1542 : *
1543 : * Estimate number of hashtable entries we can have within maxbytes.
1544 : */
1545 : int
1546 709324 : tbm_calculate_entries(Size maxbytes)
1547 : {
1548 : Size nbuckets;
1549 :
1550 : /*
1551 : * Estimate number of hashtable entries we can have within maxbytes. This
1552 : * estimates the hash cost as sizeof(PagetableEntry), which is good enough
1553 : * for our purpose. Also count an extra Pointer per entry for the arrays
1554 : * created during iteration readout.
1555 : */
1556 709324 : nbuckets = maxbytes /
1557 : (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
1558 709324 : nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
1559 709324 : nbuckets = Max(nbuckets, 16); /* sanity limit */
1560 :
1561 709324 : return (int) nbuckets;
1562 : }
1563 :
1564 : /*
1565 : * Create a shared or private bitmap iterator and start iteration.
1566 : *
1567 : * `tbm` is only used to create the private iterator and dsa and dsp are only
1568 : * used to create the shared iterator.
1569 : *
1570 : * Before invoking tbm_begin_iterate() to create a shared iterator, one
1571 : * process must already have invoked tbm_prepare_shared_iterate() to create
1572 : * and set up the TBMSharedIteratorState.
1573 : */
1574 : TBMIterator
1575 20532 : tbm_begin_iterate(TIDBitmap *tbm, dsa_area *dsa, dsa_pointer dsp)
1576 : {
1577 20532 : TBMIterator iterator = {0};
1578 :
1579 : /* Allocate a private iterator and attach the shared state to it */
1580 20532 : if (DsaPointerIsValid(dsp))
1581 : {
1582 340 : iterator.shared = true;
1583 340 : iterator.i.shared_iterator = tbm_attach_shared_iterate(dsa, dsp);
1584 : }
1585 : else
1586 : {
1587 20192 : iterator.shared = false;
1588 20192 : iterator.i.private_iterator = tbm_begin_private_iterate(tbm);
1589 : }
1590 :
1591 20532 : return iterator;
1592 : }
1593 :
1594 : /*
1595 : * Clean up shared or private bitmap iterator.
1596 : */
1597 : void
1598 20418 : tbm_end_iterate(TBMIterator *iterator)
1599 : {
1600 : Assert(iterator && !tbm_exhausted(iterator));
1601 :
1602 20418 : if (iterator->shared)
1603 340 : tbm_end_shared_iterate(iterator->i.shared_iterator);
1604 : else
1605 20078 : tbm_end_private_iterate(iterator->i.private_iterator);
1606 :
1607 20418 : *iterator = (TBMIterator)
1608 : {
1609 : 0
1610 : };
1611 20418 : }
1612 :
1613 : /*
1614 : * Populate the next TBMIterateResult using the shared or private bitmap
1615 : * iterator. Returns false when there is nothing more to scan.
1616 : */
1617 : bool
1618 421750 : tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres)
1619 : {
1620 : Assert(iterator);
1621 : Assert(tbmres);
1622 :
1623 421750 : if (iterator->shared)
1624 30448 : return tbm_shared_iterate(iterator->i.shared_iterator, tbmres);
1625 : else
1626 391302 : return tbm_private_iterate(iterator->i.private_iterator, tbmres);
1627 : }
|