Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tidbitmap.c
4 : * PostgreSQL tuple-id (TID) bitmap package
5 : *
6 : * This module provides bitmap data structures that are spiritually
7 : * similar to Bitmapsets, but are specially adapted to store sets of
8 : * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9 : * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10 : * Also, since we wish to be able to store very large tuple sets in
11 : * memory with this data structure, we support "lossy" storage, in which
12 : * we no longer remember individual tuple offsets on a page but only the
13 : * fact that a particular page needs to be visited.
14 : *
15 : * The "lossy" storage uses one bit per disk page, so at the standard 8K
16 : * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17 : * of memory. People pushing around tables of that size should have a
18 : * couple of Mb to spare, so we don't worry about providing a second level
19 : * of lossiness. In theory we could fall back to page ranges at some
20 : * point, but for now that seems useless complexity.
21 : *
22 : * We also support the notion of candidate matches, or rechecking. This
23 : * means we know that a search need visit only some tuples on a page,
24 : * but we are not certain that all of those tuples are real matches.
25 : * So the eventual heap scan must recheck the quals for these tuples only,
26 : * rather than rechecking the quals for all tuples on the page as in the
27 : * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28 : * into a bitmap, and it can also happen internally when we AND a lossy
29 : * and a non-lossy page.
30 : *
31 : *
32 : * Copyright (c) 2003-2025, PostgreSQL Global Development Group
33 : *
34 : * IDENTIFICATION
35 : * src/backend/nodes/tidbitmap.c
36 : *
37 : *-------------------------------------------------------------------------
38 : */
39 : #include "postgres.h"
40 :
41 : #include <limits.h>
42 :
43 : #include "common/hashfn.h"
44 : #include "common/int.h"
45 : #include "nodes/bitmapset.h"
46 : #include "nodes/tidbitmap.h"
47 : #include "storage/lwlock.h"
48 : #include "utils/dsa.h"
49 :
50 : /*
51 : * When we have to switch over to lossy storage, we use a data structure
52 : * with one bit per page, where all pages having the same number DIV
53 : * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
54 : * and has the bit set for a given page, there must not be a per-page entry
55 : * for that page in the page table.
56 : *
57 : * We actually store both exact pages and lossy chunks in the same hash
58 : * table, using identical data structures. (This is because the memory
59 : * management for hashtables doesn't easily/efficiently allow space to be
60 : * transferred easily from one hashtable to another.) Therefore it's best
61 : * if PAGES_PER_CHUNK is the same as TBM_MAX_TUPLES_PER_PAGE, or at least not
62 : * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
63 : * avoid expensive integer remainder operations. So, define it like this:
64 : */
65 : #define PAGES_PER_CHUNK (BLCKSZ / 32)
66 :
67 : /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
68 :
69 : #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
70 : #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
71 :
72 : /* number of active words for an exact page: */
73 : #define WORDS_PER_PAGE ((TBM_MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
74 : /* number of active words for a lossy chunk: */
75 : #define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
76 :
77 : /*
78 : * The hashtable entries are represented by this data structure. For
79 : * an exact page, blockno is the page number and bit k of the bitmap
80 : * represents tuple offset k+1. For a lossy chunk, blockno is the first
81 : * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
82 : * bit k represents page blockno+k. Note that it is not possible to
83 : * have exact storage for the first page of a chunk if we are using
84 : * lossy storage for any page in the chunk's range, since the same
85 : * hashtable entry has to serve both purposes.
86 : *
87 : * recheck is used only on exact pages --- it indicates that although
88 : * only the stated tuples need be checked, the full index qual condition
89 : * must be checked for each (ie, these are candidate matches).
90 : */
typedef struct PagetableEntry
{
	BlockNumber blockno;		/* page number (hashtable key) */
	char		status;			/* hash entry status (managed by simplehash) */
	bool		ischunk;		/* T = lossy storage, F = exact */
	bool		recheck;		/* should the tuples be rechecked? */
	/* bit array; sized to serve as either an exact page or a lossy chunk */
	bitmapword	words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
} PagetableEntry;

/*
 * Holds array of pagetable entries.
 */
typedef struct PTEntryArray
{
	pg_atomic_uint32 refcount;	/* number of iterators attached */
	PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
} PTEntryArray;
108 :
/*
 * We want to avoid the overhead of creating the hashtable, which is
 * comparatively large, when not necessary. Particularly when we are using a
 * bitmap scan on the inside of a nestloop join: a bitmap may well live only
 * long enough to accumulate one entry in such cases. We therefore avoid
 * creating an actual hashtable until we need two pagetable entries. When
 * just one pagetable entry is needed, we store it in a fixed field of
 * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
 * shrinks down to zero or one page again. So, status can be TBM_HASH even
 * when nentries is zero or one.)
 */
typedef enum
{
	TBM_EMPTY,					/* no hashtable, nentries == 0 */
	TBM_ONE_PAGE,				/* entry1 contains the single entry */
	TBM_HASH,					/* pagetable is valid, entry1 is not */
} TBMStatus;

/*
 * Current iterating state of the TBM.  Once iteration has begun, the bitmap
 * becomes read-only (see the TBMPrivateIterator comments below).
 */
typedef enum
{
	TBM_NOT_ITERATING,			/* not yet converted to page and chunk array */
	TBM_ITERATING_PRIVATE,		/* converted to local page and chunk array */
	TBM_ITERATING_SHARED,		/* converted to shared page and chunk array */
} TBMIteratingState;
136 :
/*
 * Here is the representation for a whole TIDBitMap:
 */
struct TIDBitmap
{
	NodeTag		type;			/* to make it a valid Node */
	MemoryContext mcxt;			/* memory context containing me */
	TBMStatus	status;			/* see codes above */
	struct pagetable_hash *pagetable;	/* hash table of PagetableEntry's */
	int			nentries;		/* number of entries in pagetable */
	int			maxentries;		/* limit on same to meet maxbytes */
	int			npages;			/* number of exact entries in pagetable */
	int			nchunks;		/* number of lossy entries in pagetable */
	TBMIteratingState iterating;	/* tbm_begin_iterate called? */
	uint32		lossify_start;	/* offset to start lossifying hashtable at */
	PagetableEntry entry1;		/* used when status == TBM_ONE_PAGE */
	/* these are valid when iterating is not TBM_NOT_ITERATING: */
	PagetableEntry **spages;	/* sorted exact-page list, or NULL */
	PagetableEntry **schunks;	/* sorted lossy-chunk list, or NULL */
	dsa_pointer dsapagetable;	/* dsa_pointer to the element array */
	dsa_pointer dsapagetableold;	/* dsa_pointer to the old element array */
	dsa_pointer ptpages;		/* dsa_pointer to the page array */
	dsa_pointer ptchunks;		/* dsa_pointer to the chunk array */
	dsa_area   *dsa;			/* reference to per-query dsa area */
};
162 :
/*
 * When iterating over a backend-local bitmap in sorted order, a
 * TBMPrivateIterator is used to track our progress.  There can be several
 * iterators scanning the same bitmap concurrently.  Note that the bitmap
 * becomes read-only as soon as any iterator is created.
 */
struct TBMPrivateIterator
{
	TIDBitmap  *tbm;			/* TIDBitmap we're iterating over */
	int			spageptr;		/* next spages index */
	int			schunkptr;		/* next schunks index */
	int			schunkbit;		/* next bit to check in current schunk */
};

/*
 * Holds the shared members of the iterator so that multiple processes
 * can jointly iterate.
 */
typedef struct TBMSharedIteratorState
{
	int			nentries;		/* number of entries in pagetable */
	int			maxentries;		/* limit on same to meet maxbytes */
	int			npages;			/* number of exact entries in pagetable */
	int			nchunks;		/* number of lossy entries in pagetable */
	dsa_pointer pagetable;		/* dsa pointers to head of pagetable data */
	dsa_pointer spages;			/* dsa pointer to page array */
	dsa_pointer schunks;		/* dsa pointer to chunk array */
	LWLock		lock;			/* lock to protect below members */
	int			spageptr;		/* next spages index */
	int			schunkptr;		/* next schunks index */
	int			schunkbit;		/* next bit to check in current schunk */
} TBMSharedIteratorState;

/*
 * pagetable iteration array.
 */
typedef struct PTIterationArray
{
	pg_atomic_uint32 refcount;	/* number of iterators attached */
	int			index[FLEXIBLE_ARRAY_MEMBER];	/* index array */
} PTIterationArray;

/*
 * Same as TBMPrivateIterator, but used for joint (parallel) iteration;
 * therefore this also holds a reference to the shared state.
 */
struct TBMSharedIterator
{
	TBMSharedIteratorState *state;	/* shared state */
	PTEntryArray *ptbase;		/* pagetable element array */
	PTIterationArray *ptpages;	/* sorted exact page index list */
	PTIterationArray *ptchunks; /* sorted lossy page index list */
};
216 :
217 : /* Local function prototypes */
218 : static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
219 : static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
220 : const TIDBitmap *b);
221 : static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
222 : BlockNumber pageno);
223 : static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
224 : static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
225 : static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
226 : static void tbm_lossify(TIDBitmap *tbm);
227 : static int tbm_comparator(const void *left, const void *right);
228 : static int tbm_shared_comparator(const void *left, const void *right,
229 : void *arg);
230 :
231 : /* define hashtable mapping block numbers to PagetableEntry's */
232 : #define SH_USE_NONDEFAULT_ALLOCATOR
233 : #define SH_PREFIX pagetable
234 : #define SH_ELEMENT_TYPE PagetableEntry
235 : #define SH_KEY_TYPE BlockNumber
236 : #define SH_KEY blockno
237 : #define SH_HASH_KEY(tb, key) murmurhash32(key)
238 : #define SH_EQUAL(tb, a, b) a == b
239 : #define SH_SCOPE static inline
240 : #define SH_DEFINE
241 : #define SH_DECLARE
242 : #include "lib/simplehash.h"
243 :
244 :
245 : /*
246 : * tbm_create - create an initially-empty bitmap
247 : *
248 : * The bitmap will live in the memory context that is CurrentMemoryContext
249 : * at the time of this call. It will be limited to (approximately) maxbytes
250 : * total memory consumption. If the DSA passed to this function is not NULL
251 : * then the memory for storing elements of the underlying page table will
252 : * be allocated from the DSA.
253 : */
254 : TIDBitmap *
255 26036 : tbm_create(Size maxbytes, dsa_area *dsa)
256 : {
257 : TIDBitmap *tbm;
258 :
259 : /* Create the TIDBitmap struct and zero all its fields */
260 26036 : tbm = makeNode(TIDBitmap);
261 :
262 26036 : tbm->mcxt = CurrentMemoryContext;
263 26036 : tbm->status = TBM_EMPTY;
264 :
265 26036 : tbm->maxentries = tbm_calculate_entries(maxbytes);
266 26036 : tbm->lossify_start = 0;
267 26036 : tbm->dsa = dsa;
268 26036 : tbm->dsapagetable = InvalidDsaPointer;
269 26036 : tbm->dsapagetableold = InvalidDsaPointer;
270 26036 : tbm->ptpages = InvalidDsaPointer;
271 26036 : tbm->ptchunks = InvalidDsaPointer;
272 :
273 26036 : return tbm;
274 : }
275 :
/*
 * Actually create the hashtable.  Since this is a moderately expensive
 * proposition, we don't do it until we have to.
 */
static void
tbm_create_pagetable(TIDBitmap *tbm)
{
	Assert(tbm->status != TBM_HASH);
	Assert(tbm->pagetable == NULL);

	tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);

	/* If entry1 is valid, push it into the hashtable */
	if (tbm->status == TBM_ONE_PAGE)
	{
		PagetableEntry *page;
		bool		found;
		char		oldstatus;

		page = pagetable_insert(tbm->pagetable,
								tbm->entry1.blockno,
								&found);
		Assert(!found);

		/*
		 * Copy entry1 over the new hash entry wholesale, but preserve the
		 * entry's status byte, which pagetable_insert just set up; entry1's
		 * copy of that field is meaningless in the hash table.
		 */
		oldstatus = page->status;
		memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
		page->status = oldstatus;
	}

	tbm->status = TBM_HASH;
}
306 :
307 : /*
308 : * tbm_free - free a TIDBitmap
309 : */
310 : void
311 25934 : tbm_free(TIDBitmap *tbm)
312 : {
313 25934 : if (tbm->pagetable)
314 8620 : pagetable_destroy(tbm->pagetable);
315 25934 : if (tbm->spages)
316 5496 : pfree(tbm->spages);
317 25934 : if (tbm->schunks)
318 2886 : pfree(tbm->schunks);
319 25934 : pfree(tbm);
320 25934 : }
321 :
/*
 * tbm_free_shared_area - free shared state
 *
 * Free the shared iterator state.  Also free the shared pagetable and
 * iterator arrays if they are no longer referenced by any shared iterator,
 * i.e. once their refcount drops to zero.
 */
void
tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
{
	TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
	PTEntryArray *ptbase;
	PTIterationArray *ptpages;
	PTIterationArray *ptchunks;

	if (DsaPointerIsValid(istate->pagetable))
	{
		ptbase = dsa_get_address(dsa, istate->pagetable);
		/* release the element array once the last reference is gone */
		if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
			dsa_free(dsa, istate->pagetable);
	}
	if (DsaPointerIsValid(istate->spages))
	{
		ptpages = dsa_get_address(dsa, istate->spages);
		if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
			dsa_free(dsa, istate->spages);
	}
	if (DsaPointerIsValid(istate->schunks))
	{
		ptchunks = dsa_get_address(dsa, istate->schunks);
		if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
			dsa_free(dsa, istate->schunks);
	}

	/* the iterator state itself is always freed here */
	dsa_free(dsa, dp);
}
358 :
/*
 * tbm_add_tuples - add some tuple IDs to a TIDBitmap
 *
 * tids is an array of ntids ItemPointers to enter into the bitmap.
 *
 * If recheck is true, then the recheck flag will be set in the
 * TBMIterateResult when any of these tuples are reported out.
 */
void
tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
			   bool recheck)
{
	BlockNumber currblk = InvalidBlockNumber;
	PagetableEntry *page = NULL;	/* only valid when currblk is valid */
	int			i;

	/* Modifying the bitmap is disallowed once iteration has begun */
	Assert(tbm->iterating == TBM_NOT_ITERATING);
	for (i = 0; i < ntids; i++)
	{
		BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
		OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
		int			wordnum,
					bitnum;

		/* safety check to ensure we don't overrun bit array bounds */
		if (off < 1 || off > TBM_MAX_TUPLES_PER_PAGE)
			elog(ERROR, "tuple offset out of range: %u", off);

		/*
		 * Look up target page unless we already did.  This saves cycles when
		 * the input includes consecutive tuples on the same page, which is
		 * common enough to justify an extra test here.
		 */
		if (blk != currblk)
		{
			if (tbm_page_is_lossy(tbm, blk))
				page = NULL;	/* remember page is lossy */
			else
				page = tbm_get_pageentry(tbm, blk);
			currblk = blk;
		}

		if (page == NULL)
			continue;			/* whole page is already marked */

		if (page->ischunk)
		{
			/* The page is a lossy chunk header, set bit for itself */
			wordnum = bitnum = 0;
		}
		else
		{
			/* Page is exact, so set bit for individual tuple */
			wordnum = WORDNUM(off - 1);
			bitnum = BITNUM(off - 1);
		}
		page->words[wordnum] |= ((bitmapword) 1 << bitnum);
		page->recheck |= recheck;

		if (tbm->nentries > tbm->maxentries)
		{
			tbm_lossify(tbm);
			/* Page could have been converted to lossy, so force new lookup */
			currblk = InvalidBlockNumber;
		}
	}
}
424 :
425 : /*
426 : * tbm_add_page - add a whole page to a TIDBitmap
427 : *
428 : * This causes the whole page to be reported (with the recheck flag)
429 : * when the TIDBitmap is scanned.
430 : */
431 : void
432 147980 : tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
433 : {
434 : /* Enter the page in the bitmap, or mark it lossy if already present */
435 147980 : tbm_mark_page_lossy(tbm, pageno);
436 : /* If we went over the memory limit, lossify some more pages */
437 147980 : if (tbm->nentries > tbm->maxentries)
438 0 : tbm_lossify(tbm);
439 147980 : }
440 :
441 : /*
442 : * tbm_union - set union
443 : *
444 : * a is modified in-place, b is not changed
445 : */
446 : void
447 0 : tbm_union(TIDBitmap *a, const TIDBitmap *b)
448 : {
449 : Assert(!a->iterating);
450 : /* Nothing to do if b is empty */
451 0 : if (b->nentries == 0)
452 0 : return;
453 : /* Scan through chunks and pages in b, merge into a */
454 0 : if (b->status == TBM_ONE_PAGE)
455 0 : tbm_union_page(a, &b->entry1);
456 : else
457 : {
458 : pagetable_iterator i;
459 : PagetableEntry *bpage;
460 :
461 : Assert(b->status == TBM_HASH);
462 0 : pagetable_start_iterate(b->pagetable, &i);
463 0 : while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
464 0 : tbm_union_page(a, bpage);
465 : }
466 : }
467 :
/* Process one page of b during a union op */
static void
tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
{
	PagetableEntry *apage;
	int			wordnum;

	if (bpage->ischunk)
	{
		/* Scan b's chunk, mark each indicated page lossy in a */
		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
		{
			bitmapword	w = bpage->words[wordnum];

			if (w != 0)
			{
				BlockNumber pg;

				/* first page number represented by this word */
				pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
				while (w != 0)
				{
					if (w & 1)
						tbm_mark_page_lossy(a, pg);
					pg++;
					w >>= 1;
				}
			}
		}
	}
	else if (tbm_page_is_lossy(a, bpage->blockno))
	{
		/* page is already lossy in a, nothing to do */
		return;
	}
	else
	{
		apage = tbm_get_pageentry(a, bpage->blockno);
		if (apage->ischunk)
		{
			/* The page is a lossy chunk header, set bit for itself */
			apage->words[0] |= ((bitmapword) 1 << 0);
		}
		else
		{
			/* Both pages are exact, merge at the bit level */
			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
				apage->words[wordnum] |= bpage->words[wordnum];
			apage->recheck |= bpage->recheck;
		}
	}

	/* The merge may have created new entries; lossify if over budget */
	if (a->nentries > a->maxentries)
		tbm_lossify(a);
}
522 :
/*
 * tbm_intersect - set intersection
 *
 * a is modified in-place, b is not changed
 */
void
tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
{
	Assert(!a->iterating);
	/* Nothing to do if a is empty */
	if (a->nentries == 0)
		return;
	/* Scan through chunks and pages in a, try to match to b */
	if (a->status == TBM_ONE_PAGE)
	{
		if (tbm_intersect_page(a, &a->entry1, b))
		{
			/* Page is now empty, remove it from a */
			Assert(!a->entry1.ischunk);
			a->npages--;
			a->nentries--;
			Assert(a->nentries == 0);
			a->status = TBM_EMPTY;
		}
	}
	else
	{
		pagetable_iterator i;
		PagetableEntry *apage;

		Assert(a->status == TBM_HASH);
		pagetable_start_iterate(a->pagetable, &i);
		while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
		{
			if (tbm_intersect_page(a, apage, b))
			{
				/* Page or chunk is now empty, remove it from a */
				if (apage->ischunk)
					a->nchunks--;
				else
					a->npages--;
				a->nentries--;

				/*
				 * NOTE(review): this deletes the entry the iterator is
				 * currently positioned on; relies on simplehash permitting
				 * deletion of the current element during iteration --
				 * confirm against lib/simplehash.h.
				 */
				if (!pagetable_delete(a->pagetable, apage->blockno))
					elog(ERROR, "hash table corrupted");
			}
		}
	}
}
571 :
/*
 * Process one page of a during an intersection op
 *
 * Returns true if apage is now empty and should be deleted from a
 */
static bool
tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
{
	const PagetableEntry *bpage;
	int			wordnum;

	if (apage->ischunk)
	{
		/* Scan each bit in chunk, try to clear */
		bool		candelete = true;

		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
		{
			bitmapword	w = apage->words[wordnum];

			if (w != 0)
			{
				bitmapword	neww = w;
				BlockNumber pg;
				int			bitnum;	/* bit position within this word */

				pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
				bitnum = 0;
				while (w != 0)
				{
					if (w & 1)
					{
						if (!tbm_page_is_lossy(b, pg) &&
							tbm_find_pageentry(b, pg) == NULL)
						{
							/* Page is not in b at all, lose lossy bit */
							neww &= ~((bitmapword) 1 << bitnum);
						}
						/* else: page exists in b, keep the (lossy) bit */
					}
					pg++;
					bitnum++;
					w >>= 1;
				}
				apage->words[wordnum] = neww;
				if (neww != 0)
					candelete = false;
			}
		}
		return candelete;
	}
	else if (tbm_page_is_lossy(b, apage->blockno))
	{
		/*
		 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
		 * because the page 'b' is lossy, we don't know which ones. Therefore
		 * we mark 'a' as requiring rechecks, to indicate that at most those
		 * tuples set in 'a' are matches.
		 */
		apage->recheck = true;
		return false;
	}
	else
	{
		bool		candelete = true;

		bpage = tbm_find_pageentry(b, apage->blockno);
		if (bpage != NULL)
		{
			/* Both pages are exact, merge at the bit level */
			Assert(!bpage->ischunk);
			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
			{
				apage->words[wordnum] &= bpage->words[wordnum];
				if (apage->words[wordnum] != 0)
					candelete = false;
			}
			apage->recheck |= bpage->recheck;
		}
		/* If there is no matching b page, we can just delete the a page */
		return candelete;
	}
}
654 :
655 : /*
656 : * tbm_is_empty - is a TIDBitmap completely empty?
657 : */
658 : bool
659 890 : tbm_is_empty(const TIDBitmap *tbm)
660 : {
661 890 : return (tbm->nentries == 0);
662 : }
663 :
/*
 * tbm_begin_private_iterate - prepare to iterate through a TIDBitmap
 *
 * The TBMPrivateIterator struct is created in the caller's memory context.
 * For a clean shutdown of the iteration, call tbm_end_private_iterate; but
 * it's okay to just allow the memory context to be released, too.  It is
 * caller's responsibility not to touch the TBMPrivateIterator anymore once
 * the TIDBitmap is freed.
 *
 * NB: after this is called, it is no longer allowed to modify the contents
 * of the bitmap.  However, you can call this multiple times to scan the
 * contents repeatedly, including parallel scans.
 */
TBMPrivateIterator *
tbm_begin_private_iterate(TIDBitmap *tbm)
{
	TBMPrivateIterator *iterator;

	Assert(tbm->iterating != TBM_ITERATING_SHARED);

	/* Create the TBMPrivateIterator struct */
	iterator = (TBMPrivateIterator *) palloc(sizeof(TBMPrivateIterator));
	iterator->tbm = tbm;

	/*
	 * Initialize iteration pointers.
	 */
	iterator->spageptr = 0;
	iterator->schunkptr = 0;
	iterator->schunkbit = 0;

	/*
	 * If we have a hashtable, create and fill the sorted page lists, unless
	 * we already did that for a previous iterator.  Note that the lists are
	 * attached to the bitmap not the iterator, so they can be used by more
	 * than one iterator.
	 */
	if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
	{
		pagetable_iterator i;
		PagetableEntry *page;
		int			npages;
		int			nchunks;

		if (!tbm->spages && tbm->npages > 0)
			tbm->spages = (PagetableEntry **)
				MemoryContextAlloc(tbm->mcxt,
								   tbm->npages * sizeof(PagetableEntry *));
		if (!tbm->schunks && tbm->nchunks > 0)
			tbm->schunks = (PagetableEntry **)
				MemoryContextAlloc(tbm->mcxt,
								   tbm->nchunks * sizeof(PagetableEntry *));

		npages = nchunks = 0;
		pagetable_start_iterate(tbm->pagetable, &i);
		while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
		{
			if (page->ischunk)
				tbm->schunks[nchunks++] = page;
			else
				tbm->spages[npages++] = page;
		}
		Assert(npages == tbm->npages);
		Assert(nchunks == tbm->nchunks);

		/* Sort both lists into ascending block-number order */
		if (npages > 1)
			qsort(tbm->spages, npages, sizeof(PagetableEntry *),
				  tbm_comparator);
		if (nchunks > 1)
			qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
				  tbm_comparator);
	}

	tbm->iterating = TBM_ITERATING_PRIVATE;

	return iterator;
}
743 :
/*
 * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
 *
 * The necessary shared state will be allocated from the DSA passed to
 * tbm_create, so that multiple processes can attach to it and iterate jointly.
 *
 * This will convert the pagetable hash into page and chunk array of the index
 * into pagetable array.
 */
dsa_pointer
tbm_prepare_shared_iterate(TIDBitmap *tbm)
{
	dsa_pointer dp;
	TBMSharedIteratorState *istate;
	PTEntryArray *ptbase = NULL;
	PTIterationArray *ptpages = NULL;
	PTIterationArray *ptchunks = NULL;

	Assert(tbm->dsa != NULL);
	Assert(tbm->iterating != TBM_ITERATING_PRIVATE);

	/*
	 * Allocate TBMSharedIteratorState from DSA to hold the shared members
	 * and lock; this will also be used by multiple workers during shared
	 * iteration.
	 */
	dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
	istate = dsa_get_address(tbm->dsa, dp);

	/*
	 * If we're not already iterating, create and fill the sorted page lists.
	 * (If we are, the sorted page lists are already stored in the TIDBitmap,
	 * and we can just reuse them.)
	 */
	if (tbm->iterating == TBM_NOT_ITERATING)
	{
		pagetable_iterator i;
		PagetableEntry *page;
		int			idx;
		int			npages;
		int			nchunks;

		/*
		 * Allocate the page and chunk array memory from the DSA to share
		 * across multiple processes.
		 */
		if (tbm->npages)
		{
			tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
										tbm->npages * sizeof(int));
			ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
			pg_atomic_init_u32(&ptpages->refcount, 0);
		}
		if (tbm->nchunks)
		{
			tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
										 tbm->nchunks * sizeof(int));
			ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
			pg_atomic_init_u32(&ptchunks->refcount, 0);
		}

		/*
		 * If TBM status is TBM_HASH then iterate over the pagetable and
		 * convert it to page and chunk arrays.  But if it's in the
		 * TBM_ONE_PAGE mode then directly allocate the space for one entry
		 * from the DSA.
		 */
		npages = nchunks = 0;
		if (tbm->status == TBM_HASH)
		{
			/* the hash table's element array already lives in the DSA */
			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

			pagetable_start_iterate(tbm->pagetable, &i);
			while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
			{
				/* record each entry's position within the element array */
				idx = page - ptbase->ptentry;
				if (page->ischunk)
					ptchunks->index[nchunks++] = idx;
				else
					ptpages->index[npages++] = idx;
			}

			Assert(npages == tbm->npages);
			Assert(nchunks == tbm->nchunks);
		}
		else if (tbm->status == TBM_ONE_PAGE)
		{
			/*
			 * In one page mode allocate the space for one pagetable entry,
			 * initialize it, and directly store its index (i.e. 0) in the
			 * page array.
			 */
			tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
											 sizeof(PagetableEntry));
			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
			memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
			ptpages->index[0] = 0;
		}

		if (ptbase != NULL)
			pg_atomic_init_u32(&ptbase->refcount, 0);
		/* Sort the index arrays into block-number order */
		if (npages > 1)
			qsort_arg(ptpages->index, npages, sizeof(int),
					  tbm_shared_comparator, ptbase->ptentry);
		if (nchunks > 1)
			qsort_arg(ptchunks->index, nchunks, sizeof(int),
					  tbm_shared_comparator, ptbase->ptentry);
	}

	/*
	 * Store the TBM members in the shared state so that we can share them
	 * across multiple processes.
	 */
	istate->nentries = tbm->nentries;
	istate->maxentries = tbm->maxentries;
	istate->npages = tbm->npages;
	istate->nchunks = tbm->nchunks;
	istate->pagetable = tbm->dsapagetable;
	istate->spages = tbm->ptpages;
	istate->schunks = tbm->ptchunks;

	/* re-fetch addresses; the pointers above may not have been set yet */
	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
	ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
	ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);

	/*
	 * For every shared iterator referring to pagetable and iterator array,
	 * increase the refcount by 1 so that while freeing the shared iterator
	 * we don't free pagetable and iterator array until its refcount becomes
	 * 0.
	 */
	if (ptbase != NULL)
		pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
	if (ptpages != NULL)
		pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
	if (ptchunks != NULL)
		pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);

	/* Initialize the iterator lock */
	LWLockInitialize(&istate->lock, LWTRANCHE_SHARED_TIDBITMAP);

	/* Initialize the shared iterator state */
	istate->schunkbit = 0;
	istate->schunkptr = 0;
	istate->spageptr = 0;

	tbm->iterating = TBM_ITERATING_SHARED;

	return dp;
}
892 :
893 : /*
894 : * tbm_extract_page_tuple - extract the tuple offsets from a page
895 : *
896 : * Returns the number of offsets it filled in if <= max_offsets. Otherwise,
897 : * fills in as many offsets as fit and returns the total number of offsets in
898 : * the page.
899 : */
900 : int
901 241676 : tbm_extract_page_tuple(TBMIterateResult *iteritem,
902 : OffsetNumber *offsets,
903 : uint32 max_offsets)
904 : {
905 241676 : PagetableEntry *page = iteritem->internal_page;
906 : int wordnum;
907 241676 : int ntuples = 0;
908 :
909 1450056 : for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
910 : {
911 1208380 : bitmapword w = page->words[wordnum];
912 :
913 1208380 : if (w != 0)
914 : {
915 561358 : int off = wordnum * BITS_PER_BITMAPWORD + 1;
916 :
917 24298048 : while (w != 0)
918 : {
919 23736690 : if (w & 1)
920 : {
921 5914300 : if (ntuples < max_offsets)
922 5914300 : offsets[ntuples] = (OffsetNumber) off;
923 5914300 : ntuples++;
924 : }
925 23736690 : off++;
926 23736690 : w >>= 1;
927 : }
928 : }
929 : }
930 :
931 241676 : return ntuples;
932 : }
933 :
934 : /*
935 : * tbm_advance_schunkbit - Advance the schunkbit
936 : */
937 : static inline void
938 169532 : tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
939 : {
940 169532 : int schunkbit = *schunkbitp;
941 :
942 770988 : while (schunkbit < PAGES_PER_CHUNK)
943 : {
944 768012 : int wordnum = WORDNUM(schunkbit);
945 768012 : int bitnum = BITNUM(schunkbit);
946 :
947 768012 : if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
948 166556 : break;
949 601456 : schunkbit++;
950 : }
951 :
952 169532 : *schunkbitp = schunkbit;
953 169532 : }
954 :
955 : /*
956 : * tbm_private_iterate - scan through next page of a TIDBitmap
957 : *
958 : * Caller must pass in a TBMIterateResult to be filled.
959 : *
960 : * Pages are guaranteed to be delivered in numerical order.
961 : *
962 : * Returns false when there are no more pages to scan and true otherwise. When
963 : * there are no more pages to scan, tbmres->blockno is set to
964 : * InvalidBlockNumber.
965 : *
966 : * If lossy is true, then the bitmap is "lossy" and failed to remember
967 : * the exact tuples to look at on this page --- the caller must examine all
968 : * tuples on the page and check if they meet the intended condition. If lossy
969 : * is false, the caller must later extract the tuple offsets from the page
970 : * pointed to by internal_page with tbm_extract_page_tuple.
971 : *
972 : * If tbmres->recheck is true, only the indicated tuples need be examined, but
973 : * the condition must be rechecked anyway. (For ease of testing, recheck is
974 : * always set true when lossy is true.)
975 : */
976 : bool
977 397640 : tbm_private_iterate(TBMPrivateIterator *iterator, TBMIterateResult *tbmres)
978 : {
979 397640 : TIDBitmap *tbm = iterator->tbm;
980 :
981 : Assert(tbm->iterating == TBM_ITERATING_PRIVATE);
982 :
983 : /*
984 : * If lossy chunk pages remain, make sure we've advanced schunkptr/
985 : * schunkbit to the next set bit.
986 : */
987 400592 : while (iterator->schunkptr < tbm->nchunks)
988 : {
989 163382 : PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
990 163382 : int schunkbit = iterator->schunkbit;
991 :
992 163382 : tbm_advance_schunkbit(chunk, &schunkbit);
993 163382 : if (schunkbit < PAGES_PER_CHUNK)
994 : {
995 160430 : iterator->schunkbit = schunkbit;
996 160430 : break;
997 : }
998 : /* advance to next chunk */
999 2952 : iterator->schunkptr++;
1000 2952 : iterator->schunkbit = 0;
1001 : }
1002 :
1003 : /*
1004 : * If both chunk and per-page data remain, must output the numerically
1005 : * earlier page.
1006 : */
1007 397640 : if (iterator->schunkptr < tbm->nchunks)
1008 : {
1009 160430 : PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
1010 : BlockNumber chunk_blockno;
1011 :
1012 160430 : chunk_blockno = chunk->blockno + iterator->schunkbit;
1013 160430 : if (iterator->spageptr >= tbm->npages ||
1014 12450 : chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
1015 : {
1016 : /* Return a lossy page indicator from the chunk */
1017 157298 : tbmres->blockno = chunk_blockno;
1018 157298 : tbmres->lossy = true;
1019 157298 : tbmres->recheck = true;
1020 157298 : tbmres->internal_page = NULL;
1021 157298 : iterator->schunkbit++;
1022 157298 : return true;
1023 : }
1024 : }
1025 :
1026 240342 : if (iterator->spageptr < tbm->npages)
1027 : {
1028 : PagetableEntry *page;
1029 :
1030 : /* In TBM_ONE_PAGE state, we don't allocate an spages[] array */
1031 214696 : if (tbm->status == TBM_ONE_PAGE)
1032 7688 : page = &tbm->entry1;
1033 : else
1034 207008 : page = tbm->spages[iterator->spageptr];
1035 :
1036 214696 : tbmres->internal_page = page;
1037 214696 : tbmres->blockno = page->blockno;
1038 214696 : tbmres->lossy = false;
1039 214696 : tbmres->recheck = page->recheck;
1040 214696 : iterator->spageptr++;
1041 214696 : return true;
1042 : }
1043 :
1044 : /* Nothing more in the bitmap */
1045 25646 : tbmres->blockno = InvalidBlockNumber;
1046 25646 : return false;
1047 : }
1048 :
1049 : /*
1050 : * tbm_shared_iterate - scan through next page of a TIDBitmap
1051 : *
1052 : * As above, but this will iterate using an iterator which is shared
1053 : * across multiple processes. We need to acquire the iterator LWLock,
1054 : * before accessing the shared members.
1055 : */
1056 : bool
1057 30448 : tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult *tbmres)
1058 : {
1059 30448 : TBMSharedIteratorState *istate = iterator->state;
1060 30448 : PagetableEntry *ptbase = NULL;
1061 30448 : int *idxpages = NULL;
1062 30448 : int *idxchunks = NULL;
1063 :
1064 30448 : if (iterator->ptbase != NULL)
1065 30448 : ptbase = iterator->ptbase->ptentry;
1066 30448 : if (iterator->ptpages != NULL)
1067 30448 : idxpages = iterator->ptpages->index;
1068 30448 : if (iterator->ptchunks != NULL)
1069 7440 : idxchunks = iterator->ptchunks->index;
1070 :
1071 : /* Acquire the LWLock before accessing the shared members */
1072 30448 : LWLockAcquire(&istate->lock, LW_EXCLUSIVE);
1073 :
1074 : /*
1075 : * If lossy chunk pages remain, make sure we've advanced schunkptr/
1076 : * schunkbit to the next set bit.
1077 : */
1078 30472 : while (istate->schunkptr < istate->nchunks)
1079 : {
1080 6150 : PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1081 6150 : int schunkbit = istate->schunkbit;
1082 :
1083 6150 : tbm_advance_schunkbit(chunk, &schunkbit);
1084 6150 : if (schunkbit < PAGES_PER_CHUNK)
1085 : {
1086 6126 : istate->schunkbit = schunkbit;
1087 6126 : break;
1088 : }
1089 : /* advance to next chunk */
1090 24 : istate->schunkptr++;
1091 24 : istate->schunkbit = 0;
1092 : }
1093 :
1094 : /*
1095 : * If both chunk and per-page data remain, must output the numerically
1096 : * earlier page.
1097 : */
1098 30448 : if (istate->schunkptr < istate->nchunks)
1099 : {
1100 6126 : PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1101 : BlockNumber chunk_blockno;
1102 :
1103 6126 : chunk_blockno = chunk->blockno + istate->schunkbit;
1104 :
1105 6126 : if (istate->spageptr >= istate->npages ||
1106 6126 : chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
1107 : {
1108 : /* Return a lossy page indicator from the chunk */
1109 3102 : tbmres->blockno = chunk_blockno;
1110 3102 : tbmres->lossy = true;
1111 3102 : tbmres->recheck = true;
1112 3102 : tbmres->internal_page = NULL;
1113 3102 : istate->schunkbit++;
1114 :
1115 3102 : LWLockRelease(&istate->lock);
1116 3102 : return true;
1117 : }
1118 : }
1119 :
1120 27346 : if (istate->spageptr < istate->npages)
1121 : {
1122 27006 : PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
1123 :
1124 27006 : tbmres->internal_page = page;
1125 27006 : tbmres->blockno = page->blockno;
1126 27006 : tbmres->lossy = false;
1127 27006 : tbmres->recheck = page->recheck;
1128 27006 : istate->spageptr++;
1129 :
1130 27006 : LWLockRelease(&istate->lock);
1131 :
1132 27006 : return true;
1133 : }
1134 :
1135 340 : LWLockRelease(&istate->lock);
1136 :
1137 : /* Nothing more in the bitmap */
1138 340 : tbmres->blockno = InvalidBlockNumber;
1139 340 : return false;
1140 : }
1141 :
1142 : /*
1143 : * tbm_end_private_iterate - finish an iteration over a TIDBitmap
1144 : *
1145 : * Currently this is just a pfree, but it might do more someday. (For
1146 : * instance, it could be useful to count open iterators and allow the
1147 : * bitmap to return to read/write status when there are no more iterators.)
1148 : */
1149 : void
1150 25570 : tbm_end_private_iterate(TBMPrivateIterator *iterator)
1151 : {
1152 25570 : pfree(iterator);
1153 25570 : }
1154 :
1155 : /*
1156 : * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
1157 : *
1158 : * This doesn't free any of the shared state associated with the iterator,
1159 : * just our backend-private state.
1160 : */
1161 : void
1162 340 : tbm_end_shared_iterate(TBMSharedIterator *iterator)
1163 : {
1164 340 : pfree(iterator);
1165 340 : }
1166 :
1167 : /*
1168 : * tbm_find_pageentry - find a PagetableEntry for the pageno
1169 : *
1170 : * Returns NULL if there is no non-lossy entry for the pageno.
1171 : */
1172 : static const PagetableEntry *
1173 11594 : tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
1174 : {
1175 : const PagetableEntry *page;
1176 :
1177 11594 : if (tbm->nentries == 0) /* in case pagetable doesn't exist */
1178 0 : return NULL;
1179 :
1180 11594 : if (tbm->status == TBM_ONE_PAGE)
1181 : {
1182 0 : page = &tbm->entry1;
1183 0 : if (page->blockno != pageno)
1184 0 : return NULL;
1185 : Assert(!page->ischunk);
1186 0 : return page;
1187 : }
1188 :
1189 11594 : page = pagetable_lookup(tbm->pagetable, pageno);
1190 11594 : if (page == NULL)
1191 1952 : return NULL;
1192 9642 : if (page->ischunk)
1193 0 : return NULL; /* don't want a lossy chunk header */
1194 9642 : return page;
1195 : }
1196 :
1197 : /*
1198 : * tbm_get_pageentry - find or create a PagetableEntry for the pageno
1199 : *
1200 : * If new, the entry is marked as an exact (non-chunk) entry.
1201 : *
1202 : * This may cause the table to exceed the desired memory size. It is
1203 : * up to the caller to call tbm_lossify() at the next safe point if so.
1204 : */
1205 : static PagetableEntry *
1206 7314872 : tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
1207 : {
1208 : PagetableEntry *page;
1209 : bool found;
1210 :
1211 7314872 : if (tbm->status == TBM_EMPTY)
1212 : {
1213 : /* Use the fixed slot */
1214 13444 : page = &tbm->entry1;
1215 13444 : found = false;
1216 13444 : tbm->status = TBM_ONE_PAGE;
1217 : }
1218 : else
1219 : {
1220 7301428 : if (tbm->status == TBM_ONE_PAGE)
1221 : {
1222 95974 : page = &tbm->entry1;
1223 95974 : if (page->blockno == pageno)
1224 90218 : return page;
1225 : /* Time to switch from one page to a hashtable */
1226 5756 : tbm_create_pagetable(tbm);
1227 : }
1228 :
1229 : /* Look up or create an entry */
1230 7211210 : page = pagetable_insert(tbm->pagetable, pageno, &found);
1231 : }
1232 :
1233 : /* Initialize it if not present before */
1234 7224654 : if (!found)
1235 : {
1236 284492 : char oldstatus = page->status;
1237 :
1238 1991444 : MemSet(page, 0, sizeof(PagetableEntry));
1239 284492 : page->status = oldstatus;
1240 284492 : page->blockno = pageno;
1241 : /* must count it too */
1242 284492 : tbm->nentries++;
1243 284492 : tbm->npages++;
1244 : }
1245 :
1246 7224654 : return page;
1247 : }
1248 :
1249 : /*
1250 : * tbm_page_is_lossy - is the page marked as lossily stored?
1251 : */
1252 : static bool
1253 7335034 : tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
1254 : {
1255 : PagetableEntry *page;
1256 : BlockNumber chunk_pageno;
1257 : int bitno;
1258 :
1259 : /* we can skip the lookup if there are no lossy chunks */
1260 7335034 : if (tbm->nchunks == 0)
1261 7204768 : return false;
1262 : Assert(tbm->status == TBM_HASH);
1263 :
1264 130266 : bitno = pageno % PAGES_PER_CHUNK;
1265 130266 : chunk_pageno = pageno - bitno;
1266 :
1267 130266 : page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1268 :
1269 130266 : if (page != NULL && page->ischunk)
1270 : {
1271 19134 : int wordnum = WORDNUM(bitno);
1272 19134 : int bitnum = BITNUM(bitno);
1273 :
1274 19134 : if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1275 8568 : return true;
1276 : }
1277 121698 : return false;
1278 : }
1279 :
1280 : /*
1281 : * tbm_mark_page_lossy - mark the page number as lossily stored
1282 : *
1283 : * This may cause the table to exceed the desired memory size. It is
1284 : * up to the caller to call tbm_lossify() at the next safe point if so.
1285 : */
1286 : static void
1287 166448 : tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
1288 : {
1289 : PagetableEntry *page;
1290 : bool found;
1291 : BlockNumber chunk_pageno;
1292 : int bitno;
1293 : int wordnum;
1294 : int bitnum;
1295 :
1296 : /* We force the bitmap into hashtable mode whenever it's lossy */
1297 166448 : if (tbm->status != TBM_HASH)
1298 2868 : tbm_create_pagetable(tbm);
1299 :
1300 166448 : bitno = pageno % PAGES_PER_CHUNK;
1301 166448 : chunk_pageno = pageno - bitno;
1302 :
1303 : /*
1304 : * Remove any extant non-lossy entry for the page. If the page is its own
1305 : * chunk header, however, we skip this and handle the case below.
1306 : */
1307 166448 : if (bitno != 0)
1308 : {
1309 164120 : if (pagetable_delete(tbm->pagetable, pageno))
1310 : {
1311 : /* It was present, so adjust counts */
1312 18468 : tbm->nentries--;
1313 18468 : tbm->npages--; /* assume it must have been non-lossy */
1314 : }
1315 : }
1316 :
1317 : /* Look up or create entry for chunk-header page */
1318 166448 : page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
1319 :
1320 : /* Initialize it if not present before */
1321 166448 : if (!found)
1322 : {
1323 2868 : char oldstatus = page->status;
1324 :
1325 20076 : MemSet(page, 0, sizeof(PagetableEntry));
1326 2868 : page->status = oldstatus;
1327 2868 : page->blockno = chunk_pageno;
1328 2868 : page->ischunk = true;
1329 : /* must count it too */
1330 2868 : tbm->nentries++;
1331 2868 : tbm->nchunks++;
1332 : }
1333 163580 : else if (!page->ischunk)
1334 : {
1335 156 : char oldstatus = page->status;
1336 :
1337 : /* chunk header page was formerly non-lossy, make it lossy */
1338 1092 : MemSet(page, 0, sizeof(PagetableEntry));
1339 156 : page->status = oldstatus;
1340 156 : page->blockno = chunk_pageno;
1341 156 : page->ischunk = true;
1342 : /* we assume it had some tuple bit(s) set, so mark it lossy */
1343 156 : page->words[0] = ((bitmapword) 1 << 0);
1344 : /* adjust counts */
1345 156 : tbm->nchunks++;
1346 156 : tbm->npages--;
1347 : }
1348 :
1349 : /* Now set the original target page's bit */
1350 166448 : wordnum = WORDNUM(bitno);
1351 166448 : bitnum = BITNUM(bitno);
1352 166448 : page->words[wordnum] |= ((bitmapword) 1 << bitnum);
1353 166448 : }
1354 :
1355 : /*
1356 : * tbm_lossify - lose some information to get back under the memory limit
1357 : */
1358 : static void
1359 36 : tbm_lossify(TIDBitmap *tbm)
1360 : {
1361 : pagetable_iterator i;
1362 : PagetableEntry *page;
1363 :
1364 : /*
1365 : * XXX Really stupid implementation: this just lossifies pages in
1366 : * essentially random order. We should be paying some attention to the
1367 : * number of bits set in each page, instead.
1368 : *
1369 : * Since we are called as soon as nentries exceeds maxentries, we should
1370 : * push nentries down to significantly less than maxentries, or else we'll
1371 : * just end up doing this again very soon. We shoot for maxentries/2.
1372 : */
1373 : Assert(tbm->iterating == TBM_NOT_ITERATING);
1374 : Assert(tbm->status == TBM_HASH);
1375 :
1376 36 : pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
1377 18492 : while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
1378 : {
1379 18492 : if (page->ischunk)
1380 0 : continue; /* already a chunk header */
1381 :
1382 : /*
1383 : * If the page would become a chunk header, we won't save anything by
1384 : * converting it to lossy, so skip it.
1385 : */
1386 18492 : if ((page->blockno % PAGES_PER_CHUNK) == 0)
1387 24 : continue;
1388 :
1389 : /* This does the dirty work ... */
1390 18468 : tbm_mark_page_lossy(tbm, page->blockno);
1391 :
1392 18468 : if (tbm->nentries <= tbm->maxentries / 2)
1393 : {
1394 : /*
1395 : * We have made enough room. Remember where to start lossifying
1396 : * next round, so we evenly iterate over the hashtable.
1397 : */
1398 36 : tbm->lossify_start = i.cur;
1399 36 : break;
1400 : }
1401 :
1402 : /*
1403 : * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
1404 : * hashtable and may have deleted the non-lossy chunk. We can
1405 : * continue the same hash table scan, since failure to visit one
1406 : * element or visiting the newly inserted element, isn't fatal.
1407 : */
1408 : }
1409 :
1410 : /*
1411 : * With a big bitmap and small work_mem, it's possible that we cannot get
1412 : * under maxentries. Again, if that happens, we'd end up uselessly
1413 : * calling tbm_lossify over and over. To prevent this from becoming a
1414 : * performance sink, force maxentries up to at least double the current
1415 : * number of entries. (In essence, we're admitting inability to fit
1416 : * within work_mem when we do this.) Note that this test will not fire if
1417 : * we broke out of the loop early; and if we didn't, the current number of
1418 : * entries is simply not reducible any further.
1419 : */
1420 36 : if (tbm->nentries > tbm->maxentries / 2)
1421 0 : tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
1422 36 : }
1423 :
1424 : /*
1425 : * qsort comparator to handle PagetableEntry pointers.
1426 : */
1427 : static int
1428 1392574 : tbm_comparator(const void *left, const void *right)
1429 : {
1430 1392574 : BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1431 1392574 : BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1432 :
1433 1392574 : return pg_cmp_u32(l, r);
1434 : }
1435 :
1436 : /*
1437 : * As above, but this will get index into PagetableEntry array. Therefore,
1438 : * it needs to get actual PagetableEntry using the index before comparing the
1439 : * blockno.
1440 : */
1441 : static int
1442 238290 : tbm_shared_comparator(const void *left, const void *right, void *arg)
1443 : {
1444 238290 : PagetableEntry *base = (PagetableEntry *) arg;
1445 238290 : PagetableEntry *lpage = &base[*(int *) left];
1446 238290 : PagetableEntry *rpage = &base[*(int *) right];
1447 :
1448 238290 : if (lpage->blockno < rpage->blockno)
1449 108966 : return -1;
1450 129324 : else if (lpage->blockno > rpage->blockno)
1451 129324 : return 1;
1452 0 : return 0;
1453 : }
1454 :
1455 : /*
1456 : * tbm_attach_shared_iterate
1457 : *
1458 : * Allocate a backend-private iterator and attach the shared iterator state
1459 : * to it so that multiple processed can iterate jointly.
1460 : *
1461 : * We also converts the DSA pointers to local pointers and store them into
1462 : * our private iterator.
1463 : */
1464 : TBMSharedIterator *
1465 340 : tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
1466 : {
1467 : TBMSharedIterator *iterator;
1468 : TBMSharedIteratorState *istate;
1469 :
1470 : /*
1471 : * Create the TBMSharedIterator struct, with enough trailing space to
1472 : * serve the needs of the TBMIterateResult sub-struct.
1473 : */
1474 340 : iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator));
1475 :
1476 340 : istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
1477 :
1478 340 : iterator->state = istate;
1479 :
1480 340 : iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
1481 :
1482 340 : if (istate->npages)
1483 340 : iterator->ptpages = dsa_get_address(dsa, istate->spages);
1484 340 : if (istate->nchunks)
1485 30 : iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
1486 :
1487 340 : return iterator;
1488 : }
1489 :
1490 : /*
1491 : * pagetable_allocate
1492 : *
1493 : * Callback function for allocating the memory for hashtable elements.
1494 : * Allocate memory for hashtable elements, using DSA if available.
1495 : */
1496 : static inline void *
1497 9026 : pagetable_allocate(pagetable_hash *pagetable, Size size)
1498 : {
1499 9026 : TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1500 : PTEntryArray *ptbase;
1501 :
1502 9026 : if (tbm->dsa == NULL)
1503 8870 : return MemoryContextAllocExtended(pagetable->ctx, size,
1504 : MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
1505 :
1506 : /*
1507 : * Save the dsapagetable reference in dsapagetableold before allocating
1508 : * new memory so that pagetable_free can free the old entry.
1509 : */
1510 156 : tbm->dsapagetableold = tbm->dsapagetable;
1511 156 : tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
1512 : sizeof(PTEntryArray) + size,
1513 : DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
1514 156 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
1515 :
1516 156 : return ptbase->ptentry;
1517 : }
1518 :
1519 : /*
1520 : * pagetable_free
1521 : *
1522 : * Callback function for freeing hash table elements.
1523 : */
1524 : static inline void
1525 9022 : pagetable_free(pagetable_hash *pagetable, void *pointer)
1526 : {
1527 9022 : TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1528 :
1529 : /* pfree the input pointer if DSA is not available */
1530 9022 : if (tbm->dsa == NULL)
1531 8866 : pfree(pointer);
1532 156 : else if (DsaPointerIsValid(tbm->dsapagetableold))
1533 : {
1534 84 : dsa_free(tbm->dsa, tbm->dsapagetableold);
1535 84 : tbm->dsapagetableold = InvalidDsaPointer;
1536 : }
1537 9022 : }
1538 :
1539 : /*
1540 : * tbm_calculate_entries
1541 : *
1542 : * Estimate number of hashtable entries we can have within maxbytes.
1543 : */
1544 : int
1545 678308 : tbm_calculate_entries(Size maxbytes)
1546 : {
1547 : Size nbuckets;
1548 :
1549 : /*
1550 : * Estimate number of hashtable entries we can have within maxbytes. This
1551 : * estimates the hash cost as sizeof(PagetableEntry), which is good enough
1552 : * for our purpose. Also count an extra Pointer per entry for the arrays
1553 : * created during iteration readout.
1554 : */
1555 678308 : nbuckets = maxbytes /
1556 : (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
1557 678308 : nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
1558 678308 : nbuckets = Max(nbuckets, 16); /* sanity limit */
1559 :
1560 678308 : return (int) nbuckets;
1561 : }
1562 :
1563 : /*
1564 : * Create a shared or private bitmap iterator and start iteration.
1565 : *
1566 : * `tbm` is only used to create the private iterator and dsa and dsp are only
1567 : * used to create the shared iterator.
1568 : *
1569 : * Before invoking tbm_begin_iterate() to create a shared iterator, one
1570 : * process must already have invoked tbm_prepare_shared_iterate() to create
1571 : * and set up the TBMSharedIteratorState.
1572 : */
1573 : TBMIterator
1574 25592 : tbm_begin_iterate(TIDBitmap *tbm, dsa_area *dsa, dsa_pointer dsp)
1575 : {
1576 25592 : TBMIterator iterator = {0};
1577 :
1578 : /* Allocate a private iterator and attach the shared state to it */
1579 25592 : if (DsaPointerIsValid(dsp))
1580 : {
1581 340 : iterator.shared = true;
1582 340 : iterator.i.shared_iterator = tbm_attach_shared_iterate(dsa, dsp);
1583 : }
1584 : else
1585 : {
1586 25252 : iterator.shared = false;
1587 25252 : iterator.i.private_iterator = tbm_begin_private_iterate(tbm);
1588 : }
1589 :
1590 25592 : return iterator;
1591 : }
1592 :
1593 : /*
1594 : * Clean up shared or private bitmap iterator.
1595 : */
1596 : void
1597 25490 : tbm_end_iterate(TBMIterator *iterator)
1598 : {
1599 : Assert(iterator && !tbm_exhausted(iterator));
1600 :
1601 25490 : if (iterator->shared)
1602 340 : tbm_end_shared_iterate(iterator->i.shared_iterator);
1603 : else
1604 25150 : tbm_end_private_iterate(iterator->i.private_iterator);
1605 :
1606 25490 : *iterator = (TBMIterator)
1607 : {
1608 : 0
1609 : };
1610 25490 : }
1611 :
1612 : /*
1613 : * Populate the next TBMIterateResult using the shared or private bitmap
1614 : * iterator. Returns false when there is nothing more to scan.
1615 : */
1616 : bool
1617 422842 : tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres)
1618 : {
1619 : Assert(iterator);
1620 : Assert(tbmres);
1621 :
1622 422842 : if (iterator->shared)
1623 30448 : return tbm_shared_iterate(iterator->i.shared_iterator, tbmres);
1624 : else
1625 392394 : return tbm_private_iterate(iterator->i.private_iterator, tbmres);
1626 : }
|