Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tidbitmap.c
4 : * PostgreSQL tuple-id (TID) bitmap package
5 : *
6 : * This module provides bitmap data structures that are spiritually
7 : * similar to Bitmapsets, but are specially adapted to store sets of
8 : * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9 : * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10 : * Also, since we wish to be able to store very large tuple sets in
11 : * memory with this data structure, we support "lossy" storage, in which
12 : * we no longer remember individual tuple offsets on a page but only the
13 : * fact that a particular page needs to be visited.
14 : *
15 : * The "lossy" storage uses one bit per disk page, so at the standard 8K
16 : * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17 : * of memory. People pushing around tables of that size should have a
18 : * couple of Mb to spare, so we don't worry about providing a second level
19 : * of lossiness. In theory we could fall back to page ranges at some
20 : * point, but for now that seems useless complexity.
21 : *
22 : * We also support the notion of candidate matches, or rechecking. This
23 : * means we know that a search need visit only some tuples on a page,
24 : * but we are not certain that all of those tuples are real matches.
25 : * So the eventual heap scan must recheck the quals for these tuples only,
26 : * rather than rechecking the quals for all tuples on the page as in the
27 : * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28 : * into a bitmap, and it can also happen internally when we AND a lossy
29 : * and a non-lossy page.
30 : *
31 : *
32 : * Copyright (c) 2003-2023, PostgreSQL Global Development Group
33 : *
34 : * IDENTIFICATION
35 : * src/backend/nodes/tidbitmap.c
36 : *
37 : *-------------------------------------------------------------------------
38 : */
39 : #include "postgres.h"
40 :
41 : #include <limits.h>
42 :
43 : #include "access/htup_details.h"
44 : #include "common/hashfn.h"
45 : #include "nodes/bitmapset.h"
46 : #include "nodes/tidbitmap.h"
47 : #include "storage/lwlock.h"
48 : #include "utils/dsa.h"
49 :
50 : /*
51 : * The maximum number of tuples per page is not large (typically 256 with
52 : * 8K pages, or 1024 with 32K pages). So there's not much point in making
53 : * the per-page bitmaps variable size. We just legislate that the size
54 : * is this:
55 : */
56 : #define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage
57 :
58 : /*
59 : * When we have to switch over to lossy storage, we use a data structure
60 : * with one bit per page, where all pages having the same number DIV
61 : * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
62 : * and has the bit set for a given page, there must not be a per-page entry
63 : * for that page in the page table.
64 : *
65 : * We actually store both exact pages and lossy chunks in the same hash
66 : * table, using identical data structures. (This is because the memory
67 : * management for hashtables doesn't easily/efficiently allow space to be
68 : * transferred from one hashtable to another.) Therefore it's best
69 : * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
70 : * too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
71 : * avoid expensive integer remainder operations. So, define it like this:
72 : */
73 : #define PAGES_PER_CHUNK (BLCKSZ / 32)
74 :
75 : /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
76 :
77 : #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
78 : #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
79 :
80 : /* number of active words for an exact page: */
81 : #define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
82 : /* number of active words for a lossy chunk: */
83 : #define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
84 :
85 : /*
86 : * The hashtable entries are represented by this data structure. For
87 : * an exact page, blockno is the page number and bit k of the bitmap
88 : * represents tuple offset k+1. For a lossy chunk, blockno is the first
89 : * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
90 : * bit k represents page blockno+k. Note that it is not possible to
91 : * have exact storage for the first page of a chunk if we are using
92 : * lossy storage for any page in the chunk's range, since the same
93 : * hashtable entry has to serve both purposes.
94 : *
95 : * recheck is used only on exact pages --- it indicates that although
96 : * only the stated tuples need be checked, the full index qual condition
97 : * must be checked for each (ie, these are candidate matches).
 : *
 : * status is the hash-entry state byte owned by the simplehash.h
 : * instantiation below (SH_ELEMENT_TYPE PagetableEntry); it must be
 : * preserved whenever entry contents are copied into a hashtable slot
 : * (see tbm_create_pagetable).
98 : */
99 : typedef struct PagetableEntry
100 : {
101 : BlockNumber blockno; /* page number (hashtable key) */
102 : char status; /* hash entry status */
103 : bool ischunk; /* T = lossy storage, F = exact */
104 : bool recheck; /* should the tuples be rechecked? */
105 : bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
106 : } PagetableEntry;
107 :
108 : /*
109 : * Holds array of pagetable entries.
 : *
 : * Allocated in DSA memory when the bitmap is shared for parallel
 : * iteration; refcount counts the attached shared iterators so the
 : * array is freed only when the last one detaches.
110 : */
111 : typedef struct PTEntryArray
112 : {
113 : pg_atomic_uint32 refcount; /* number of iterators attached */
114 : PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
115 : } PTEntryArray;
116 :
117 : /*
118 : * We want to avoid the overhead of creating the hashtable, which is
119 : * comparatively large, when not necessary. Particularly when we are using a
120 : * bitmap scan on the inside of a nestloop join: a bitmap may well live only
121 : * long enough to accumulate one entry in such cases. We therefore avoid
122 : * creating an actual hashtable until we need two pagetable entries. When
123 : * just one pagetable entry is needed, we store it in a fixed field of
124 : * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
125 : * shrinks down to zero or one page again. So, status can be TBM_HASH even
126 : * when nentries is zero or one.)
 : *
 : * (However, tbm_intersect can return a TBM_ONE_PAGE bitmap to TBM_EMPTY
 : * when its single page becomes empty.)
127 : */
128 : typedef enum
129 : {
130 : TBM_EMPTY, /* no hashtable, nentries == 0 */
131 : TBM_ONE_PAGE, /* entry1 contains the single entry */
132 : TBM_HASH, /* pagetable is valid, entry1 is not */
133 : } TBMStatus;
134 :
135 : /*
136 : * Current iterating state of the TBM.
 : *
 : * Once any iteration has begun, the bitmap must not be modified further
 : * (tbm_add_tuples asserts iterating == TBM_NOT_ITERATING).
137 : */
138 : typedef enum
139 : {
140 : TBM_NOT_ITERATING, /* not yet converted to page and chunk array */
141 : TBM_ITERATING_PRIVATE, /* converted to local page and chunk array */
142 : TBM_ITERATING_SHARED, /* converted to shared page and chunk array */
143 : } TBMIteratingState;
144 :
145 : /*
146 : * Here is the representation for a whole TIDBitMap:
147 : */
148 : struct TIDBitmap
149 : {
150 : NodeTag type; /* to make it a valid Node */
151 : MemoryContext mcxt; /* memory context containing me */
152 : TBMStatus status; /* see codes above */
153 : struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */
154 : int nentries; /* number of entries in pagetable */
155 : int maxentries; /* limit on same to meet maxbytes */
156 : int npages; /* number of exact entries in pagetable */
157 : int nchunks; /* number of lossy entries in pagetable */
158 : TBMIteratingState iterating; /* tbm_begin_iterate called? */
159 : uint32 lossify_start; /* offset to start lossifying hashtable at */
160 : PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
161 : /* these are valid when iterating != TBM_NOT_ITERATING: */
162 : PagetableEntry **spages; /* sorted exact-page list, or NULL */
163 : PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
164 : dsa_pointer dsapagetable; /* dsa_pointer to the element array */
165 : dsa_pointer dsapagetableold; /* dsa_pointer to the old element array */
166 : dsa_pointer ptpages; /* dsa_pointer to the page array */
167 : dsa_pointer ptchunks; /* dsa_pointer to the chunk array */
168 : dsa_area *dsa; /* reference to per-query dsa area */
169 : };
170 :
171 : /*
172 : * When iterating over a bitmap in sorted order, a TBMIterator is used to
173 : * track our progress. There can be several iterators scanning the same
174 : * bitmap concurrently. Note that the bitmap becomes read-only as soon as
175 : * any iterator is created.
 : *
 : * Created by tbm_begin_iterate, with enough trailing space after the
 : * struct to hold the variable-size TBMIterateResult offsets array.
176 : */
177 : struct TBMIterator
178 : {
179 : TIDBitmap *tbm; /* TIDBitmap we're iterating over */
180 : int spageptr; /* next spages index */
181 : int schunkptr; /* next schunks index */
182 : int schunkbit; /* next bit to check in current schunk */
183 : TBMIterateResult output; /* MUST BE LAST (because variable-size) */
184 : };
185 :
186 : /*
187 : * Holds the shared members of the iterator so that multiple processes
188 : * can jointly iterate.
 : *
 : * Lives in DSA memory; allocated by tbm_prepare_shared_iterate and freed
 : * by tbm_free_shared_area.
189 : */
190 : typedef struct TBMSharedIteratorState
191 : {
192 : int nentries; /* number of entries in pagetable */
193 : int maxentries; /* limit on same to meet maxbytes */
194 : int npages; /* number of exact entries in pagetable */
195 : int nchunks; /* number of lossy entries in pagetable */
196 : dsa_pointer pagetable; /* dsa pointers to head of pagetable data */
197 : dsa_pointer spages; /* dsa pointer to page array */
198 : dsa_pointer schunks; /* dsa pointer to chunk array */
199 : LWLock lock; /* lock to protect below members */
200 : int spageptr; /* next spages index */
201 : int schunkptr; /* next schunks index */
202 : int schunkbit; /* next bit to check in current schunk */
203 : } TBMSharedIteratorState;
204 :
205 : /*
206 : * pagetable iteration array.
 : *
 : * Each element of index[] is an offset into the shared PTEntryArray's
 : * ptentry[] array, sorted for ordered delivery of pages/chunks.
207 : */
208 : typedef struct PTIterationArray
209 : {
210 : pg_atomic_uint32 refcount; /* number of iterators attached */
211 : int index[FLEXIBLE_ARRAY_MEMBER]; /* indexes into PTEntryArray.ptentry[] */
212 : } PTIterationArray;
213 :
214 : /*
215 : * Same as TBMIterator, but used for joint (parallel) iteration; therefore
216 : * this also holds a reference to the shared state.
217 : */
218 : struct TBMSharedIterator
219 : {
220 : TBMSharedIteratorState *state; /* shared state */
221 : PTEntryArray *ptbase; /* pagetable element array */
222 : PTIterationArray *ptpages; /* sorted exact page index list */
223 : PTIterationArray *ptchunks; /* sorted lossy page index list */
224 : TBMIterateResult output; /* MUST BE LAST (because variable-size) */
225 : };
226 :
227 : /* Local function prototypes */
228 : static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
229 : static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
230 : const TIDBitmap *b);
231 : static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
232 : BlockNumber pageno);
233 : static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
234 : static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
235 : static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
236 : static void tbm_lossify(TIDBitmap *tbm);
237 : static int tbm_comparator(const void *left, const void *right);
238 : static int tbm_shared_comparator(const void *left, const void *right,
239 : void *arg);
240 :
241 : /* define hashtable mapping block numbers to PagetableEntry's */
242 : #define SH_USE_NONDEFAULT_ALLOCATOR
243 : #define SH_PREFIX pagetable
244 : #define SH_ELEMENT_TYPE PagetableEntry
245 : #define SH_KEY_TYPE BlockNumber
246 : #define SH_KEY blockno
247 : #define SH_HASH_KEY(tb, key) murmurhash32(key)
248 : #define SH_EQUAL(tb, a, b) a == b
249 : #define SH_SCOPE static inline
250 : #define SH_DEFINE
251 : #define SH_DECLARE
252 : #include "lib/simplehash.h"
253 :
254 :
255 : /*
256 : * tbm_create - create an initially-empty bitmap
257 : *
258 : * The bitmap will live in the memory context that is CurrentMemoryContext
259 : * at the time of this call. It will be limited to (approximately) maxbytes
260 : * total memory consumption. If the DSA passed to this function is not NULL
261 : * then the memory for storing elements of the underlying page table will
262 : * be allocated from the DSA.
263 : */
264 : TIDBitmap *
265 16720 : tbm_create(long maxbytes, dsa_area *dsa)
266 : {
267 : TIDBitmap *tbm;
268 :
269 : /* Create the TIDBitmap struct and zero all its fields */
270 16720 : tbm = makeNode(TIDBitmap);
271 :
272 16720 : tbm->mcxt = CurrentMemoryContext;
273 16720 : tbm->status = TBM_EMPTY;
274 :
 : /* Translate the byte budget into a cap on pagetable entries */
275 16720 : tbm->maxentries = (int) tbm_calculate_entries(maxbytes);
276 16720 : tbm->lossify_start = 0;
277 16720 : tbm->dsa = dsa;
278 16720 : tbm->dsapagetable = InvalidDsaPointer;
279 16720 : tbm->dsapagetableold = InvalidDsaPointer;
280 16720 : tbm->ptpages = InvalidDsaPointer;
281 16720 : tbm->ptchunks = InvalidDsaPointer;
282 :
283 16720 : return tbm;
284 : }
285 :
286 : /*
287 : * Actually create the hashtable. Since this is a moderately expensive
288 : * proposition, we don't do it until we have to.
289 : */
290 : static void
291 7918 : tbm_create_pagetable(TIDBitmap *tbm)
292 : {
293 : Assert(tbm->status != TBM_HASH);
294 : Assert(tbm->pagetable == NULL);
295 :
296 7918 : tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
297 :
298 : /* If entry1 is valid, push it into the hashtable */
299 7918 : if (tbm->status == TBM_ONE_PAGE)
300 : {
301 : PagetableEntry *page;
302 : bool found;
303 : char oldstatus;
304 :
305 5050 : page = pagetable_insert(tbm->pagetable,
306 : tbm->entry1.blockno,
307 : &found);
308 : Assert(!found);
 : /*
 : * The status byte belongs to simplehash; preserve the value it set
 : * for the new slot while copying the rest of entry1 over it.
 : */
309 5050 : oldstatus = page->status;
310 5050 : memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
311 5050 : page->status = oldstatus;
312 : }
313 :
314 7918 : tbm->status = TBM_HASH;
315 7918 : }
316 :
317 : /*
318 : * tbm_free - free a TIDBitmap
 : *
 : * Frees only backend-local storage; shared (DSA) iteration state is
 : * released separately via tbm_free_shared_area.
319 : */
320 : void
321 16654 : tbm_free(TIDBitmap *tbm)
322 : {
323 16654 : if (tbm->pagetable)
324 7914 : pagetable_destroy(tbm->pagetable);
325 16654 : if (tbm->spages)
326 4900 : pfree(tbm->spages);
327 16654 : if (tbm->schunks)
328 2880 : pfree(tbm->schunks);
329 16654 : pfree(tbm);
330 16654 : }
331 :
332 : /*
333 : * tbm_free_shared_area - free shared state
334 : *
335 : * Free the shared iterator state. Also free the shared pagetable and
336 : * iterator arrays if they are no longer referenced by any shared
337 : * iterator, i.e. their refcount drops to 0.
338 : */
339 : void
340 108 : tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
341 : {
342 108 : TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
343 : PTEntryArray *ptbase;
344 : PTIterationArray *ptpages;
345 : PTIterationArray *ptchunks;
346 :
347 108 : if (DsaPointerIsValid(istate->pagetable))
348 : {
349 108 : ptbase = dsa_get_address(dsa, istate->pagetable);
350 108 : if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
351 54 : dsa_free(dsa, istate->pagetable);
352 : }
353 108 : if (DsaPointerIsValid(istate->spages))
354 : {
355 108 : ptpages = dsa_get_address(dsa, istate->spages);
356 108 : if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
357 54 : dsa_free(dsa, istate->spages);
358 : }
359 108 : if (DsaPointerIsValid(istate->schunks))
360 : {
361 0 : ptchunks = dsa_get_address(dsa, istate->schunks);
362 0 : if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
363 0 : dsa_free(dsa, istate->schunks);
364 : }
365 :
366 108 : dsa_free(dsa, dp);
367 108 : }
368 :
369 : /*
370 : * tbm_add_tuples - add some tuple IDs to a TIDBitmap
371 : *
372 : * If recheck is true, then the recheck flag will be set in the
373 : * TBMIterateResult when any of these tuples are reported out.
 : *
 : * Must not be called once iteration over the bitmap has begun.
374 : */
375 : void
376 6313118 : tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
377 : bool recheck)
378 : {
379 6313118 : BlockNumber currblk = InvalidBlockNumber;
380 6313118 : PagetableEntry *page = NULL; /* only valid when currblk is valid */
381 : int i;
382 :
383 : Assert(tbm->iterating == TBM_NOT_ITERATING);
384 14807670 : for (i = 0; i < ntids; i++)
385 : {
386 8494552 : BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
387 8494552 : OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
388 : int wordnum,
389 : bitnum;
390 :
391 : /* safety check to ensure we don't overrun bit array bounds */
392 8494552 : if (off < 1 || off > MAX_TUPLES_PER_PAGE)
393 0 : elog(ERROR, "tuple offset out of range: %u", off);
394 :
395 : /*
396 : * Look up target page unless we already did. This saves cycles when
397 : * the input includes consecutive tuples on the same page, which is
398 : * common enough to justify an extra test here.
399 : */
400 8494552 : if (blk != currblk)
401 : {
402 7089824 : if (tbm_page_is_lossy(tbm, blk))
403 2856 : page = NULL; /* remember page is lossy */
404 : else
405 7086968 : page = tbm_get_pageentry(tbm, blk);
406 7089824 : currblk = blk;
407 : }
408 :
409 8494552 : if (page == NULL)
410 2856 : continue; /* whole page is already marked */
411 :
412 8491696 : if (page->ischunk)
413 : {
414 : /* The page is a lossy chunk header, set bit for itself */
415 0 : wordnum = bitnum = 0;
416 : }
417 : else
418 : {
419 : /* Page is exact, so set bit for individual tuple */
420 8491696 : wordnum = WORDNUM(off - 1);
421 8491696 : bitnum = BITNUM(off - 1);
422 : }
423 8491696 : page->words[wordnum] |= ((bitmapword) 1 << bitnum);
424 8491696 : page->recheck |= recheck;
425 :
426 8491696 : if (tbm->nentries > tbm->maxentries)
427 : {
428 24 : tbm_lossify(tbm);
429 : /* Page could have been converted to lossy, so force new lookup */
430 24 : currblk = InvalidBlockNumber;
431 : }
432 : }
433 6313118 : }
434 :
435 : /*
436 : * tbm_add_page - add a whole page to a TIDBitmap
437 : *
438 : * This causes the whole page to be reported (with the recheck flag)
439 : * when the TIDBitmap is scanned.
 : *
 : * NOTE(review): presumably the page is stored as a lossy chunk bit rather
 : * than an exact entry — confirm in tbm_mark_page_lossy.
440 : */
441 : void
442 147980 : tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
443 : {
444 : /* Enter the page in the bitmap, or mark it lossy if already present */
445 147980 : tbm_mark_page_lossy(tbm, pageno);
446 : /* If we went over the memory limit, lossify some more pages */
447 147980 : if (tbm->nentries > tbm->maxentries)
448 0 : tbm_lossify(tbm);
449 147980 : }
450 :
451 : /*
452 : * tbm_union - set union
453 : *
454 : * a is modified in-place, b is not changed
455 : */
456 : void
457 0 : tbm_union(TIDBitmap *a, const TIDBitmap *b)
458 : {
 : /* Relies on TBM_NOT_ITERATING being the zero-valued enum member */
459 : Assert(!a->iterating);
460 : /* Nothing to do if b is empty */
461 0 : if (b->nentries == 0)
462 0 : return;
463 : /* Scan through chunks and pages in b, merge into a */
464 0 : if (b->status == TBM_ONE_PAGE)
465 0 : tbm_union_page(a, &b->entry1);
466 : else
467 : {
468 : pagetable_iterator i;
469 : PagetableEntry *bpage;
470 :
471 : Assert(b->status == TBM_HASH);
472 0 : pagetable_start_iterate(b->pagetable, &i);
473 0 : while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
474 0 : tbm_union_page(a, bpage);
475 : }
476 : }
477 :
478 : /* Process one page of b during a union op */
479 : static void
480 0 : tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
481 : {
482 : PagetableEntry *apage;
483 : int wordnum;
484 :
485 0 : if (bpage->ischunk)
486 : {
487 : /* Scan b's chunk, mark each indicated page lossy in a */
488 0 : for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
489 : {
490 0 : bitmapword w = bpage->words[wordnum];
491 :
492 0 : if (w != 0)
493 : {
494 : BlockNumber pg;
495 :
496 0 : pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
497 0 : while (w != 0)
498 : {
499 0 : if (w & 1)
500 0 : tbm_mark_page_lossy(a, pg);
501 0 : pg++;
502 0 : w >>= 1;
503 : }
504 : }
505 : }
506 : }
507 0 : else if (tbm_page_is_lossy(a, bpage->blockno))
508 : {
509 : /* page is already lossy in a, nothing to do */
510 0 : return;
511 : }
512 : else
513 : {
514 0 : apage = tbm_get_pageentry(a, bpage->blockno);
515 0 : if (apage->ischunk)
516 : {
517 : /* The page is a lossy chunk header, set bit for itself */
518 0 : apage->words[0] |= ((bitmapword) 1 << 0);
519 : }
520 : else
521 : {
522 : /* Both pages are exact, merge at the bit level */
523 0 : for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
524 0 : apage->words[wordnum] |= bpage->words[wordnum];
525 0 : apage->recheck |= bpage->recheck;
526 : }
527 : }
528 :
529 : /* If we went over the memory limit, lossify some more pages */
530 0 : if (a->nentries > a->maxentries)
531 0 : tbm_lossify(a);
532 : }
532 :
533 : /*
534 : * tbm_intersect - set intersection
535 : *
536 : * a is modified in-place, b is not changed
537 : */
538 : void
539 68 : tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
540 : {
541 : Assert(!a->iterating);
542 : /* Nothing to do if a is empty */
543 68 : if (a->nentries == 0)
544 0 : return;
545 : /* Scan through chunks and pages in a, try to match to b */
546 68 : if (a->status == TBM_ONE_PAGE)
547 : {
548 40 : if (tbm_intersect_page(a, &a->entry1, b))
549 : {
550 : /* Page is now empty, remove it from a */
551 : Assert(!a->entry1.ischunk);
552 0 : a->npages--;
553 0 : a->nentries--;
554 : Assert(a->nentries == 0);
555 0 : a->status = TBM_EMPTY;
556 : }
557 : }
558 : else
559 : {
560 : pagetable_iterator i;
561 : PagetableEntry *apage;
562 :
563 : Assert(a->status == TBM_HASH);
 : /*
 : * NOTE(review): this deletes the current element while iterating the
 : * hashtable — relies on simplehash permitting that; confirm against
 : * lib/simplehash.h.
 : */
564 28 : pagetable_start_iterate(a->pagetable, &i);
565 4814 : while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
566 : {
567 4786 : if (tbm_intersect_page(a, apage, b))
568 : {
569 : /* Page or chunk is now empty, remove it from a */
570 4608 : if (apage->ischunk)
571 0 : a->nchunks--;
572 : else
573 4608 : a->npages--;
574 4608 : a->nentries--;
575 4608 : if (!pagetable_delete(a->pagetable, apage->blockno))
576 0 : elog(ERROR, "hash table corrupted");
577 : }
578 : }
579 : }
580 : }
581 :
582 : /*
583 : * Process one page of a during an intersection op
584 : *
585 : * Returns true if apage is now empty and should be deleted from a.
 : * The caller (tbm_intersect) performs the actual removal and the
 : * associated entry/page/chunk accounting.
586 : */
587 : static bool
588 4826 : tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
589 : {
590 : const PagetableEntry *bpage;
591 : int wordnum;
592 :
593 4826 : if (apage->ischunk)
594 : {
595 : /* Scan each bit in chunk, try to clear */
596 30 : bool candelete = true;
597 :
598 150 : for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
599 : {
600 120 : bitmapword w = apage->words[wordnum];
601 :
602 120 : if (w != 0)
603 : {
604 108 : bitmapword neww = w;
605 : BlockNumber pg;
606 : int bitnum;
607 :
608 108 : pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
609 108 : bitnum = 0;
610 6606 : while (w != 0)
611 : {
612 6498 : if (w & 1)
613 : {
614 3360 : if (!tbm_page_is_lossy(b, pg) &&
615 252 : tbm_find_pageentry(b, pg) == NULL)
616 : {
617 : /* Page is not in b at all, lose lossy bit */
618 0 : neww &= ~((bitmapword) 1 << bitnum);
619 : }
620 : }
621 6498 : pg++;
622 6498 : bitnum++;
623 6498 : w >>= 1;
624 : }
625 108 : apage->words[wordnum] = neww;
626 108 : if (neww != 0)
627 108 : candelete = false;
628 : }
629 : }
630 30 : return candelete;
631 : }
632 4796 : else if (tbm_page_is_lossy(b, apage->blockno))
633 : {
634 : /*
635 : * Some of the tuples in 'a' might not satisfy the quals for 'b', but
636 : * because the page 'b' is lossy, we don't know which ones. Therefore
637 : * we mark 'a' as requiring rechecks, to indicate that at most those
638 : * tuples set in 'a' are matches.
639 : */
640 0 : apage->recheck = true;
641 0 : return false;
642 : }
643 : else
644 : {
645 4796 : bool candelete = true;
646 :
647 4796 : bpage = tbm_find_pageentry(b, apage->blockno);
648 4796 : if (bpage != NULL)
649 : {
650 : /* Both pages are exact, merge at the bit level */
651 : Assert(!bpage->ischunk);
652 25704 : for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
653 : {
654 21420 : apage->words[wordnum] &= bpage->words[wordnum];
655 21420 : if (apage->words[wordnum] != 0)
656 190 : candelete = false;
657 : }
658 4284 : apage->recheck |= bpage->recheck;
659 : }
660 : /* If there is no matching b page, we can just delete the a page */
661 4796 : return candelete;
662 : }
663 : }
664 :
665 : /*
666 : * tbm_is_empty - is a TIDBitmap completely empty?
667 : */
668 : bool
669 670 : tbm_is_empty(const TIDBitmap *tbm)
670 : {
 : /* nentries counts both exact-page and lossy-chunk pagetable entries */
671 670 : return (tbm->nentries == 0);
672 : }
673 :
674 : /*
675 : * tbm_begin_iterate - prepare to iterate through a TIDBitmap
676 : *
677 : * The TBMIterator struct is created in the caller's memory context.
678 : * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
679 : * okay to just allow the memory context to be released, too. It is caller's
680 : * responsibility not to touch the TBMIterator anymore once the TIDBitmap
681 : * is freed.
682 : *
683 : * NB: after this is called, it is no longer allowed to modify the contents
684 : * of the bitmap. However, you can call this multiple times to scan the
685 : * contents repeatedly, including parallel scans.
686 : */
687 : TBMIterator *
688 32512 : tbm_begin_iterate(TIDBitmap *tbm)
689 : {
690 : TBMIterator *iterator;
691 :
692 : Assert(tbm->iterating != TBM_ITERATING_SHARED);
693 :
694 : /*
695 : * Create the TBMIterator struct, with enough trailing space to serve the
696 : * needs of the TBMIterateResult sub-struct.
697 : */
698 32512 : iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
699 : MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
700 32512 : iterator->tbm = tbm;
701 :
702 : /*
703 : * Initialize iteration pointers.
704 : */
705 32512 : iterator->spageptr = 0;
706 32512 : iterator->schunkptr = 0;
707 32512 : iterator->schunkbit = 0;
708 :
709 : /*
710 : * If we have a hashtable, create and fill the sorted page lists, unless
711 : * we already did that for a previous iterator. Note that the lists are
712 : * attached to the bitmap not the iterator, so they can be used by more
713 : * than one iterator.
714 : */
715 32512 : if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
716 : {
717 : pagetable_iterator i;
718 : PagetableEntry *page;
719 : int npages;
720 : int nchunks;
721 :
722 7778 : if (!tbm->spages && tbm->npages > 0)
723 4904 : tbm->spages = (PagetableEntry **)
724 4904 : MemoryContextAlloc(tbm->mcxt,
725 4904 : tbm->npages * sizeof(PagetableEntry *));
726 7778 : if (!tbm->schunks && tbm->nchunks > 0)
727 2880 : tbm->schunks = (PagetableEntry **)
728 2880 : MemoryContextAlloc(tbm->mcxt,
729 2880 : tbm->nchunks * sizeof(PagetableEntry *));
730 :
731 7778 : npages = nchunks = 0;
732 7778 : pagetable_start_iterate(tbm->pagetable, &i);
733 210916 : while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
734 : {
735 203138 : if (page->ischunk)
736 2922 : tbm->schunks[nchunks++] = page;
737 : else
738 200216 : tbm->spages[npages++] = page;
739 : }
740 : Assert(npages == tbm->npages);
741 : Assert(nchunks == tbm->nchunks);
 : /* Sort the lists so pages are delivered in block-number order */
742 7778 : if (npages > 1)
743 4900 : qsort(tbm->spages, npages, sizeof(PagetableEntry *),
744 : tbm_comparator);
745 7778 : if (nchunks > 1)
746 12 : qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
747 : tbm_comparator);
748 : }
749 :
750 32512 : tbm->iterating = TBM_ITERATING_PRIVATE;
751 :
752 32512 : return iterator;
753 : }
754 :
755 : /*
756 : * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
757 : *
758 : * The necessary shared state will be allocated from the DSA passed to
759 : * tbm_create, so that multiple processes can attach to it and iterate jointly.
760 : *
761 : * This will convert the pagetable hash into page and chunk array of the index
762 : * into pagetable array.
763 : */
764 : dsa_pointer
765 144 : tbm_prepare_shared_iterate(TIDBitmap *tbm)
766 : {
767 : dsa_pointer dp;
768 : TBMSharedIteratorState *istate;
769 144 : PTEntryArray *ptbase = NULL;
770 144 : PTIterationArray *ptpages = NULL;
771 144 : PTIterationArray *ptchunks = NULL;
772 :
773 : Assert(tbm->dsa != NULL);
774 : Assert(tbm->iterating != TBM_ITERATING_PRIVATE);
775 :
776 : /*
777 : * Allocate TBMSharedIteratorState from DSA to hold the shared members and
778 : * lock; this will also be used by multiple workers for shared iteration.
779 : */
780 144 : dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
781 144 : istate = dsa_get_address(tbm->dsa, dp);
782 :
783 : /*
784 : * If we're not already iterating, create and fill the sorted page lists.
785 : * (If we are, the sorted page lists are already stored in the TIDBitmap,
786 : * and we can just reuse them.)
787 : */
788 144 : if (tbm->iterating == TBM_NOT_ITERATING)
789 : {
790 : pagetable_iterator i;
791 : PagetableEntry *page;
792 : int idx;
793 : int npages;
794 : int nchunks;
795 :
796 : /*
797 : * Allocate the page and chunk array memory from the DSA to share
798 : * across multiple processes.
799 : */
800 72 : if (tbm->npages)
801 : {
802 72 : tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
803 : tbm->npages * sizeof(int));
804 72 : ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
805 72 : pg_atomic_init_u32(&ptpages->refcount, 0);
806 : }
807 72 : if (tbm->nchunks)
808 : {
809 6 : tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
810 : tbm->nchunks * sizeof(int));
811 6 : ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
812 6 : pg_atomic_init_u32(&ptchunks->refcount, 0);
813 : }
814 :
815 : /*
816 : * If TBM status is TBM_HASH then iterate over the pagetable and
817 : * convert it to page and chunk arrays. But if it's in the
818 : * TBM_ONE_PAGE mode then directly allocate the space for one entry
819 : * from the DSA.
820 : */
821 72 : npages = nchunks = 0;
822 72 : if (tbm->status == TBM_HASH)
823 : {
824 72 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
825 :
826 72 : pagetable_start_iterate(tbm->pagetable, &i);
827 27102 : while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
828 : {
 : /* store entry's offset within the DSA element array */
829 27030 : idx = page - ptbase->ptentry;
830 27030 : if (page->ischunk)
831 24 : ptchunks->index[nchunks++] = idx;
832 : else
833 27006 : ptpages->index[npages++] = idx;
834 : }
835 :
836 : Assert(npages == tbm->npages);
837 : Assert(nchunks == tbm->nchunks);
838 : }
839 0 : else if (tbm->status == TBM_ONE_PAGE)
840 : {
841 : /*
842 : * In one page mode allocate the space for one pagetable entry,
843 : * initialize it, and directly store its index (i.e. 0) in the
844 : * page array.
845 : */
846 0 : tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
847 : sizeof(PagetableEntry));
848 0 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
849 0 : memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
850 0 : ptpages->index[0] = 0;
851 : }
852 :
853 72 : if (ptbase != NULL)
854 72 : pg_atomic_init_u32(&ptbase->refcount, 0);
855 72 : if (npages > 1)
856 72 : qsort_arg(ptpages->index, npages, sizeof(int),
857 72 : tbm_shared_comparator, ptbase->ptentry);
858 72 : if (nchunks > 1)
859 6 : qsort_arg(ptchunks->index, nchunks, sizeof(int),
860 6 : tbm_shared_comparator, ptbase->ptentry);
861 : }
862 :
863 : /*
864 : * Store the TBM members in the shared state so that we can share them
865 : * across multiple processes.
866 : */
867 144 : istate->nentries = tbm->nentries;
868 144 : istate->maxentries = tbm->maxentries;
869 144 : istate->npages = tbm->npages;
870 144 : istate->nchunks = tbm->nchunks;
871 144 : istate->pagetable = tbm->dsapagetable;
872 144 : istate->spages = tbm->ptpages;
873 144 : istate->schunks = tbm->ptchunks;
874 :
875 144 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
876 144 : ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
877 144 : ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
878 :
879 : /*
880 : * For every shared iterator, referring to pagetable and iterator array,
881 : * increase the refcount by 1 so that while freeing the shared iterator we
882 : * don't free pagetable and iterator array until its refcount becomes 0.
883 : */
884 144 : if (ptbase != NULL)
885 144 : pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
886 144 : if (ptpages != NULL)
887 144 : pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
888 144 : if (ptchunks != NULL)
889 12 : pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
890 :
891 : /* Initialize the iterator lock */
892 144 : LWLockInitialize(&istate->lock, LWTRANCHE_SHARED_TIDBITMAP);
893 :
894 : /* Initialize the shared iterator state */
895 144 : istate->schunkbit = 0;
896 144 : istate->schunkptr = 0;
897 144 : istate->spageptr = 0;
898 :
899 144 : tbm->iterating = TBM_ITERATING_SHARED;
900 :
901 144 : return dp;
902 : }
903 :
904 : /*
905 : * tbm_extract_page_tuple - extract the tuple offsets from a page
906 : *
907 : * The extracted offsets are stored into TBMIterateResult.
 : * Returns the number of offsets stored.
908 : */
909 : static inline int
910 459174 : tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
911 : {
912 : int wordnum;
913 459174 : int ntuples = 0;
914 :
915 2755044 : for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
916 : {
917 2295870 : bitmapword w = page->words[wordnum];
918 :
919 2295870 : if (w != 0)
920 : {
 : /* bit k of the page bitmap represents tuple offset k+1 */
921 1090270 : int off = wordnum * BITS_PER_BITMAPWORD + 1;
922 :
923 47395218 : while (w != 0)
924 : {
925 46304948 : if (w & 1)
926 11169200 : output->offsets[ntuples++] = (OffsetNumber) off;
927 46304948 : off++;
928 46304948 : w >>= 1;
929 : }
930 : }
931 : }
932 :
933 459174 : return ntuples;
934 : }
935 :
936 : /*
937 : * tbm_advance_schunkbit - Advance the schunkbit
938 : */
939 : static inline void
940 332680 : tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
941 : {
942 332680 : int schunkbit = *schunkbitp;
943 :
944 1526448 : while (schunkbit < PAGES_PER_CHUNK)
945 : {
946 1520556 : int wordnum = WORDNUM(schunkbit);
947 1520556 : int bitnum = BITNUM(schunkbit);
948 :
949 1520556 : if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
950 326788 : break;
951 1193768 : schunkbit++;
952 : }
953 :
954 332680 : *schunkbitp = schunkbit;
955 332680 : }
956 :
/*
 * tbm_iterate - scan through next page of a TIDBitmap
 *
 * Returns a TBMIterateResult representing one page, or NULL if there are
 * no more pages to scan.  Pages are guaranteed to be delivered in numerical
 * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
 * remember the exact tuples to look at on this page --- the caller must
 * examine all tuples on the page and check if they meet the intended
 * condition.  If result->recheck is true, only the indicated tuples need
 * be examined, but the condition must be rechecked anyway.  (For ease of
 * testing, recheck is always set true when ntuples < 0.)
 *
 * Iteration merges two already-sorted streams: the exact per-page entries
 * in tbm->spages[] and the lossy chunks in tbm->schunks[], always emitting
 * the numerically smaller block number first.
 */
TBMIterateResult *
tbm_iterate(TBMIterator *iterator)
{
	TIDBitmap *tbm = iterator->tbm;
	TBMIterateResult *output = &(iterator->output);

	Assert(tbm->iterating == TBM_ITERATING_PRIVATE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		int schunkbit = iterator->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			/* found a set bit in this chunk; remember it and stop */
			iterator->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		iterator->schunkptr++;
		iterator->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		BlockNumber chunk_blockno;

		/* block number = chunk's base block plus offset of the set bit */
		chunk_blockno = chunk->blockno + iterator->schunkbit;
		if (iterator->spageptr >= tbm->npages ||
			chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			iterator->schunkbit++;
			return output;
		}
	}

	if (iterator->spageptr < tbm->npages)
	{
		PagetableEntry *page;
		int ntuples;

		/* In TBM_ONE_PAGE state, we don't allocate an spages[] array */
		if (tbm->status == TBM_ONE_PAGE)
			page = &tbm->entry1;
		else
			page = tbm->spages[iterator->spageptr];

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		iterator->spageptr++;
		return output;
	}

	/* Nothing more in the bitmap */
	return NULL;
}
1042 :
/*
 * tbm_shared_iterate - scan through next page of a TIDBitmap
 *
 * As above, but this will iterate using an iterator which is shared
 * across multiple processes.  We need to acquire the iterator LWLock,
 * before accessing the shared members.
 *
 * The output struct is backend-private, so it can be filled in while
 * holding the lock and remains valid after the lock is released.
 */
TBMIterateResult *
tbm_shared_iterate(TBMSharedIterator *iterator)
{
	TBMIterateResult *output = &iterator->output;
	TBMSharedIteratorState *istate = iterator->state;
	PagetableEntry *ptbase = NULL;
	int *idxpages = NULL;
	int *idxchunks = NULL;

	/* Resolve backend-local base pointers; any of these may be absent */
	if (iterator->ptbase != NULL)
		ptbase = iterator->ptbase->ptentry;
	if (iterator->ptpages != NULL)
		idxpages = iterator->ptpages->index;
	if (iterator->ptchunks != NULL)
		idxchunks = iterator->ptchunks->index;

	/* Acquire the LWLock before accessing the shared members */
	LWLockAcquire(&istate->lock, LW_EXCLUSIVE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		int schunkbit = istate->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			istate->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		istate->schunkptr++;
		istate->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		BlockNumber chunk_blockno;

		chunk_blockno = chunk->blockno + istate->schunkbit;

		if (istate->spageptr >= istate->npages ||
			chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			istate->schunkbit++;

			LWLockRelease(&istate->lock);
			return output;
		}
	}

	if (istate->spageptr < istate->npages)
	{
		PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
		int ntuples;

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		istate->spageptr++;

		LWLockRelease(&istate->lock);

		return output;
	}

	LWLockRelease(&istate->lock);

	/* Nothing more in the bitmap */
	return NULL;
}
1136 :
/*
 * tbm_end_iterate - finish an iteration over a TIDBitmap
 *
 * Currently this is just a pfree, but it might do more someday.  (For
 * instance, it could be useful to count open iterators and allow the
 * bitmap to return to read/write status when there are no more iterators.)
 *
 * Note: this does not free the TIDBitmap itself, only the iterator.
 */
void
tbm_end_iterate(TBMIterator *iterator)
{
	pfree(iterator);
}
1149 :
/*
 * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
 *
 * This doesn't free any of the shared state associated with the iterator,
 * just our backend-private state.  (The shared pagetable and index arrays
 * are refcounted and freed elsewhere.)
 */
void
tbm_end_shared_iterate(TBMSharedIterator *iterator)
{
	pfree(iterator);
}
1161 :
1162 : /*
1163 : * tbm_find_pageentry - find a PagetableEntry for the pageno
1164 : *
1165 : * Returns NULL if there is no non-lossy entry for the pageno.
1166 : */
1167 : static const PagetableEntry *
1168 5048 : tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
1169 : {
1170 : const PagetableEntry *page;
1171 :
1172 5048 : if (tbm->nentries == 0) /* in case pagetable doesn't exist */
1173 0 : return NULL;
1174 :
1175 5048 : if (tbm->status == TBM_ONE_PAGE)
1176 : {
1177 0 : page = &tbm->entry1;
1178 0 : if (page->blockno != pageno)
1179 0 : return NULL;
1180 : Assert(!page->ischunk);
1181 0 : return page;
1182 : }
1183 :
1184 5048 : page = pagetable_lookup(tbm->pagetable, pageno);
1185 5048 : if (page == NULL)
1186 512 : return NULL;
1187 4536 : if (page->ischunk)
1188 0 : return NULL; /* don't want a lossy chunk header */
1189 4536 : return page;
1190 : }
1191 :
1192 : /*
1193 : * tbm_get_pageentry - find or create a PagetableEntry for the pageno
1194 : *
1195 : * If new, the entry is marked as an exact (non-chunk) entry.
1196 : *
1197 : * This may cause the table to exceed the desired memory size. It is
1198 : * up to the caller to call tbm_lossify() at the next safe point if so.
1199 : */
1200 : static PagetableEntry *
1201 7086968 : tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
1202 : {
1203 : PagetableEntry *page;
1204 : bool found;
1205 :
1206 7086968 : if (tbm->status == TBM_EMPTY)
1207 : {
1208 : /* Use the fixed slot */
1209 11148 : page = &tbm->entry1;
1210 11148 : found = false;
1211 11148 : tbm->status = TBM_ONE_PAGE;
1212 : }
1213 : else
1214 : {
1215 7075820 : if (tbm->status == TBM_ONE_PAGE)
1216 : {
1217 78422 : page = &tbm->entry1;
1218 78422 : if (page->blockno == pageno)
1219 73372 : return page;
1220 : /* Time to switch from one page to a hashtable */
1221 5050 : tbm_create_pagetable(tbm);
1222 : }
1223 :
1224 : /* Look up or create an entry */
1225 7002448 : page = pagetable_insert(tbm->pagetable, pageno, &found);
1226 : }
1227 :
1228 : /* Initialize it if not present before */
1229 7013596 : if (!found)
1230 : {
1231 255280 : char oldstatus = page->status;
1232 :
1233 1786960 : MemSet(page, 0, sizeof(PagetableEntry));
1234 255280 : page->status = oldstatus;
1235 255280 : page->blockno = pageno;
1236 : /* must count it too */
1237 255280 : tbm->nentries++;
1238 255280 : tbm->npages++;
1239 : }
1240 :
1241 7013596 : return page;
1242 : }
1243 :
1244 : /*
1245 : * tbm_page_is_lossy - is the page marked as lossily stored?
1246 : */
1247 : static bool
1248 7097728 : tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
1249 : {
1250 : PagetableEntry *page;
1251 : BlockNumber chunk_pageno;
1252 : int bitno;
1253 :
1254 : /* we can skip the lookup if there are no lossy chunks */
1255 7097728 : if (tbm->nchunks == 0)
1256 6977098 : return false;
1257 : Assert(tbm->status == TBM_HASH);
1258 :
1259 120630 : bitno = pageno % PAGES_PER_CHUNK;
1260 120630 : chunk_pageno = pageno - bitno;
1261 :
1262 120630 : page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1263 :
1264 120630 : if (page != NULL && page->ischunk)
1265 : {
1266 12432 : int wordnum = WORDNUM(bitno);
1267 12432 : int bitnum = BITNUM(bitno);
1268 :
1269 12432 : if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1270 5712 : return true;
1271 : }
1272 114918 : return false;
1273 : }
1274 :
/*
 * tbm_mark_page_lossy - mark the page number as lossily stored
 *
 * This may cause the table to exceed the desired memory size.  It is
 * up to the caller to call tbm_lossify() at the next safe point if so.
 */
static void
tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	bool found;
	BlockNumber chunk_pageno;
	int bitno;
	int wordnum;
	int bitnum;

	/* We force the bitmap into hashtable mode whenever it's lossy */
	if (tbm->status != TBM_HASH)
		tbm_create_pagetable(tbm);

	/* locate the chunk-header page covering pageno */
	bitno = pageno % PAGES_PER_CHUNK;
	chunk_pageno = pageno - bitno;

	/*
	 * Remove any extant non-lossy entry for the page.  If the page is its own
	 * chunk header, however, we skip this and handle the case below.
	 */
	if (bitno != 0)
	{
		if (pagetable_delete(tbm->pagetable, pageno))
		{
			/* It was present, so adjust counts */
			tbm->nentries--;
			tbm->npages--;		/* assume it must have been non-lossy */
		}
	}

	/* Look up or create entry for chunk-header page */
	page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);

	/* Initialize it if not present before */
	if (!found)
	{
		/* preserve the hashtable's status byte across the MemSet */
		char oldstatus = page->status;

		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* must count it too */
		tbm->nentries++;
		tbm->nchunks++;
	}
	else if (!page->ischunk)
	{
		char oldstatus = page->status;

		/* chunk header page was formerly non-lossy, make it lossy */
		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* we assume it had some tuple bit(s) set, so mark it lossy */
		page->words[0] = ((bitmapword) 1 << 0);
		/* adjust counts: nentries is unchanged, one page became a chunk */
		tbm->nchunks++;
		tbm->npages--;
	}

	/* Now set the original target page's bit */
	wordnum = WORDNUM(bitno);
	bitnum = BITNUM(bitno);
	page->words[wordnum] |= ((bitmapword) 1 << bitnum);
}
1349 :
/*
 * tbm_lossify - lose some information to get back under the memory limit
 *
 * Converts exact per-page entries into lossy chunk bits until nentries
 * drops to maxentries/2, or no further reduction is possible.
 */
static void
tbm_lossify(TIDBitmap *tbm)
{
	pagetable_iterator i;
	PagetableEntry *page;

	/*
	 * XXX Really stupid implementation: this just lossifies pages in
	 * essentially random order.  We should be paying some attention to the
	 * number of bits set in each page, instead.
	 *
	 * Since we are called as soon as nentries exceeds maxentries, we should
	 * push nentries down to significantly less than maxentries, or else we'll
	 * just end up doing this again very soon.  We shoot for maxentries/2.
	 */
	Assert(tbm->iterating == TBM_NOT_ITERATING);
	Assert(tbm->status == TBM_HASH);

	/* resume the scan where the previous lossify round left off */
	pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
	while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
	{
		if (page->ischunk)
			continue;			/* already a chunk header */

		/*
		 * If the page would become a chunk header, we won't save anything by
		 * converting it to lossy, so skip it.
		 */
		if ((page->blockno % PAGES_PER_CHUNK) == 0)
			continue;

		/* This does the dirty work ... */
		tbm_mark_page_lossy(tbm, page->blockno);

		if (tbm->nentries <= tbm->maxentries / 2)
		{
			/*
			 * We have made enough room.  Remember where to start lossifying
			 * next round, so we evenly iterate over the hashtable.
			 */
			tbm->lossify_start = i.cur;
			break;
		}

		/*
		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
		 * hashtable and may have deleted the non-lossy chunk.  We can
		 * continue the same hash table scan, since failure to visit one
		 * element or visiting the newly inserted element, isn't fatal.
		 */
	}

	/*
	 * With a big bitmap and small work_mem, it's possible that we cannot get
	 * under maxentries.  Again, if that happens, we'd end up uselessly
	 * calling tbm_lossify over and over.  To prevent this from becoming a
	 * performance sink, force maxentries up to at least double the current
	 * number of entries.  (In essence, we're admitting inability to fit
	 * within work_mem when we do this.)  Note that this test will not fire if
	 * we broke out of the loop early; and if we didn't, the current number of
	 * entries is simply not reducible any further.
	 */
	if (tbm->nentries > tbm->maxentries / 2)
		tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
}
1418 :
1419 : /*
1420 : * qsort comparator to handle PagetableEntry pointers.
1421 : */
1422 : static int
1423 1342562 : tbm_comparator(const void *left, const void *right)
1424 : {
1425 1342562 : BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1426 1342562 : BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1427 :
1428 1342562 : if (l < r)
1429 620542 : return -1;
1430 722020 : else if (l > r)
1431 722020 : return 1;
1432 0 : return 0;
1433 : }
1434 :
1435 : /*
1436 : * As above, but this will get index into PagetableEntry array. Therefore,
1437 : * it needs to get actual PagetableEntry using the index before comparing the
1438 : * blockno.
1439 : */
1440 : static int
1441 238290 : tbm_shared_comparator(const void *left, const void *right, void *arg)
1442 : {
1443 238290 : PagetableEntry *base = (PagetableEntry *) arg;
1444 238290 : PagetableEntry *lpage = &base[*(int *) left];
1445 238290 : PagetableEntry *rpage = &base[*(int *) right];
1446 :
1447 238290 : if (lpage->blockno < rpage->blockno)
1448 108966 : return -1;
1449 129324 : else if (lpage->blockno > rpage->blockno)
1450 129324 : return 1;
1451 0 : return 0;
1452 : }
1453 :
1454 : /*
1455 : * tbm_attach_shared_iterate
1456 : *
1457 : * Allocate a backend-private iterator and attach the shared iterator state
1458 : * to it so that multiple processed can iterate jointly.
1459 : *
1460 : * We also converts the DSA pointers to local pointers and store them into
1461 : * our private iterator.
1462 : */
1463 : TBMSharedIterator *
1464 696 : tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
1465 : {
1466 : TBMSharedIterator *iterator;
1467 : TBMSharedIteratorState *istate;
1468 :
1469 : /*
1470 : * Create the TBMSharedIterator struct, with enough trailing space to
1471 : * serve the needs of the TBMIterateResult sub-struct.
1472 : */
1473 696 : iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
1474 : MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
1475 :
1476 696 : istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
1477 :
1478 696 : iterator->state = istate;
1479 :
1480 696 : iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
1481 :
1482 696 : if (istate->npages)
1483 696 : iterator->ptpages = dsa_get_address(dsa, istate->spages);
1484 696 : if (istate->nchunks)
1485 60 : iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
1486 :
1487 696 : return iterator;
1488 : }
1489 :
1490 : /*
1491 : * pagetable_allocate
1492 : *
1493 : * Callback function for allocating the memory for hashtable elements.
1494 : * Allocate memory for hashtable elements, using DSA if available.
1495 : */
1496 : static inline void *
1497 8268 : pagetable_allocate(pagetable_hash *pagetable, Size size)
1498 : {
1499 8268 : TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1500 : PTEntryArray *ptbase;
1501 :
1502 8268 : if (tbm->dsa == NULL)
1503 8112 : return MemoryContextAllocExtended(pagetable->ctx, size,
1504 : MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
1505 :
1506 : /*
1507 : * Save the dsapagetable reference in dsapagetableold before allocating
1508 : * new memory so that pagetable_free can free the old entry.
1509 : */
1510 156 : tbm->dsapagetableold = tbm->dsapagetable;
1511 156 : tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
1512 : sizeof(PTEntryArray) + size,
1513 : DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
1514 156 : ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
1515 :
1516 156 : return ptbase->ptentry;
1517 : }
1518 :
1519 : /*
1520 : * pagetable_free
1521 : *
1522 : * Callback function for freeing hash table elements.
1523 : */
1524 : static inline void
1525 8264 : pagetable_free(pagetable_hash *pagetable, void *pointer)
1526 : {
1527 8264 : TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1528 :
1529 : /* pfree the input pointer if DSA is not available */
1530 8264 : if (tbm->dsa == NULL)
1531 8108 : pfree(pointer);
1532 156 : else if (DsaPointerIsValid(tbm->dsapagetableold))
1533 : {
1534 84 : dsa_free(tbm->dsa, tbm->dsapagetableold);
1535 84 : tbm->dsapagetableold = InvalidDsaPointer;
1536 : }
1537 8264 : }
1538 :
1539 : /*
1540 : * tbm_calculate_entries
1541 : *
1542 : * Estimate number of hashtable entries we can have within maxbytes.
1543 : */
1544 : long
1545 532584 : tbm_calculate_entries(double maxbytes)
1546 : {
1547 : long nbuckets;
1548 :
1549 : /*
1550 : * Estimate number of hashtable entries we can have within maxbytes. This
1551 : * estimates the hash cost as sizeof(PagetableEntry), which is good enough
1552 : * for our purpose. Also count an extra Pointer per entry for the arrays
1553 : * created during iteration readout.
1554 : */
1555 532584 : nbuckets = maxbytes /
1556 : (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
1557 532584 : nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
1558 532584 : nbuckets = Max(nbuckets, 16); /* sanity limit */
1559 :
1560 532584 : return nbuckets;
1561 : }
|