LCOV - code coverage report
Current view: top level - src/backend/nodes - tidbitmap.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 424 482 88.0 %
Date: 2024-09-08 21:11:44 Functions: 28 30 93.3 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * tidbitmap.c
       4             :  *    PostgreSQL tuple-id (TID) bitmap package
       5             :  *
       6             :  * This module provides bitmap data structures that are spiritually
       7             :  * similar to Bitmapsets, but are specially adapted to store sets of
       8             :  * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
       9             :  * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
      10             :  * Also, since we wish to be able to store very large tuple sets in
      11             :  * memory with this data structure, we support "lossy" storage, in which
      12             :  * we no longer remember individual tuple offsets on a page but only the
      13             :  * fact that a particular page needs to be visited.
      14             :  *
      15             :  * The "lossy" storage uses one bit per disk page, so at the standard 8K
      16             :  * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
      17             :  * of memory.  People pushing around tables of that size should have a
      18             :  * couple of Mb to spare, so we don't worry about providing a second level
      19             :  * of lossiness.  In theory we could fall back to page ranges at some
      20             :  * point, but for now that seems useless complexity.
      21             :  *
      22             :  * We also support the notion of candidate matches, or rechecking.  This
      23             :  * means we know that a search need visit only some tuples on a page,
      24             :  * but we are not certain that all of those tuples are real matches.
      25             :  * So the eventual heap scan must recheck the quals for these tuples only,
      26             :  * rather than rechecking the quals for all tuples on the page as in the
      27             :  * lossy-bitmap case.  Rechecking can be specified when TIDs are inserted
      28             :  * into a bitmap, and it can also happen internally when we AND a lossy
      29             :  * and a non-lossy page.
      30             :  *
      31             :  *
      32             :  * Copyright (c) 2003-2024, PostgreSQL Global Development Group
      33             :  *
      34             :  * IDENTIFICATION
      35             :  *    src/backend/nodes/tidbitmap.c
      36             :  *
      37             :  *-------------------------------------------------------------------------
      38             :  */
      39             : #include "postgres.h"
      40             : 
      41             : #include <limits.h>
      42             : 
      43             : #include "access/htup_details.h"
      44             : #include "common/hashfn.h"
      45             : #include "common/int.h"
      46             : #include "nodes/bitmapset.h"
      47             : #include "nodes/tidbitmap.h"
      48             : #include "storage/lwlock.h"
      49             : #include "utils/dsa.h"
      50             : 
      51             : /*
      52             :  * The maximum number of tuples per page is not large (typically 256 with
      53             :  * 8K pages, or 1024 with 32K pages).  So there's not much point in making
      54             :  * the per-page bitmaps variable size.  We just legislate that the size
      55             :  * is this:
      56             :  */
      57             : #define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage
      58             : 
      59             : /*
      60             :  * When we have to switch over to lossy storage, we use a data structure
      61             :  * with one bit per page, where all pages having the same number DIV
      62             :  * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
      63             :  * and has the bit set for a given page, there must not be a per-page entry
      64             :  * for that page in the page table.
      65             :  *
      66             :  * We actually store both exact pages and lossy chunks in the same hash
      67             :  * table, using identical data structures.  (This is because the memory
      68             :  * management for hashtables doesn't easily/efficiently allow space to be
      69             :  * transferred easily from one hashtable to another.)  Therefore it's best
      70             :  * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
      71             :  * too different.  But we also want PAGES_PER_CHUNK to be a power of 2 to
      72             :  * avoid expensive integer remainder operations.  So, define it like this:
      73             :  */
      74             : #define PAGES_PER_CHUNK  (BLCKSZ / 32)
      75             : 
      76             : /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
      77             : 
      78             : #define WORDNUM(x)  ((x) / BITS_PER_BITMAPWORD)
      79             : #define BITNUM(x)   ((x) % BITS_PER_BITMAPWORD)
      80             : 
      81             : /* number of active words for an exact page: */
      82             : #define WORDS_PER_PAGE  ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
      83             : /* number of active words for a lossy chunk: */
      84             : #define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
      85             : 
      86             : /*
      87             :  * The hashtable entries are represented by this data structure.  For
      88             :  * an exact page, blockno is the page number and bit k of the bitmap
      89             :  * represents tuple offset k+1.  For a lossy chunk, blockno is the first
      90             :  * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
      91             :  * bit k represents page blockno+k.  Note that it is not possible to
      92             :  * have exact storage for the first page of a chunk if we are using
      93             :  * lossy storage for any page in the chunk's range, since the same
      94             :  * hashtable entry has to serve both purposes.
      95             :  *
      96             :  * recheck is used only on exact pages --- it indicates that although
      97             :  * only the stated tuples need be checked, the full index qual condition
      98             :  * must be checked for each (ie, these are candidate matches).
      99             :  */
     100             : typedef struct PagetableEntry
     101             : {
     102             :     BlockNumber blockno;        /* page number (hashtable key) */
     103             :     char        status;         /* hash entry status */
     104             :     bool        ischunk;        /* T = lossy storage, F = exact */
     105             :     bool        recheck;        /* should the tuples be rechecked? */
     106             :     bitmapword  words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
     107             : } PagetableEntry;
     108             : 
     109             : /*
     110             :  * Holds array of pagetable entries.
     111             :  */
     112             : typedef struct PTEntryArray
     113             : {
     114             :     pg_atomic_uint32 refcount;  /* no. of iterator attached */
     115             :     PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
     116             : } PTEntryArray;
     117             : 
     118             : /*
     119             :  * We want to avoid the overhead of creating the hashtable, which is
     120             :  * comparatively large, when not necessary. Particularly when we are using a
     121             :  * bitmap scan on the inside of a nestloop join: a bitmap may well live only
     122             :  * long enough to accumulate one entry in such cases.  We therefore avoid
     123             :  * creating an actual hashtable until we need two pagetable entries.  When
     124             :  * just one pagetable entry is needed, we store it in a fixed field of
     125             :  * TIDBitMap.  (NOTE: we don't get rid of the hashtable if the bitmap later
     126             :  * shrinks down to zero or one page again.  So, status can be TBM_HASH even
     127             :  * when nentries is zero or one.)
     128             :  */
     129             : typedef enum
     130             : {
     131             :     TBM_EMPTY,                  /* no hashtable, nentries == 0 */
     132             :     TBM_ONE_PAGE,               /* entry1 contains the single entry */
     133             :     TBM_HASH,                   /* pagetable is valid, entry1 is not */
     134             : } TBMStatus;
     135             : 
     136             : /*
     137             :  * Current iterating state of the TBM.
     138             :  */
     139             : typedef enum
     140             : {
     141             :     TBM_NOT_ITERATING,          /* not yet converted to page and chunk array */
     142             :     TBM_ITERATING_PRIVATE,      /* converted to local page and chunk array */
     143             :     TBM_ITERATING_SHARED,       /* converted to shared page and chunk array */
     144             : } TBMIteratingState;
     145             : 
     146             : /*
     147             :  * Here is the representation for a whole TIDBitMap:
     148             :  */
     149             : struct TIDBitmap
     150             : {
     151             :     NodeTag     type;           /* to make it a valid Node */
     152             :     MemoryContext mcxt;         /* memory context containing me */
     153             :     TBMStatus   status;         /* see codes above */
     154             :     struct pagetable_hash *pagetable;   /* hash table of PagetableEntry's */
     155             :     int         nentries;       /* number of entries in pagetable */
     156             :     int         maxentries;     /* limit on same to meet maxbytes */
     157             :     int         npages;         /* number of exact entries in pagetable */
     158             :     int         nchunks;        /* number of lossy entries in pagetable */
     159             :     TBMIteratingState iterating;    /* tbm_begin_iterate called? */
     160             :     uint32      lossify_start;  /* offset to start lossifying hashtable at */
     161             :     PagetableEntry entry1;      /* used when status == TBM_ONE_PAGE */
     162             :     /* these are valid when iterating is true: */
     163             :     PagetableEntry **spages;    /* sorted exact-page list, or NULL */
     164             :     PagetableEntry **schunks;   /* sorted lossy-chunk list, or NULL */
     165             :     dsa_pointer dsapagetable;   /* dsa_pointer to the element array */
     166             :     dsa_pointer dsapagetableold;    /* dsa_pointer to the old element array */
     167             :     dsa_pointer ptpages;        /* dsa_pointer to the page array */
     168             :     dsa_pointer ptchunks;       /* dsa_pointer to the chunk array */
     169             :     dsa_area   *dsa;            /* reference to per-query dsa area */
     170             : };
     171             : 
     172             : /*
     173             :  * When iterating over a bitmap in sorted order, a TBMIterator is used to
     174             :  * track our progress.  There can be several iterators scanning the same
     175             :  * bitmap concurrently.  Note that the bitmap becomes read-only as soon as
     176             :  * any iterator is created.
     177             :  */
     178             : struct TBMIterator
     179             : {
     180             :     TIDBitmap  *tbm;            /* TIDBitmap we're iterating over */
     181             :     int         spageptr;       /* next spages index */
     182             :     int         schunkptr;      /* next schunks index */
     183             :     int         schunkbit;      /* next bit to check in current schunk */
     184             :     TBMIterateResult output;    /* MUST BE LAST (because variable-size) */
     185             : };
     186             : 
     187             : /*
     188             :  * Holds the shared members of the iterator so that multiple processes
     189             :  * can jointly iterate.
     190             :  */
     191             : typedef struct TBMSharedIteratorState
     192             : {
     193             :     int         nentries;       /* number of entries in pagetable */
     194             :     int         maxentries;     /* limit on same to meet maxbytes */
     195             :     int         npages;         /* number of exact entries in pagetable */
     196             :     int         nchunks;        /* number of lossy entries in pagetable */
     197             :     dsa_pointer pagetable;      /* dsa pointers to head of pagetable data */
     198             :     dsa_pointer spages;         /* dsa pointer to page array */
     199             :     dsa_pointer schunks;        /* dsa pointer to chunk array */
     200             :     LWLock      lock;           /* lock to protect below members */
     201             :     int         spageptr;       /* next spages index */
     202             :     int         schunkptr;      /* next schunks index */
     203             :     int         schunkbit;      /* next bit to check in current schunk */
     204             : } TBMSharedIteratorState;
     205             : 
     206             : /*
     207             :  * pagetable iteration array.
     208             :  */
     209             : typedef struct PTIterationArray
     210             : {
     211             :     pg_atomic_uint32 refcount;  /* no. of iterator attached */
     212             :     int         index[FLEXIBLE_ARRAY_MEMBER];   /* index array */
     213             : } PTIterationArray;
     214             : 
     215             : /*
     216             :  * same as TBMIterator, but it is used for joint iteration, therefore this
     217             :  * also holds a reference to the shared state.
     218             :  */
     219             : struct TBMSharedIterator
     220             : {
     221             :     TBMSharedIteratorState *state;  /* shared state */
     222             :     PTEntryArray *ptbase;       /* pagetable element array */
     223             :     PTIterationArray *ptpages;  /* sorted exact page index list */
     224             :     PTIterationArray *ptchunks; /* sorted lossy page index list */
     225             :     TBMIterateResult output;    /* MUST BE LAST (because variable-size) */
     226             : };
     227             : 
     228             : /* Local function prototypes */
     229             : static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
     230             : static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
     231             :                                const TIDBitmap *b);
     232             : static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
     233             :                                                 BlockNumber pageno);
     234             : static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
     235             : static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
     236             : static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
     237             : static void tbm_lossify(TIDBitmap *tbm);
     238             : static int  tbm_comparator(const void *left, const void *right);
     239             : static int  tbm_shared_comparator(const void *left, const void *right,
     240             :                                   void *arg);
     241             : 
     242             : /* define hashtable mapping block numbers to PagetableEntry's */
     243             : #define SH_USE_NONDEFAULT_ALLOCATOR
     244             : #define SH_PREFIX pagetable
     245             : #define SH_ELEMENT_TYPE PagetableEntry
     246             : #define SH_KEY_TYPE BlockNumber
     247             : #define SH_KEY blockno
     248             : #define SH_HASH_KEY(tb, key) murmurhash32(key)
     249             : #define SH_EQUAL(tb, a, b) a == b
     250             : #define SH_SCOPE static inline
     251             : #define SH_DEFINE
     252             : #define SH_DECLARE
     253             : #include "lib/simplehash.h"
     254             : 
     255             : 
     256             : /*
     257             :  * tbm_create - create an initially-empty bitmap
     258             :  *
     259             :  * The bitmap will live in the memory context that is CurrentMemoryContext
     260             :  * at the time of this call.  It will be limited to (approximately) maxbytes
     261             :  * total memory consumption.  If the DSA passed to this function is not NULL
     262             :  * then the memory for storing elements of the underlying page table will
     263             :  * be allocated from the DSA.
     264             :  */
     265             : TIDBitmap *
     266       17900 : tbm_create(long maxbytes, dsa_area *dsa)
     267             : {
     268             :     TIDBitmap  *tbm;
     269             : 
     270             :     /* Create the TIDBitmap struct and zero all its fields */
     271       17900 :     tbm = makeNode(TIDBitmap);
     272             : 
     273       17900 :     tbm->mcxt = CurrentMemoryContext;
     274       17900 :     tbm->status = TBM_EMPTY;
     275             : 
     276       17900 :     tbm->maxentries = (int) tbm_calculate_entries(maxbytes);
     277       17900 :     tbm->lossify_start = 0;
     278       17900 :     tbm->dsa = dsa;
     279       17900 :     tbm->dsapagetable = InvalidDsaPointer;
     280       17900 :     tbm->dsapagetableold = InvalidDsaPointer;
     281       17900 :     tbm->ptpages = InvalidDsaPointer;
     282       17900 :     tbm->ptchunks = InvalidDsaPointer;
     283             : 
     284       17900 :     return tbm;
     285             : }
     286             : 
     287             : /*
     288             :  * Actually create the hashtable.  Since this is a moderately expensive
     289             :  * proposition, we don't do it until we have to.
     290             :  */
     291             : static void
     292        8154 : tbm_create_pagetable(TIDBitmap *tbm)
     293             : {
     294             :     Assert(tbm->status != TBM_HASH);
     295             :     Assert(tbm->pagetable == NULL);
     296             : 
     297        8154 :     tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
     298             : 
     299             :     /* If entry1 is valid, push it into the hashtable */
     300        8154 :     if (tbm->status == TBM_ONE_PAGE)
     301             :     {
     302             :         PagetableEntry *page;
     303             :         bool        found;
     304             :         char        oldstatus;
     305             : 
     306        5286 :         page = pagetable_insert(tbm->pagetable,
     307             :                                 tbm->entry1.blockno,
     308             :                                 &found);
     309             :         Assert(!found);
     310        5286 :         oldstatus = page->status;
     311        5286 :         memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
     312        5286 :         page->status = oldstatus;
     313             :     }
     314             : 
     315        8154 :     tbm->status = TBM_HASH;
     316        8154 : }
     317             : 
     318             : /*
     319             :  * tbm_free - free a TIDBitmap
     320             :  */
     321             : void
     322       17834 : tbm_free(TIDBitmap *tbm)
     323             : {
     324       17834 :     if (tbm->pagetable)
     325        8154 :         pagetable_destroy(tbm->pagetable);
     326       17834 :     if (tbm->spages)
     327        5128 :         pfree(tbm->spages);
     328       17834 :     if (tbm->schunks)
     329        2880 :         pfree(tbm->schunks);
     330       17834 :     pfree(tbm);
     331       17834 : }
     332             : 
     333             : /*
     334             :  * tbm_free_shared_area - free shared state
     335             :  *
     336             :  * Free shared iterator state, Also free shared pagetable and iterator arrays
     337             :  * memory if they are not referred by any of the shared iterator i.e recount
     338             :  * is becomes 0.
     339             :  */
     340             : void
     341         108 : tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
     342             : {
     343         108 :     TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
     344             :     PTEntryArray *ptbase;
     345             :     PTIterationArray *ptpages;
     346             :     PTIterationArray *ptchunks;
     347             : 
     348         108 :     if (DsaPointerIsValid(istate->pagetable))
     349             :     {
     350         108 :         ptbase = dsa_get_address(dsa, istate->pagetable);
     351         108 :         if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
     352          54 :             dsa_free(dsa, istate->pagetable);
     353             :     }
     354         108 :     if (DsaPointerIsValid(istate->spages))
     355             :     {
     356         108 :         ptpages = dsa_get_address(dsa, istate->spages);
     357         108 :         if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
     358          54 :             dsa_free(dsa, istate->spages);
     359             :     }
     360         108 :     if (DsaPointerIsValid(istate->schunks))
     361             :     {
     362           0 :         ptchunks = dsa_get_address(dsa, istate->schunks);
     363           0 :         if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
     364           0 :             dsa_free(dsa, istate->schunks);
     365             :     }
     366             : 
     367         108 :     dsa_free(dsa, dp);
     368         108 : }
     369             : 
     370             : /*
     371             :  * tbm_add_tuples - add some tuple IDs to a TIDBitmap
     372             :  *
     373             :  * If recheck is true, then the recheck flag will be set in the
     374             :  * TBMIterateResult when any of these tuples are reported out.
     375             :  */
     376             : void
     377     6404262 : tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
     378             :                bool recheck)
     379             : {
     380     6404262 :     BlockNumber currblk = InvalidBlockNumber;
     381     6404262 :     PagetableEntry *page = NULL;    /* only valid when currblk is valid */
     382             :     int         i;
     383             : 
     384             :     Assert(tbm->iterating == TBM_NOT_ITERATING);
     385    14989982 :     for (i = 0; i < ntids; i++)
     386             :     {
     387     8585720 :         BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
     388     8585720 :         OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
     389             :         int         wordnum,
     390             :                     bitnum;
     391             : 
     392             :         /* safety check to ensure we don't overrun bit array bounds */
     393     8585720 :         if (off < 1 || off > MAX_TUPLES_PER_PAGE)
     394           0 :             elog(ERROR, "tuple offset out of range: %u", off);
     395             : 
     396             :         /*
     397             :          * Look up target page unless we already did.  This saves cycles when
     398             :          * the input includes consecutive tuples on the same page, which is
     399             :          * common enough to justify an extra test here.
     400             :          */
     401     8585720 :         if (blk != currblk)
     402             :         {
     403     7180972 :             if (tbm_page_is_lossy(tbm, blk))
     404        2856 :                 page = NULL;    /* remember page is lossy */
     405             :             else
     406     7178116 :                 page = tbm_get_pageentry(tbm, blk);
     407     7180972 :             currblk = blk;
     408             :         }
     409             : 
     410     8585720 :         if (page == NULL)
     411        2856 :             continue;           /* whole page is already marked */
     412             : 
     413     8582864 :         if (page->ischunk)
     414             :         {
     415             :             /* The page is a lossy chunk header, set bit for itself */
     416           0 :             wordnum = bitnum = 0;
     417             :         }
     418             :         else
     419             :         {
     420             :             /* Page is exact, so set bit for individual tuple */
     421     8582864 :             wordnum = WORDNUM(off - 1);
     422     8582864 :             bitnum = BITNUM(off - 1);
     423             :         }
     424     8582864 :         page->words[wordnum] |= ((bitmapword) 1 << bitnum);
     425     8582864 :         page->recheck |= recheck;
     426             : 
     427     8582864 :         if (tbm->nentries > tbm->maxentries)
     428             :         {
     429          24 :             tbm_lossify(tbm);
     430             :             /* Page could have been converted to lossy, so force new lookup */
     431          24 :             currblk = InvalidBlockNumber;
     432             :         }
     433             :     }
     434     6404262 : }
     435             : 
     436             : /*
     437             :  * tbm_add_page - add a whole page to a TIDBitmap
     438             :  *
     439             :  * This causes the whole page to be reported (with the recheck flag)
     440             :  * when the TIDBitmap is scanned.
     441             :  */
     442             : void
     443      147980 : tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
     444             : {
     445             :     /* Enter the page in the bitmap, or mark it lossy if already present */
     446      147980 :     tbm_mark_page_lossy(tbm, pageno);
     447             :     /* If we went over the memory limit, lossify some more pages */
     448      147980 :     if (tbm->nentries > tbm->maxentries)
     449           0 :         tbm_lossify(tbm);
     450      147980 : }
     451             : 
     452             : /*
     453             :  * tbm_union - set union
     454             :  *
     455             :  * a is modified in-place, b is not changed
     456             :  */
     457             : void
     458           0 : tbm_union(TIDBitmap *a, const TIDBitmap *b)
     459             : {
     460             :     Assert(!a->iterating);
     461             :     /* Nothing to do if b is empty */
     462           0 :     if (b->nentries == 0)
     463           0 :         return;
     464             :     /* Scan through chunks and pages in b, merge into a */
     465           0 :     if (b->status == TBM_ONE_PAGE)
     466           0 :         tbm_union_page(a, &b->entry1);
     467             :     else
     468             :     {
     469             :         pagetable_iterator i;
     470             :         PagetableEntry *bpage;
     471             : 
     472             :         Assert(b->status == TBM_HASH);
     473           0 :         pagetable_start_iterate(b->pagetable, &i);
     474           0 :         while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
     475           0 :             tbm_union_page(a, bpage);
     476             :     }
     477             : }
     478             : 
     479             : /* Process one page of b during a union op */
     480             : static void
     481           0 : tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
     482             : {
     483             :     PagetableEntry *apage;
     484             :     int         wordnum;
     485             : 
     486           0 :     if (bpage->ischunk)
     487             :     {
     488             :         /* Scan b's chunk, mark each indicated page lossy in a */
     489           0 :         for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
     490             :         {
     491           0 :             bitmapword  w = bpage->words[wordnum];
     492             : 
     493           0 :             if (w != 0)
     494             :             {
     495             :                 BlockNumber pg;
     496             : 
     497           0 :                 pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
     498           0 :                 while (w != 0)
     499             :                 {
     500           0 :                     if (w & 1)
     501           0 :                         tbm_mark_page_lossy(a, pg);
     502           0 :                     pg++;
     503           0 :                     w >>= 1;
     504             :                 }
     505             :             }
     506             :         }
     507             :     }
     508           0 :     else if (tbm_page_is_lossy(a, bpage->blockno))
     509             :     {
     510             :         /* page is already lossy in a, nothing to do */
     511           0 :         return;
     512             :     }
     513             :     else
     514             :     {
     515           0 :         apage = tbm_get_pageentry(a, bpage->blockno);
     516           0 :         if (apage->ischunk)
     517             :         {
     518             :             /* The page is a lossy chunk header, set bit for itself */
     519           0 :             apage->words[0] |= ((bitmapword) 1 << 0);
     520             :         }
     521             :         else
     522             :         {
     523             :             /* Both pages are exact, merge at the bit level */
     524           0 :             for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
     525           0 :                 apage->words[wordnum] |= bpage->words[wordnum];
     526           0 :             apage->recheck |= bpage->recheck;
     527             :         }
     528             :     }
     529             : 
     530           0 :     if (a->nentries > a->maxentries)
     531           0 :         tbm_lossify(a);
     532             : }
     533             : 
     534             : /*
     535             :  * tbm_intersect - set intersection
     536             :  *
     537             :  * a is modified in-place, b is not changed
     538             :  */
     539             : void
     540          80 : tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
     541             : {
     542             :     Assert(!a->iterating);
     543             :     /* Nothing to do if a is empty */
     544          80 :     if (a->nentries == 0)
     545           0 :         return;
     546             :     /* Scan through chunks and pages in a, try to match to b */
     547          80 :     if (a->status == TBM_ONE_PAGE)
     548             :     {
     549          52 :         if (tbm_intersect_page(a, &a->entry1, b))
     550             :         {
     551             :             /* Page is now empty, remove it from a */
     552             :             Assert(!a->entry1.ischunk);
     553           0 :             a->npages--;
     554           0 :             a->nentries--;
     555             :             Assert(a->nentries == 0);
     556           0 :             a->status = TBM_EMPTY;
     557             :         }
     558             :     }
     559             :     else
     560             :     {
     561             :         pagetable_iterator i;
     562             :         PagetableEntry *apage;
     563             : 
     564             :         Assert(a->status == TBM_HASH);
     565          28 :         pagetable_start_iterate(a->pagetable, &i);
     566        4834 :         while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
     567             :         {
     568        4806 :             if (tbm_intersect_page(a, apage, b))
     569             :             {
     570             :                 /* Page or chunk is now empty, remove it from a */
     571        4626 :                 if (apage->ischunk)
     572           0 :                     a->nchunks--;
     573             :                 else
     574        4626 :                     a->npages--;
     575        4626 :                 a->nentries--;
     576        4626 :                 if (!pagetable_delete(a->pagetable, apage->blockno))
     577           0 :                     elog(ERROR, "hash table corrupted");
     578             :             }
     579             :         }
     580             :     }
     581             : }
     582             : 
     583             : /*
     584             :  * Process one page of a during an intersection op
     585             :  *
     586             :  * Returns true if apage is now empty and should be deleted from a
     587             :  */
     588             : static bool
     589        4858 : tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
     590             : {
     591             :     const PagetableEntry *bpage;
     592             :     int         wordnum;
     593             : 
     594        4858 :     if (apage->ischunk)
     595             :     {
     596             :         /* Scan each bit in chunk, try to clear */
     597          30 :         bool        candelete = true;
     598             : 
     599         150 :         for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
     600             :         {
     601         120 :             bitmapword  w = apage->words[wordnum];
     602             : 
     603         120 :             if (w != 0)
     604             :             {
     605         108 :                 bitmapword  neww = w;
     606             :                 BlockNumber pg;
     607             :                 int         bitnum;
     608             : 
     609         108 :                 pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
     610         108 :                 bitnum = 0;
     611        6606 :                 while (w != 0)
     612             :                 {
     613        6498 :                     if (w & 1)
     614             :                     {
     615        3360 :                         if (!tbm_page_is_lossy(b, pg) &&
     616         252 :                             tbm_find_pageentry(b, pg) == NULL)
     617             :                         {
     618             :                             /* Page is not in b at all, lose lossy bit */
     619           0 :                             neww &= ~((bitmapword) 1 << bitnum);
     620             :                         }
     621             :                     }
     622        6498 :                     pg++;
     623        6498 :                     bitnum++;
     624        6498 :                     w >>= 1;
     625             :                 }
     626         108 :                 apage->words[wordnum] = neww;
     627         108 :                 if (neww != 0)
     628         108 :                     candelete = false;
     629             :             }
     630             :         }
     631          30 :         return candelete;
     632             :     }
     633        4828 :     else if (tbm_page_is_lossy(b, apage->blockno))
     634             :     {
     635             :         /*
     636             :          * Some of the tuples in 'a' might not satisfy the quals for 'b', but
     637             :          * because the page 'b' is lossy, we don't know which ones. Therefore
     638             :          * we mark 'a' as requiring rechecks, to indicate that at most those
     639             :          * tuples set in 'a' are matches.
     640             :          */
     641           0 :         apage->recheck = true;
     642           0 :         return false;
     643             :     }
     644             :     else
     645             :     {
     646        4828 :         bool        candelete = true;
     647             : 
     648        4828 :         bpage = tbm_find_pageentry(b, apage->blockno);
     649        4828 :         if (bpage != NULL)
     650             :         {
     651             :             /* Both pages are exact, merge at the bit level */
     652             :             Assert(!bpage->ischunk);
     653       25824 :             for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
     654             :             {
     655       21520 :                 apage->words[wordnum] &= bpage->words[wordnum];
     656       21520 :                 if (apage->words[wordnum] != 0)
     657         208 :                     candelete = false;
     658             :             }
     659        4304 :             apage->recheck |= bpage->recheck;
     660             :         }
     661             :         /* If there is no matching b page, we can just delete the a page */
     662        4828 :         return candelete;
     663             :     }
     664             : }
     665             : 
     666             : /*
     667             :  * tbm_is_empty - is a TIDBitmap completely empty?
     668             :  */
     669             : bool
     670         694 : tbm_is_empty(const TIDBitmap *tbm)
     671             : {
     672         694 :     return (tbm->nentries == 0);
     673             : }
     674             : 
     675             : /*
     676             :  * tbm_begin_iterate - prepare to iterate through a TIDBitmap
     677             :  *
     678             :  * The TBMIterator struct is created in the caller's memory context.
     679             :  * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
     680             :  * okay to just allow the memory context to be released, too.  It is caller's
     681             :  * responsibility not to touch the TBMIterator anymore once the TIDBitmap
     682             :  * is freed.
     683             :  *
     684             :  * NB: after this is called, it is no longer allowed to modify the contents
     685             :  * of the bitmap.  However, you can call this multiple times to scan the
     686             :  * contents repeatedly, including parallel scans.
     687             :  */
     688             : TBMIterator *
     689       34848 : tbm_begin_iterate(TIDBitmap *tbm)
     690             : {
     691             :     TBMIterator *iterator;
     692             : 
     693             :     Assert(tbm->iterating != TBM_ITERATING_SHARED);
     694             : 
     695             :     /*
     696             :      * Create the TBMIterator struct, with enough trailing space to serve the
     697             :      * needs of the TBMIterateResult sub-struct.
     698             :      */
     699       34848 :     iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
     700             :                                       MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
     701       34848 :     iterator->tbm = tbm;
     702             : 
     703             :     /*
     704             :      * Initialize iteration pointers.
     705             :      */
     706       34848 :     iterator->spageptr = 0;
     707       34848 :     iterator->schunkptr = 0;
     708       34848 :     iterator->schunkbit = 0;
     709             : 
     710             :     /*
     711             :      * If we have a hashtable, create and fill the sorted page lists, unless
     712             :      * we already did that for a previous iterator.  Note that the lists are
     713             :      * attached to the bitmap not the iterator, so they can be used by more
     714             :      * than one iterator.
     715             :      */
     716       34848 :     if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
     717             :     {
     718             :         pagetable_iterator i;
     719             :         PagetableEntry *page;
     720             :         int         npages;
     721             :         int         nchunks;
     722             : 
     723        8002 :         if (!tbm->spages && tbm->npages > 0)
     724        5128 :             tbm->spages = (PagetableEntry **)
     725        5128 :                 MemoryContextAlloc(tbm->mcxt,
     726        5128 :                                    tbm->npages * sizeof(PagetableEntry *));
     727        8002 :         if (!tbm->schunks && tbm->nchunks > 0)
     728        2880 :             tbm->schunks = (PagetableEntry **)
     729        2880 :                 MemoryContextAlloc(tbm->mcxt,
     730        2880 :                                    tbm->nchunks * sizeof(PagetableEntry *));
     731             : 
     732        8002 :         npages = nchunks = 0;
     733        8002 :         pagetable_start_iterate(tbm->pagetable, &i);
     734      215258 :         while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
     735             :         {
     736      207256 :             if (page->ischunk)
     737        2922 :                 tbm->schunks[nchunks++] = page;
     738             :             else
     739      204334 :                 tbm->spages[npages++] = page;
     740             :         }
     741             :         Assert(npages == tbm->npages);
     742             :         Assert(nchunks == tbm->nchunks);
     743        8002 :         if (npages > 1)
     744        5124 :             qsort(tbm->spages, npages, sizeof(PagetableEntry *),
     745             :                   tbm_comparator);
     746        8002 :         if (nchunks > 1)
     747          12 :             qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
     748             :                   tbm_comparator);
     749             :     }
     750             : 
     751       34848 :     tbm->iterating = TBM_ITERATING_PRIVATE;
     752             : 
     753       34848 :     return iterator;
     754             : }
     755             : 
     756             : /*
     757             :  * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
     758             :  *
     759             :  * The necessary shared state will be allocated from the DSA passed to
     760             :  * tbm_create, so that multiple processes can attach to it and iterate jointly.
     761             :  *
     762             :  * This will convert the pagetable hash into page and chunk array of the index
     763             :  * into pagetable array.
     764             :  */
     765             : dsa_pointer
     766         144 : tbm_prepare_shared_iterate(TIDBitmap *tbm)
     767             : {
     768             :     dsa_pointer dp;
     769             :     TBMSharedIteratorState *istate;
     770         144 :     PTEntryArray *ptbase = NULL;
     771         144 :     PTIterationArray *ptpages = NULL;
     772         144 :     PTIterationArray *ptchunks = NULL;
     773             : 
     774             :     Assert(tbm->dsa != NULL);
     775             :     Assert(tbm->iterating != TBM_ITERATING_PRIVATE);
     776             : 
     777             :     /*
     778             :      * Allocate TBMSharedIteratorState from DSA to hold the shared members and
     779             :      * lock, this will also be used by multiple worker for shared iterate.
     780             :      */
     781         144 :     dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
     782         144 :     istate = dsa_get_address(tbm->dsa, dp);
     783             : 
     784             :     /*
     785             :      * If we're not already iterating, create and fill the sorted page lists.
     786             :      * (If we are, the sorted page lists are already stored in the TIDBitmap,
     787             :      * and we can just reuse them.)
     788             :      */
     789         144 :     if (tbm->iterating == TBM_NOT_ITERATING)
     790             :     {
     791             :         pagetable_iterator i;
     792             :         PagetableEntry *page;
     793             :         int         idx;
     794             :         int         npages;
     795             :         int         nchunks;
     796             : 
     797             :         /*
     798             :          * Allocate the page and chunk array memory from the DSA to share
     799             :          * across multiple processes.
     800             :          */
     801          72 :         if (tbm->npages)
     802             :         {
     803          72 :             tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
     804             :                                         tbm->npages * sizeof(int));
     805          72 :             ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
     806          72 :             pg_atomic_init_u32(&ptpages->refcount, 0);
     807             :         }
     808          72 :         if (tbm->nchunks)
     809             :         {
     810           6 :             tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
     811             :                                          tbm->nchunks * sizeof(int));
     812           6 :             ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
     813           6 :             pg_atomic_init_u32(&ptchunks->refcount, 0);
     814             :         }
     815             : 
     816             :         /*
     817             :          * If TBM status is TBM_HASH then iterate over the pagetable and
     818             :          * convert it to page and chunk arrays.  But if it's in the
     819             :          * TBM_ONE_PAGE mode then directly allocate the space for one entry
     820             :          * from the DSA.
     821             :          */
     822          72 :         npages = nchunks = 0;
     823          72 :         if (tbm->status == TBM_HASH)
     824             :         {
     825          72 :             ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
     826             : 
     827          72 :             pagetable_start_iterate(tbm->pagetable, &i);
     828       27102 :             while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
     829             :             {
     830       27030 :                 idx = page - ptbase->ptentry;
     831       27030 :                 if (page->ischunk)
     832          24 :                     ptchunks->index[nchunks++] = idx;
     833             :                 else
     834       27006 :                     ptpages->index[npages++] = idx;
     835             :             }
     836             : 
     837             :             Assert(npages == tbm->npages);
     838             :             Assert(nchunks == tbm->nchunks);
     839             :         }
     840           0 :         else if (tbm->status == TBM_ONE_PAGE)
     841             :         {
     842             :             /*
     843             :              * In one page mode allocate the space for one pagetable entry,
     844             :              * initialize it, and directly store its index (i.e. 0) in the
     845             :              * page array.
     846             :              */
     847           0 :             tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
     848             :                                              sizeof(PagetableEntry));
     849           0 :             ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
     850           0 :             memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
     851           0 :             ptpages->index[0] = 0;
     852             :         }
     853             : 
     854          72 :         if (ptbase != NULL)
     855          72 :             pg_atomic_init_u32(&ptbase->refcount, 0);
     856          72 :         if (npages > 1)
     857          72 :             qsort_arg(ptpages->index, npages, sizeof(int),
     858          72 :                       tbm_shared_comparator, ptbase->ptentry);
     859          72 :         if (nchunks > 1)
     860           6 :             qsort_arg(ptchunks->index, nchunks, sizeof(int),
     861           6 :                       tbm_shared_comparator, ptbase->ptentry);
     862             :     }
     863             : 
     864             :     /*
     865             :      * Store the TBM members in the shared state so that we can share them
     866             :      * across multiple processes.
     867             :      */
     868         144 :     istate->nentries = tbm->nentries;
     869         144 :     istate->maxentries = tbm->maxentries;
     870         144 :     istate->npages = tbm->npages;
     871         144 :     istate->nchunks = tbm->nchunks;
     872         144 :     istate->pagetable = tbm->dsapagetable;
     873         144 :     istate->spages = tbm->ptpages;
     874         144 :     istate->schunks = tbm->ptchunks;
     875             : 
     876         144 :     ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
     877         144 :     ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
     878         144 :     ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
     879             : 
     880             :     /*
     881             :      * For every shared iterator, referring to pagetable and iterator array,
     882             :      * increase the refcount by 1 so that while freeing the shared iterator we
     883             :      * don't free pagetable and iterator array until its refcount becomes 0.
     884             :      */
     885         144 :     if (ptbase != NULL)
     886         144 :         pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
     887         144 :     if (ptpages != NULL)
     888         144 :         pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
     889         144 :     if (ptchunks != NULL)
     890          12 :         pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
     891             : 
     892             :     /* Initialize the iterator lock */
     893         144 :     LWLockInitialize(&istate->lock, LWTRANCHE_SHARED_TIDBITMAP);
     894             : 
     895             :     /* Initialize the shared iterator state */
     896         144 :     istate->schunkbit = 0;
     897         144 :     istate->schunkptr = 0;
     898         144 :     istate->spageptr = 0;
     899             : 
     900         144 :     tbm->iterating = TBM_ITERATING_SHARED;
     901             : 
     902         144 :     return dp;
     903             : }
     904             : 
     905             : /*
     906             :  * tbm_extract_page_tuple - extract the tuple offsets from a page
     907             :  *
     908             :  * The extracted offsets are stored into TBMIterateResult.
     909             :  */
     910             : static inline int
     911      467698 : tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
     912             : {
     913             :     int         wordnum;
     914      467698 :     int         ntuples = 0;
     915             : 
     916     2806188 :     for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
     917             :     {
     918     2338490 :         bitmapword  w = page->words[wordnum];
     919             : 
     920     2338490 :         if (w != 0)
     921             :         {
     922     1101474 :             int         off = wordnum * BITS_PER_BITMAPWORD + 1;
     923             : 
     924    47826650 :             while (w != 0)
     925             :             {
     926    46725176 :                 if (w & 1)
     927    11338780 :                     output->offsets[ntuples++] = (OffsetNumber) off;
     928    46725176 :                 off++;
     929    46725176 :                 w >>= 1;
     930             :             }
     931             :         }
     932             :     }
     933             : 
     934      467698 :     return ntuples;
     935             : }
     936             : 
     937             : /*
     938             :  *  tbm_advance_schunkbit - Advance the schunkbit
     939             :  */
     940             : static inline void
     941      332680 : tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
     942             : {
     943      332680 :     int         schunkbit = *schunkbitp;
     944             : 
     945     1526448 :     while (schunkbit < PAGES_PER_CHUNK)
     946             :     {
     947     1520556 :         int         wordnum = WORDNUM(schunkbit);
     948     1520556 :         int         bitnum = BITNUM(schunkbit);
     949             : 
     950     1520556 :         if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
     951      326788 :             break;
     952     1193768 :         schunkbit++;
     953             :     }
     954             : 
     955      332680 :     *schunkbitp = schunkbit;
     956      332680 : }
     957             : 
     958             : /*
     959             :  * tbm_iterate - scan through next page of a TIDBitmap
     960             :  *
     961             :  * Returns a TBMIterateResult representing one page, or NULL if there are
     962             :  * no more pages to scan.  Pages are guaranteed to be delivered in numerical
     963             :  * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
     964             :  * remember the exact tuples to look at on this page --- the caller must
     965             :  * examine all tuples on the page and check if they meet the intended
     966             :  * condition.  If result->recheck is true, only the indicated tuples need
     967             :  * be examined, but the condition must be rechecked anyway.  (For ease of
     968             :  * testing, recheck is always set true when ntuples < 0.)
     969             :  */
     970             : TBMIterateResult *
     971      752908 : tbm_iterate(TBMIterator *iterator)
     972             : {
     973      752908 :     TIDBitmap  *tbm = iterator->tbm;
     974      752908 :     TBMIterateResult *output = &(iterator->output);
     975             : 
     976             :     Assert(tbm->iterating == TBM_ITERATING_PRIVATE);
     977             : 
     978             :     /*
     979             :      * If lossy chunk pages remain, make sure we've advanced schunkptr/
     980             :      * schunkbit to the next set bit.
     981             :      */
     982      758752 :     while (iterator->schunkptr < tbm->nchunks)
     983             :     {
     984      320380 :         PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
     985      320380 :         int         schunkbit = iterator->schunkbit;
     986             : 
     987      320380 :         tbm_advance_schunkbit(chunk, &schunkbit);
     988      320380 :         if (schunkbit < PAGES_PER_CHUNK)
     989             :         {
     990      314536 :             iterator->schunkbit = schunkbit;
     991      314536 :             break;
     992             :         }
     993             :         /* advance to next chunk */
     994        5844 :         iterator->schunkptr++;
     995        5844 :         iterator->schunkbit = 0;
     996             :     }
     997             : 
     998             :     /*
     999             :      * If both chunk and per-page data remain, must output the numerically
    1000             :      * earlier page.
    1001             :      */
    1002      752908 :     if (iterator->schunkptr < tbm->nchunks)
    1003             :     {
    1004      314536 :         PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
    1005             :         BlockNumber chunk_blockno;
    1006             : 
    1007      314536 :         chunk_blockno = chunk->blockno + iterator->schunkbit;
    1008      314536 :         if (iterator->spageptr >= tbm->npages ||
    1009       18576 :             chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
    1010             :         {
    1011             :             /* Return a lossy page indicator from the chunk */
    1012      308380 :             output->blockno = chunk_blockno;
    1013      308380 :             output->ntuples = -1;
    1014      308380 :             output->recheck = true;
    1015      308380 :             iterator->schunkbit++;
    1016      308380 :             return output;
    1017             :         }
    1018             :     }
    1019             : 
    1020      444528 :     if (iterator->spageptr < tbm->npages)
    1021             :     {
    1022             :         PagetableEntry *page;
    1023             :         int         ntuples;
    1024             : 
    1025             :         /* In TBM_ONE_PAGE state, we don't allocate an spages[] array */
    1026      413686 :         if (tbm->status == TBM_ONE_PAGE)
    1027       12246 :             page = &tbm->entry1;
    1028             :         else
    1029      401440 :             page = tbm->spages[iterator->spageptr];
    1030             : 
    1031             :         /* scan bitmap to extract individual offset numbers */
    1032      413686 :         ntuples = tbm_extract_page_tuple(page, output);
    1033      413686 :         output->blockno = page->blockno;
    1034      413686 :         output->ntuples = ntuples;
    1035      413686 :         output->recheck = page->recheck;
    1036      413686 :         iterator->spageptr++;
    1037      413686 :         return output;
    1038             :     }
    1039             : 
    1040             :     /* Nothing more in the bitmap */
    1041       30842 :     return NULL;
    1042             : }
    1043             : 
    1044             : /*
    1045             :  *  tbm_shared_iterate - scan through next page of a TIDBitmap
    1046             :  *
    1047             :  *  As above, but this will iterate using an iterator which is shared
    1048             :  *  across multiple processes.  We need to acquire the iterator LWLock,
    1049             :  *  before accessing the shared members.
    1050             :  */
    1051             : TBMIterateResult *
    1052       60628 : tbm_shared_iterate(TBMSharedIterator *iterator)
    1053             : {
    1054       60628 :     TBMIterateResult *output = &iterator->output;
    1055       60628 :     TBMSharedIteratorState *istate = iterator->state;
    1056       60628 :     PagetableEntry *ptbase = NULL;
    1057       60628 :     int        *idxpages = NULL;
    1058       60628 :     int        *idxchunks = NULL;
    1059             : 
    1060       60628 :     if (iterator->ptbase != NULL)
    1061       60628 :         ptbase = iterator->ptbase->ptentry;
    1062       60628 :     if (iterator->ptpages != NULL)
    1063       60628 :         idxpages = iterator->ptpages->index;
    1064       60628 :     if (iterator->ptchunks != NULL)
    1065       14858 :         idxchunks = iterator->ptchunks->index;
    1066             : 
    1067             :     /* Acquire the LWLock before accessing the shared members */
    1068       60628 :     LWLockAcquire(&istate->lock, LW_EXCLUSIVE);
    1069             : 
    1070             :     /*
    1071             :      * If lossy chunk pages remain, make sure we've advanced schunkptr/
    1072             :      * schunkbit to the next set bit.
    1073             :      */
    1074       60676 :     while (istate->schunkptr < istate->nchunks)
    1075             :     {
    1076       12300 :         PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
    1077       12300 :         int         schunkbit = istate->schunkbit;
    1078             : 
    1079       12300 :         tbm_advance_schunkbit(chunk, &schunkbit);
    1080       12300 :         if (schunkbit < PAGES_PER_CHUNK)
    1081             :         {
    1082       12252 :             istate->schunkbit = schunkbit;
    1083       12252 :             break;
    1084             :         }
    1085             :         /* advance to next chunk */
    1086          48 :         istate->schunkptr++;
    1087          48 :         istate->schunkbit = 0;
    1088             :     }
    1089             : 
    1090             :     /*
    1091             :      * If both chunk and per-page data remain, must output the numerically
    1092             :      * earlier page.
    1093             :      */
    1094       60628 :     if (istate->schunkptr < istate->nchunks)
    1095             :     {
    1096       12252 :         PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
    1097             :         BlockNumber chunk_blockno;
    1098             : 
    1099       12252 :         chunk_blockno = chunk->blockno + istate->schunkbit;
    1100             : 
    1101       12252 :         if (istate->spageptr >= istate->npages ||
    1102       12252 :             chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
    1103             :         {
    1104             :             /* Return a lossy page indicator from the chunk */
    1105        6204 :             output->blockno = chunk_blockno;
    1106        6204 :             output->ntuples = -1;
    1107        6204 :             output->recheck = true;
    1108        6204 :             istate->schunkbit++;
    1109             : 
    1110        6204 :             LWLockRelease(&istate->lock);
    1111        6204 :             return output;
    1112             :         }
    1113             :     }
    1114             : 
    1115       54424 :     if (istate->spageptr < istate->npages)
    1116             :     {
    1117       54012 :         PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
    1118             :         int         ntuples;
    1119             : 
    1120             :         /* scan bitmap to extract individual offset numbers */
    1121       54012 :         ntuples = tbm_extract_page_tuple(page, output);
    1122       54012 :         output->blockno = page->blockno;
    1123       54012 :         output->ntuples = ntuples;
    1124       54012 :         output->recheck = page->recheck;
    1125       54012 :         istate->spageptr++;
    1126             : 
    1127       54012 :         LWLockRelease(&istate->lock);
    1128             : 
    1129       54012 :         return output;
    1130             :     }
    1131             : 
    1132         412 :     LWLockRelease(&istate->lock);
    1133             : 
    1134             :     /* Nothing more in the bitmap */
    1135         412 :     return NULL;
    1136             : }
    1137             : 
    1138             : /*
    1139             :  * tbm_end_iterate - finish an iteration over a TIDBitmap
    1140             :  *
    1141             :  * Currently this is just a pfree, but it might do more someday.  (For
    1142             :  * instance, it could be useful to count open iterators and allow the
    1143             :  * bitmap to return to read/write status when there are no more iterators.)
    1144             :  */
    1145             : void
    1146       34770 : tbm_end_iterate(TBMIterator *iterator)
    1147             : {
    1148       34770 :     pfree(iterator);
    1149       34770 : }
    1150             : 
    1151             : /*
    1152             :  * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
    1153             :  *
    1154             :  * This doesn't free any of the shared state associated with the iterator,
    1155             :  * just our backend-private state.
    1156             :  */
    1157             : void
    1158         676 : tbm_end_shared_iterate(TBMSharedIterator *iterator)
    1159             : {
    1160         676 :     pfree(iterator);
    1161         676 : }
    1162             : 
    1163             : /*
    1164             :  * tbm_find_pageentry - find a PagetableEntry for the pageno
    1165             :  *
    1166             :  * Returns NULL if there is no non-lossy entry for the pageno.
    1167             :  */
    1168             : static const PagetableEntry *
    1169        5080 : tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
    1170             : {
    1171             :     const PagetableEntry *page;
    1172             : 
    1173        5080 :     if (tbm->nentries == 0)      /* in case pagetable doesn't exist */
    1174           0 :         return NULL;
    1175             : 
    1176        5080 :     if (tbm->status == TBM_ONE_PAGE)
    1177             :     {
    1178           0 :         page = &tbm->entry1;
    1179           0 :         if (page->blockno != pageno)
    1180           0 :             return NULL;
    1181             :         Assert(!page->ischunk);
    1182           0 :         return page;
    1183             :     }
    1184             : 
    1185        5080 :     page = pagetable_lookup(tbm->pagetable, pageno);
    1186        5080 :     if (page == NULL)
    1187         524 :         return NULL;
    1188        4556 :     if (page->ischunk)
    1189           0 :         return NULL;            /* don't want a lossy chunk header */
    1190        4556 :     return page;
    1191             : }
    1192             : 
    1193             : /*
    1194             :  * tbm_get_pageentry - find or create a PagetableEntry for the pageno
    1195             :  *
    1196             :  * If new, the entry is marked as an exact (non-chunk) entry.
    1197             :  *
    1198             :  * This may cause the table to exceed the desired memory size.  It is
    1199             :  * up to the caller to call tbm_lossify() at the next safe point if so.
    1200             :  */
    1201             : static PagetableEntry *
    1202     7178116 : tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
    1203             : {
    1204             :     PagetableEntry *page;
    1205             :     bool        found;
    1206             : 
    1207     7178116 :     if (tbm->status == TBM_EMPTY)
    1208             :     {
    1209             :         /* Use the fixed slot */
    1210       11534 :         page = &tbm->entry1;
    1211       11534 :         found = false;
    1212       11534 :         tbm->status = TBM_ONE_PAGE;
    1213             :     }
    1214             :     else
    1215             :     {
    1216     7166582 :         if (tbm->status == TBM_ONE_PAGE)
    1217             :         {
    1218       86802 :             page = &tbm->entry1;
    1219       86802 :             if (page->blockno == pageno)
    1220       81516 :                 return page;
    1221             :             /* Time to switch from one page to a hashtable */
    1222        5286 :             tbm_create_pagetable(tbm);
    1223             :         }
    1224             : 
    1225             :         /* Look up or create an entry */
    1226     7085066 :         page = pagetable_insert(tbm->pagetable, pageno, &found);
    1227             :     }
    1228             : 
    1229             :     /* Initialize it if not present before */
    1230     7096600 :     if (!found)
    1231             :     {
    1232      259954 :         char        oldstatus = page->status;
    1233             : 
    1234     1819678 :         MemSet(page, 0, sizeof(PagetableEntry));
    1235      259954 :         page->status = oldstatus;
    1236      259954 :         page->blockno = pageno;
    1237             :         /* must count it too */
    1238      259954 :         tbm->nentries++;
    1239      259954 :         tbm->npages++;
    1240             :     }
    1241             : 
    1242     7096600 :     return page;
    1243             : }
    1244             : 
    1245             : /*
    1246             :  * tbm_page_is_lossy - is the page marked as lossily stored?
    1247             :  */
    1248             : static bool
    1249     7188908 : tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
    1250             : {
    1251             :     PagetableEntry *page;
    1252             :     BlockNumber chunk_pageno;
    1253             :     int         bitno;
    1254             : 
    1255             :     /* we can skip the lookup if there are no lossy chunks */
    1256     7188908 :     if (tbm->nchunks == 0)
    1257     7068278 :         return false;
    1258             :     Assert(tbm->status == TBM_HASH);
    1259             : 
    1260      120630 :     bitno = pageno % PAGES_PER_CHUNK;
    1261      120630 :     chunk_pageno = pageno - bitno;
    1262             : 
    1263      120630 :     page = pagetable_lookup(tbm->pagetable, chunk_pageno);
    1264             : 
    1265      120630 :     if (page != NULL && page->ischunk)
    1266             :     {
    1267       12432 :         int         wordnum = WORDNUM(bitno);
    1268       12432 :         int         bitnum = BITNUM(bitno);
    1269             : 
    1270       12432 :         if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
    1271        5712 :             return true;
    1272             :     }
    1273      114918 :     return false;
    1274             : }
    1275             : 
    1276             : /*
    1277             :  * tbm_mark_page_lossy - mark the page number as lossily stored
    1278             :  *
    1279             :  * This may cause the table to exceed the desired memory size.  It is
    1280             :  * up to the caller to call tbm_lossify() at the next safe point if so.
    1281             :  */
    1282             : static void
    1283      160292 : tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
    1284             : {
    1285             :     PagetableEntry *page;
    1286             :     bool        found;
    1287             :     BlockNumber chunk_pageno;
    1288             :     int         bitno;
    1289             :     int         wordnum;
    1290             :     int         bitnum;
    1291             : 
    1292             :     /* We force the bitmap into hashtable mode whenever it's lossy */
    1293      160292 :     if (tbm->status != TBM_HASH)
    1294        2868 :         tbm_create_pagetable(tbm);
    1295             : 
    1296      160292 :     bitno = pageno % PAGES_PER_CHUNK;
    1297      160292 :     chunk_pageno = pageno - bitno;
    1298             : 
    1299             :     /*
    1300             :      * Remove any extant non-lossy entry for the page.  If the page is its own
    1301             :      * chunk header, however, we skip this and handle the case below.
    1302             :      */
    1303      160292 :     if (bitno != 0)
    1304             :     {
    1305      157964 :         if (pagetable_delete(tbm->pagetable, pageno))
    1306             :         {
    1307             :             /* It was present, so adjust counts */
    1308       12312 :             tbm->nentries--;
    1309       12312 :             tbm->npages--;       /* assume it must have been non-lossy */
    1310             :         }
    1311             :     }
    1312             : 
    1313             :     /* Look up or create entry for chunk-header page */
    1314      160292 :     page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
    1315             : 
    1316             :     /* Initialize it if not present before */
    1317      160292 :     if (!found)
    1318             :     {
    1319        2868 :         char        oldstatus = page->status;
    1320             : 
    1321       20076 :         MemSet(page, 0, sizeof(PagetableEntry));
    1322        2868 :         page->status = oldstatus;
    1323        2868 :         page->blockno = chunk_pageno;
    1324        2868 :         page->ischunk = true;
    1325             :         /* must count it too */
    1326        2868 :         tbm->nentries++;
    1327        2868 :         tbm->nchunks++;
    1328             :     }
    1329      157424 :     else if (!page->ischunk)
    1330             :     {
    1331         102 :         char        oldstatus = page->status;
    1332             : 
    1333             :         /* chunk header page was formerly non-lossy, make it lossy */
    1334         714 :         MemSet(page, 0, sizeof(PagetableEntry));
    1335         102 :         page->status = oldstatus;
    1336         102 :         page->blockno = chunk_pageno;
    1337         102 :         page->ischunk = true;
    1338             :         /* we assume it had some tuple bit(s) set, so mark it lossy */
    1339         102 :         page->words[0] = ((bitmapword) 1 << 0);
    1340             :         /* adjust counts */
    1341         102 :         tbm->nchunks++;
    1342         102 :         tbm->npages--;
    1343             :     }
    1344             : 
    1345             :     /* Now set the original target page's bit */
    1346      160292 :     wordnum = WORDNUM(bitno);
    1347      160292 :     bitnum = BITNUM(bitno);
    1348      160292 :     page->words[wordnum] |= ((bitmapword) 1 << bitnum);
    1349      160292 : }
    1350             : 
    1351             : /*
    1352             :  * tbm_lossify - lose some information to get back under the memory limit
    1353             :  */
    1354             : static void
    1355          24 : tbm_lossify(TIDBitmap *tbm)
    1356             : {
    1357             :     pagetable_iterator i;
    1358             :     PagetableEntry *page;
    1359             : 
    1360             :     /*
    1361             :      * XXX Really stupid implementation: this just lossifies pages in
    1362             :      * essentially random order.  We should be paying some attention to the
    1363             :      * number of bits set in each page, instead.
    1364             :      *
    1365             :      * Since we are called as soon as nentries exceeds maxentries, we should
    1366             :      * push nentries down to significantly less than maxentries, or else we'll
    1367             :      * just end up doing this again very soon.  We shoot for maxentries/2.
    1368             :      */
    1369             :     Assert(tbm->iterating == TBM_NOT_ITERATING);
    1370             :     Assert(tbm->status == TBM_HASH);
    1371             : 
    1372          24 :     pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
    1373       12330 :     while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
    1374             :     {
    1375       12330 :         if (page->ischunk)
    1376           0 :             continue;           /* already a chunk header */
    1377             : 
    1378             :         /*
    1379             :          * If the page would become a chunk header, we won't save anything by
    1380             :          * converting it to lossy, so skip it.
    1381             :          */
    1382       12330 :         if ((page->blockno % PAGES_PER_CHUNK) == 0)
    1383          18 :             continue;
    1384             : 
    1385             :         /* This does the dirty work ... */
    1386       12312 :         tbm_mark_page_lossy(tbm, page->blockno);
    1387             : 
    1388       12312 :         if (tbm->nentries <= tbm->maxentries / 2)
    1389             :         {
    1390             :             /*
    1391             :              * We have made enough room. Remember where to start lossifying
    1392             :              * next round, so we evenly iterate over the hashtable.
    1393             :              */
    1394          24 :             tbm->lossify_start = i.cur;
    1395          24 :             break;
    1396             :         }
    1397             : 
    1398             :         /*
    1399             :          * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
    1400             :          * hashtable and may have deleted the non-lossy chunk.  We can
    1401             :          * continue the same hash table scan, since failure to visit one
    1402             :          * element or visiting the newly inserted element, isn't fatal.
    1403             :          */
    1404             :     }
    1405             : 
    1406             :     /*
    1407             :      * With a big bitmap and small work_mem, it's possible that we cannot get
    1408             :      * under maxentries.  Again, if that happens, we'd end up uselessly
    1409             :      * calling tbm_lossify over and over.  To prevent this from becoming a
    1410             :      * performance sink, force maxentries up to at least double the current
    1411             :      * number of entries.  (In essence, we're admitting inability to fit
    1412             :      * within work_mem when we do this.)  Note that this test will not fire if
    1413             :      * we broke out of the loop early; and if we didn't, the current number of
    1414             :      * entries is simply not reducible any further.
    1415             :      */
    1416          24 :     if (tbm->nentries > tbm->maxentries / 2)
    1417           0 :         tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
    1418          24 : }
    1419             : 
    1420             : /*
    1421             :  * qsort comparator to handle PagetableEntry pointers.
    1422             :  */
    1423             : static int
    1424     1369668 : tbm_comparator(const void *left, const void *right)
    1425             : {
    1426     1369668 :     BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
    1427     1369668 :     BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
    1428             : 
    1429     1369668 :     return pg_cmp_u32(l, r);
    1430             : }
    1431             : 
    1432             : /*
    1433             :  * As above, but this will get index into PagetableEntry array.  Therefore,
    1434             :  * it needs to get actual PagetableEntry using the index before comparing the
    1435             :  * blockno.
    1436             :  */
    1437             : static int
    1438      238290 : tbm_shared_comparator(const void *left, const void *right, void *arg)
    1439             : {
    1440      238290 :     PagetableEntry *base = (PagetableEntry *) arg;
    1441      238290 :     PagetableEntry *lpage = &base[*(int *) left];
    1442      238290 :     PagetableEntry *rpage = &base[*(int *) right];
    1443             : 
    1444      238290 :     if (lpage->blockno < rpage->blockno)
    1445      108966 :         return -1;
    1446      129324 :     else if (lpage->blockno > rpage->blockno)
    1447      129324 :         return 1;
    1448           0 :     return 0;
    1449             : }
    1450             : 
    1451             : /*
    1452             :  *  tbm_attach_shared_iterate
    1453             :  *
    1454             :  *  Allocate a backend-private iterator and attach the shared iterator state
    1455             :  *  to it so that multiple processed can iterate jointly.
    1456             :  *
    1457             :  *  We also converts the DSA pointers to local pointers and store them into
    1458             :  *  our private iterator.
    1459             :  */
    1460             : TBMSharedIterator *
    1461         676 : tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
    1462             : {
    1463             :     TBMSharedIterator *iterator;
    1464             :     TBMSharedIteratorState *istate;
    1465             : 
    1466             :     /*
    1467             :      * Create the TBMSharedIterator struct, with enough trailing space to
    1468             :      * serve the needs of the TBMIterateResult sub-struct.
    1469             :      */
    1470         676 :     iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
    1471             :                                              MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
    1472             : 
    1473         676 :     istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
    1474             : 
    1475         676 :     iterator->state = istate;
    1476             : 
    1477         676 :     iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
    1478             : 
    1479         676 :     if (istate->npages)
    1480         676 :         iterator->ptpages = dsa_get_address(dsa, istate->spages);
    1481         676 :     if (istate->nchunks)
    1482          60 :         iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
    1483             : 
    1484         676 :     return iterator;
    1485             : }
    1486             : 
    1487             : /*
    1488             :  * pagetable_allocate
    1489             :  *
    1490             :  * Callback function for allocating the memory for hashtable elements.
    1491             :  * Allocate memory for hashtable elements, using DSA if available.
    1492             :  */
    1493             : static inline void *
    1494        8510 : pagetable_allocate(pagetable_hash *pagetable, Size size)
    1495             : {
    1496        8510 :     TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
    1497             :     PTEntryArray *ptbase;
    1498             : 
    1499        8510 :     if (tbm->dsa == NULL)
    1500        8354 :         return MemoryContextAllocExtended(pagetable->ctx, size,
    1501             :                                           MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
    1502             : 
    1503             :     /*
    1504             :      * Save the dsapagetable reference in dsapagetableold before allocating
    1505             :      * new memory so that pagetable_free can free the old entry.
    1506             :      */
    1507         156 :     tbm->dsapagetableold = tbm->dsapagetable;
    1508         156 :     tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
    1509             :                                               sizeof(PTEntryArray) + size,
    1510             :                                               DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
    1511         156 :     ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
    1512             : 
    1513         156 :     return ptbase->ptentry;
    1514             : }
    1515             : 
    1516             : /*
    1517             :  * pagetable_free
    1518             :  *
    1519             :  * Callback function for freeing hash table elements.
    1520             :  */
    1521             : static inline void
    1522        8510 : pagetable_free(pagetable_hash *pagetable, void *pointer)
    1523             : {
    1524        8510 :     TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
    1525             : 
    1526             :     /* pfree the input pointer if DSA is not available */
    1527        8510 :     if (tbm->dsa == NULL)
    1528        8354 :         pfree(pointer);
    1529         156 :     else if (DsaPointerIsValid(tbm->dsapagetableold))
    1530             :     {
    1531          84 :         dsa_free(tbm->dsa, tbm->dsapagetableold);
    1532          84 :         tbm->dsapagetableold = InvalidDsaPointer;
    1533             :     }
    1534        8510 : }
    1535             : 
    1536             : /*
    1537             :  * tbm_calculate_entries
    1538             :  *
    1539             :  * Estimate number of hashtable entries we can have within maxbytes.
    1540             :  */
    1541             : long
    1542      573672 : tbm_calculate_entries(double maxbytes)
    1543             : {
    1544             :     long        nbuckets;
    1545             : 
    1546             :     /*
    1547             :      * Estimate number of hashtable entries we can have within maxbytes. This
    1548             :      * estimates the hash cost as sizeof(PagetableEntry), which is good enough
    1549             :      * for our purpose.  Also count an extra Pointer per entry for the arrays
    1550             :      * created during iteration readout.
    1551             :      */
    1552      573672 :     nbuckets = maxbytes /
    1553             :         (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
    1554      573672 :     nbuckets = Min(nbuckets, INT_MAX - 1);  /* safety limit */
    1555      573672 :     nbuckets = Max(nbuckets, 16);   /* sanity limit */
    1556             : 
    1557      573672 :     return nbuckets;
    1558             : }

Generated by: LCOV version 1.14