LCOV - code coverage report
Current view: top level - src/backend/nodes - tidbitmap.c (source / functions) Hit Total Coverage
Test: PostgreSQL 12beta2 Lines: 427 486 87.9 %
Date: 2019-06-19 14:06:47 Functions: 28 30 93.3 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * tidbitmap.c
       4             :  *    PostgreSQL tuple-id (TID) bitmap package
       5             :  *
       6             :  * This module provides bitmap data structures that are spiritually
       7             :  * similar to Bitmapsets, but are specially adapted to store sets of
       8             :  * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
       9             :  * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
      10             :  * Also, since we wish to be able to store very large tuple sets in
      11             :  * memory with this data structure, we support "lossy" storage, in which
      12             :  * we no longer remember individual tuple offsets on a page but only the
      13             :  * fact that a particular page needs to be visited.
      14             :  *
      15             :  * The "lossy" storage uses one bit per disk page, so at the standard 8K
      16             :  * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
      17             :  * of memory.  People pushing around tables of that size should have a
      18             :  * couple of Mb to spare, so we don't worry about providing a second level
      19             :  * of lossiness.  In theory we could fall back to page ranges at some
      20             :  * point, but for now that seems useless complexity.
      21             :  *
      22             :  * We also support the notion of candidate matches, or rechecking.  This
      23             :  * means we know that a search need visit only some tuples on a page,
      24             :  * but we are not certain that all of those tuples are real matches.
      25             :  * So the eventual heap scan must recheck the quals for these tuples only,
      26             :  * rather than rechecking the quals for all tuples on the page as in the
      27             :  * lossy-bitmap case.  Rechecking can be specified when TIDs are inserted
      28             :  * into a bitmap, and it can also happen internally when we AND a lossy
      29             :  * and a non-lossy page.
      30             :  *
      31             :  *
      32             :  * Copyright (c) 2003-2019, PostgreSQL Global Development Group
      33             :  *
      34             :  * IDENTIFICATION
      35             :  *    src/backend/nodes/tidbitmap.c
      36             :  *
      37             :  *-------------------------------------------------------------------------
      38             :  */
      39             : #include "postgres.h"
      40             : 
      41             : #include <limits.h>
      42             : 
      43             : #include "access/htup_details.h"
      44             : #include "nodes/bitmapset.h"
      45             : #include "nodes/tidbitmap.h"
      46             : #include "storage/lwlock.h"
      47             : #include "utils/dsa.h"
      48             : #include "utils/hashutils.h"
      49             : 
      50             : /*
      51             :  * The maximum number of tuples per page is not large (typically 256 with
      52             :  * 8K pages, or 1024 with 32K pages).  So there's not much point in making
      53             :  * the per-page bitmaps variable size.  We just legislate that the size
      54             :  * is this:
      55             :  */
      56             : #define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage
      57             : 
      58             : /*
      59             :  * When we have to switch over to lossy storage, we use a data structure
      60             :  * with one bit per page, where all pages having the same number DIV
      61             :  * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
      62             :  * and has the bit set for a given page, there must not be a per-page entry
      63             :  * for that page in the page table.
      64             :  *
      65             :  * We actually store both exact pages and lossy chunks in the same hash
      66             :  * table, using identical data structures.  (This is because the memory
      67             :  * management for hashtables doesn't easily/efficiently allow space to be
      68             :  * transferred easily from one hashtable to another.)  Therefore it's best
      69             :  * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
      70             :  * too different.  But we also want PAGES_PER_CHUNK to be a power of 2 to
      71             :  * avoid expensive integer remainder operations.  So, define it like this:
      72             :  */
      73             : #define PAGES_PER_CHUNK  (BLCKSZ / 32)
      74             : 
      75             : /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
      76             : 
      77             : #define WORDNUM(x)  ((x) / BITS_PER_BITMAPWORD)
      78             : #define BITNUM(x)   ((x) % BITS_PER_BITMAPWORD)
      79             : 
      80             : /* number of active words for an exact page: */
      81             : #define WORDS_PER_PAGE  ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
      82             : /* number of active words for a lossy chunk: */
      83             : #define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
      84             : 
      85             : /*
      86             :  * The hashtable entries are represented by this data structure.  For
      87             :  * an exact page, blockno is the page number and bit k of the bitmap
      88             :  * represents tuple offset k+1.  For a lossy chunk, blockno is the first
      89             :  * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
      90             :  * bit k represents page blockno+k.  Note that it is not possible to
      91             :  * have exact storage for the first page of a chunk if we are using
      92             :  * lossy storage for any page in the chunk's range, since the same
      93             :  * hashtable entry has to serve both purposes.
      94             :  *
      95             :  * recheck is used only on exact pages --- it indicates that although
      96             :  * only the stated tuples need be checked, the full index qual condition
      97             :  * must be checked for each (ie, these are candidate matches).
      98             :  */
      99             : typedef struct PagetableEntry
     100             : {
     101             :     BlockNumber blockno;        /* page number (hashtable key) */
     102             :     char        status;         /* hash entry status */
     103             :     bool        ischunk;        /* T = lossy storage, F = exact */
     104             :     bool        recheck;        /* should the tuples be rechecked? */
     105             :     bitmapword  words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
     106             : } PagetableEntry;
     107             : 
     108             : /*
     109             :  * Holds array of pagetable entries.
     110             :  */
     111             : typedef struct PTEntryArray
     112             : {
     113             :     pg_atomic_uint32 refcount;  /* no. of iterator attached */
     114             :     PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
     115             : } PTEntryArray;
     116             : 
     117             : /*
     118             :  * We want to avoid the overhead of creating the hashtable, which is
     119             :  * comparatively large, when not necessary. Particularly when we are using a
     120             :  * bitmap scan on the inside of a nestloop join: a bitmap may well live only
     121             :  * long enough to accumulate one entry in such cases.  We therefore avoid
     122             :  * creating an actual hashtable until we need two pagetable entries.  When
     123             :  * just one pagetable entry is needed, we store it in a fixed field of
     124             :  * TIDBitMap.  (NOTE: we don't get rid of the hashtable if the bitmap later
     125             :  * shrinks down to zero or one page again.  So, status can be TBM_HASH even
     126             :  * when nentries is zero or one.)
     127             :  */
     128             : typedef enum
     129             : {
     130             :     TBM_EMPTY,                  /* no hashtable, nentries == 0 */
     131             :     TBM_ONE_PAGE,               /* entry1 contains the single entry */
     132             :     TBM_HASH                    /* pagetable is valid, entry1 is not */
     133             : } TBMStatus;
     134             : 
     135             : /*
     136             :  * Current iterating state of the TBM.
     137             :  */
     138             : typedef enum
     139             : {
     140             :     TBM_NOT_ITERATING,          /* not yet converted to page and chunk array */
     141             :     TBM_ITERATING_PRIVATE,      /* converted to local page and chunk array */
     142             :     TBM_ITERATING_SHARED        /* converted to shared page and chunk array */
     143             : } TBMIteratingState;
     144             : 
     145             : /*
     146             :  * Here is the representation for a whole TIDBitMap:
     147             :  */
     148             : struct TIDBitmap
     149             : {
     150             :     NodeTag     type;           /* to make it a valid Node */
     151             :     MemoryContext mcxt;         /* memory context containing me */
     152             :     TBMStatus   status;         /* see codes above */
     153             :     struct pagetable_hash *pagetable;   /* hash table of PagetableEntry's */
     154             :     int         nentries;       /* number of entries in pagetable */
     155             :     int         maxentries;     /* limit on same to meet maxbytes */
     156             :     int         npages;         /* number of exact entries in pagetable */
     157             :     int         nchunks;        /* number of lossy entries in pagetable */
     158             :     TBMIteratingState iterating;    /* tbm_begin_iterate called? */
     159             :     uint32      lossify_start;  /* offset to start lossifying hashtable at */
     160             :     PagetableEntry entry1;      /* used when status == TBM_ONE_PAGE */
     161             :     /* these are valid when iterating is true: */
     162             :     PagetableEntry **spages;    /* sorted exact-page list, or NULL */
     163             :     PagetableEntry **schunks;   /* sorted lossy-chunk list, or NULL */
     164             :     dsa_pointer dsapagetable;   /* dsa_pointer to the element array */
     165             :     dsa_pointer dsapagetableold;    /* dsa_pointer to the old element array */
     166             :     dsa_pointer ptpages;        /* dsa_pointer to the page array */
     167             :     dsa_pointer ptchunks;       /* dsa_pointer to the chunk array */
     168             :     dsa_area   *dsa;            /* reference to per-query dsa area */
     169             : };
     170             : 
     171             : /*
     172             :  * When iterating over a bitmap in sorted order, a TBMIterator is used to
     173             :  * track our progress.  There can be several iterators scanning the same
     174             :  * bitmap concurrently.  Note that the bitmap becomes read-only as soon as
     175             :  * any iterator is created.
     176             :  */
     177             : struct TBMIterator
     178             : {
     179             :     TIDBitmap  *tbm;            /* TIDBitmap we're iterating over */
     180             :     int         spageptr;       /* next spages index */
     181             :     int         schunkptr;      /* next schunks index */
     182             :     int         schunkbit;      /* next bit to check in current schunk */
     183             :     TBMIterateResult output;    /* MUST BE LAST (because variable-size) */
     184             : };
     185             : 
     186             : /*
     187             :  * Holds the shared members of the iterator so that multiple processes
     188             :  * can jointly iterate.
     189             :  */
     190             : typedef struct TBMSharedIteratorState
     191             : {
     192             :     int         nentries;       /* number of entries in pagetable */
     193             :     int         maxentries;     /* limit on same to meet maxbytes */
     194             :     int         npages;         /* number of exact entries in pagetable */
     195             :     int         nchunks;        /* number of lossy entries in pagetable */
     196             :     dsa_pointer pagetable;      /* dsa pointers to head of pagetable data */
     197             :     dsa_pointer spages;         /* dsa pointer to page array */
     198             :     dsa_pointer schunks;        /* dsa pointer to chunk array */
     199             :     LWLock      lock;           /* lock to protect below members */
     200             :     int         spageptr;       /* next spages index */
     201             :     int         schunkptr;      /* next schunks index */
     202             :     int         schunkbit;      /* next bit to check in current schunk */
     203             : } TBMSharedIteratorState;
     204             : 
     205             : /*
     206             :  * pagetable iteration array.
     207             :  */
     208             : typedef struct PTIterationArray
     209             : {
     210             :     pg_atomic_uint32 refcount;  /* no. of iterator attached */
     211             :     int         index[FLEXIBLE_ARRAY_MEMBER];   /* index array */
     212             : } PTIterationArray;
     213             : 
     214             : /*
     215             :  * same as TBMIterator, but it is used for joint iteration, therefore this
     216             :  * also holds a reference to the shared state.
     217             :  */
     218             : struct TBMSharedIterator
     219             : {
     220             :     TBMSharedIteratorState *state;  /* shared state */
     221             :     PTEntryArray *ptbase;       /* pagetable element array */
     222             :     PTIterationArray *ptpages;  /* sorted exact page index list */
     223             :     PTIterationArray *ptchunks; /* sorted lossy page index list */
     224             :     TBMIterateResult output;    /* MUST BE LAST (because variable-size) */
     225             : };
     226             : 
     227             : /* Local function prototypes */
     228             : static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
     229             : static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
     230             :                                const TIDBitmap *b);
     231             : static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
     232             :                                                 BlockNumber pageno);
     233             : static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
     234             : static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
     235             : static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
     236             : static void tbm_lossify(TIDBitmap *tbm);
     237             : static int  tbm_comparator(const void *left, const void *right);
     238             : static int  tbm_shared_comparator(const void *left, const void *right,
     239             :                                   void *arg);
     240             : 
     241             : /* define hashtable mapping block numbers to PagetableEntry's */
     242             : #define SH_USE_NONDEFAULT_ALLOCATOR
     243             : #define SH_PREFIX pagetable
     244             : #define SH_ELEMENT_TYPE PagetableEntry
     245             : #define SH_KEY_TYPE BlockNumber
     246             : #define SH_KEY blockno
     247             : #define SH_HASH_KEY(tb, key) murmurhash32(key)
     248             : #define SH_EQUAL(tb, a, b) a == b
     249             : #define SH_SCOPE static inline
     250             : #define SH_DEFINE
     251             : #define SH_DECLARE
     252             : #include "lib/simplehash.h"
     253             : 
     254             : 
     255             : /*
     256             :  * tbm_create - create an initially-empty bitmap
     257             :  *
     258             :  * The bitmap will live in the memory context that is CurrentMemoryContext
     259             :  * at the time of this call.  It will be limited to (approximately) maxbytes
     260             :  * total memory consumption.  If the DSA passed to this function is not NULL
     261             :  * then the memory for storing elements of the underlying page table will
     262             :  * be allocated from the DSA.
     263             :  */
     264             : TIDBitmap *
     265       14114 : tbm_create(long maxbytes, dsa_area *dsa)
     266             : {
     267             :     TIDBitmap  *tbm;
     268             : 
     269             :     /* Create the TIDBitmap struct and zero all its fields */
     270       14114 :     tbm = makeNode(TIDBitmap);
     271             : 
     272       14114 :     tbm->mcxt = CurrentMemoryContext;
     273       14114 :     tbm->status = TBM_EMPTY;
     274             : 
     275       14114 :     tbm->maxentries = (int) tbm_calculate_entries(maxbytes);
     276       14114 :     tbm->lossify_start = 0;
     277       14114 :     tbm->dsa = dsa;
     278       14114 :     tbm->dsapagetable = InvalidDsaPointer;
     279       14114 :     tbm->dsapagetableold = InvalidDsaPointer;
     280       14114 :     tbm->ptpages = InvalidDsaPointer;
     281       14114 :     tbm->ptchunks = InvalidDsaPointer;
     282             : 
     283       14114 :     return tbm;
     284             : }
     285             : 
     286             : /*
     287             :  * Actually create the hashtable.  Since this is a moderately expensive
     288             :  * proposition, we don't do it until we have to.
     289             :  */
     290             : static void
     291        2844 : tbm_create_pagetable(TIDBitmap *tbm)
     292             : {
     293             :     Assert(tbm->status != TBM_HASH);
     294             :     Assert(tbm->pagetable == NULL);
     295             : 
     296        2844 :     tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
     297             : 
     298             :     /* If entry1 is valid, push it into the hashtable */
     299        2844 :     if (tbm->status == TBM_ONE_PAGE)
     300             :     {
     301             :         PagetableEntry *page;
     302             :         bool        found;
     303             :         char        oldstatus;
     304             : 
     305        1852 :         page = pagetable_insert(tbm->pagetable,
     306             :                                 tbm->entry1.blockno,
     307             :                                 &found);
     308             :         Assert(!found);
     309        1852 :         oldstatus = page->status;
     310        1852 :         memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
     311        1852 :         page->status = oldstatus;
     312             :     }
     313             : 
     314        2844 :     tbm->status = TBM_HASH;
     315        2844 : }
     316             : 
     317             : /*
     318             :  * tbm_free - free a TIDBitmap
     319             :  */
     320             : void
     321       14072 : tbm_free(TIDBitmap *tbm)
     322             : {
     323       14072 :     if (tbm->pagetable)
     324        2844 :         pagetable_destroy(tbm->pagetable);
     325       14072 :     if (tbm->spages)
     326        1772 :         pfree(tbm->spages);
     327       14072 :     if (tbm->schunks)
     328        1000 :         pfree(tbm->schunks);
     329       14072 :     pfree(tbm);
     330       14072 : }
     331             : 
     332             : /*
     333             :  * tbm_free_shared_area - free shared state
     334             :  *
     335             :  * Free shared iterator state, Also free shared pagetable and iterator arrays
     336             :  * memory if they are not referred by any of the shared iterator i.e recount
     337             :  * is becomes 0.
     338             :  */
     339             : void
     340          72 : tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
     341             : {
     342          72 :     TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
     343             :     PTEntryArray *ptbase;
     344             :     PTIterationArray *ptpages;
     345             :     PTIterationArray *ptchunks;
     346             : 
     347          72 :     if (DsaPointerIsValid(istate->pagetable))
     348             :     {
     349          72 :         ptbase = dsa_get_address(dsa, istate->pagetable);
     350          72 :         if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
     351          36 :             dsa_free(dsa, istate->pagetable);
     352             :     }
     353          72 :     if (DsaPointerIsValid(istate->spages))
     354             :     {
     355          72 :         ptpages = dsa_get_address(dsa, istate->spages);
     356          72 :         if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
     357          36 :             dsa_free(dsa, istate->spages);
     358             :     }
     359          72 :     if (DsaPointerIsValid(istate->schunks))
     360             :     {
     361           0 :         ptchunks = dsa_get_address(dsa, istate->schunks);
     362           0 :         if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
     363           0 :             dsa_free(dsa, istate->schunks);
     364             :     }
     365             : 
     366          72 :     dsa_free(dsa, dp);
     367          72 : }
     368             : 
     369             : /*
     370             :  * tbm_add_tuples - add some tuple IDs to a TIDBitmap
     371             :  *
     372             :  * If recheck is true, then the recheck flag will be set in the
     373             :  * TBMIterateResult when any of these tuples are reported out.
     374             :  */
     375             : void
     376     2290924 : tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
     377             :                bool recheck)
     378             : {
     379     2290924 :     BlockNumber currblk = InvalidBlockNumber;
     380     2290924 :     PagetableEntry *page = NULL;    /* only valid when currblk is valid */
     381             :     int         i;
     382             : 
     383             :     Assert(tbm->iterating == TBM_NOT_ITERATING);
     384     5084560 :     for (i = 0; i < ntids; i++)
     385             :     {
     386     2793636 :         BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
     387     2793636 :         OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
     388             :         int         wordnum,
     389             :                     bitnum;
     390             : 
     391             :         /* safety check to ensure we don't overrun bit array bounds */
     392     2793636 :         if (off < 1 || off > MAX_TUPLES_PER_PAGE)
     393           0 :             elog(ERROR, "tuple offset out of range: %u", off);
     394             : 
     395             :         /*
     396             :          * Look up target page unless we already did.  This saves cycles when
     397             :          * the input includes consecutive tuples on the same page, which is
     398             :          * common enough to justify an extra test here.
     399             :          */
     400     2793636 :         if (blk != currblk)
     401             :         {
     402     2443676 :             if (tbm_page_is_lossy(tbm, blk))
     403        1904 :                 page = NULL;    /* remember page is lossy */
     404             :             else
     405     2441772 :                 page = tbm_get_pageentry(tbm, blk);
     406     2443676 :             currblk = blk;
     407             :         }
     408             : 
     409     2793636 :         if (page == NULL)
     410        1904 :             continue;           /* whole page is already marked */
     411             : 
     412     2791732 :         if (page->ischunk)
     413             :         {
     414             :             /* The page is a lossy chunk header, set bit for itself */
     415           0 :             wordnum = bitnum = 0;
     416             :         }
     417             :         else
     418             :         {
     419             :             /* Page is exact, so set bit for individual tuple */
     420     2791732 :             wordnum = WORDNUM(off - 1);
     421     2791732 :             bitnum = BITNUM(off - 1);
     422             :         }
     423     2791732 :         page->words[wordnum] |= ((bitmapword) 1 << bitnum);
     424     2791732 :         page->recheck |= recheck;
     425             : 
     426     2791732 :         if (tbm->nentries > tbm->maxentries)
     427             :         {
     428          16 :             tbm_lossify(tbm);
     429             :             /* Page could have been converted to lossy, so force new lookup */
     430          16 :             currblk = InvalidBlockNumber;
     431             :         }
     432             :     }
     433     2290924 : }
     434             : 
     435             : /*
     436             :  * tbm_add_page - add a whole page to a TIDBitmap
     437             :  *
     438             :  * This causes the whole page to be reported (with the recheck flag)
     439             :  * when the TIDBitmap is scanned.
     440             :  */
     441             : void
     442       73908 : tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
     443             : {
     444             :     /* Enter the page in the bitmap, or mark it lossy if already present */
     445       73908 :     tbm_mark_page_lossy(tbm, pageno);
     446             :     /* If we went over the memory limit, lossify some more pages */
     447       73908 :     if (tbm->nentries > tbm->maxentries)
     448           0 :         tbm_lossify(tbm);
     449       73908 : }
     450             : 
     451             : /*
     452             :  * tbm_union - set union
     453             :  *
     454             :  * a is modified in-place, b is not changed
     455             :  */
     456             : void
     457           0 : tbm_union(TIDBitmap *a, const TIDBitmap *b)
     458             : {
     459             :     Assert(!a->iterating);
     460             :     /* Nothing to do if b is empty */
     461           0 :     if (b->nentries == 0)
     462           0 :         return;
     463             :     /* Scan through chunks and pages in b, merge into a */
     464           0 :     if (b->status == TBM_ONE_PAGE)
     465           0 :         tbm_union_page(a, &b->entry1);
     466             :     else
     467             :     {
     468             :         pagetable_iterator i;
     469             :         PagetableEntry *bpage;
     470             : 
     471             :         Assert(b->status == TBM_HASH);
     472           0 :         pagetable_start_iterate(b->pagetable, &i);
     473           0 :         while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
     474           0 :             tbm_union_page(a, bpage);
     475             :     }
     476             : }
     477             : 
     478             : /* Process one page of b during a union op */
     479             : static void
     480           0 : tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
     481             : {
     482             :     PagetableEntry *apage;
     483             :     int         wordnum;
     484             : 
     485           0 :     if (bpage->ischunk)
     486             :     {
     487             :         /* Scan b's chunk, mark each indicated page lossy in a */
     488           0 :         for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
     489             :         {
     490           0 :             bitmapword  w = bpage->words[wordnum];
     491             : 
     492           0 :             if (w != 0)
     493             :             {
     494             :                 BlockNumber pg;
     495             : 
     496           0 :                 pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
     497           0 :                 while (w != 0)
     498             :                 {
     499           0 :                     if (w & 1)
     500           0 :                         tbm_mark_page_lossy(a, pg);
     501           0 :                     pg++;
     502           0 :                     w >>= 1;
     503             :                 }
     504             :             }
     505             :         }
     506             :     }
     507           0 :     else if (tbm_page_is_lossy(a, bpage->blockno))
     508             :     {
     509             :         /* page is already lossy in a, nothing to do */
     510           0 :         return;
     511             :     }
     512             :     else
     513             :     {
     514           0 :         apage = tbm_get_pageentry(a, bpage->blockno);
     515           0 :         if (apage->ischunk)
     516             :         {
     517             :             /* The page is a lossy chunk header, set bit for itself */
     518           0 :             apage->words[0] |= ((bitmapword) 1 << 0);
     519             :         }
     520             :         else
     521             :         {
     522             :             /* Both pages are exact, merge at the bit level */
     523           0 :             for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
     524           0 :                 apage->words[wordnum] |= bpage->words[wordnum];
     525           0 :             apage->recheck |= bpage->recheck;
     526             :         }
     527             :     }
     528             : 
     529           0 :     if (a->nentries > a->maxentries)
     530           0 :         tbm_lossify(a);
     531             : }
     532             : 
     533             : /*
     534             :  * tbm_intersect - set intersection
     535             :  *
     536             :  * a is modified in-place, b is not changed
     537             :  */
     538             : void
     539          36 : tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
     540             : {
     541             :     Assert(!a->iterating);
     542             :     /* Nothing to do if a is empty */
     543          36 :     if (a->nentries == 0)
     544           0 :         return;
     545             :     /* Scan through chunks and pages in a, try to match to b */
     546          36 :     if (a->status == TBM_ONE_PAGE)
     547             :     {
     548          24 :         if (tbm_intersect_page(a, &a->entry1, b))
     549             :         {
     550             :             /* Page is now empty, remove it from a */
     551             :             Assert(!a->entry1.ischunk);
     552           0 :             a->npages--;
     553           0 :             a->nentries--;
     554             :             Assert(a->nentries == 0);
     555           0 :             a->status = TBM_EMPTY;
     556             :         }
     557             :     }
     558             :     else
     559             :     {
     560             :         pagetable_iterator i;
     561             :         PagetableEntry *apage;
     562             : 
     563             :         Assert(a->status == TBM_HASH);
     564          12 :         pagetable_start_iterate(a->pagetable, &i);
     565          12 :         while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
     566             :         {
     567        3068 :             if (tbm_intersect_page(a, apage, b))
     568             :             {
     569             :                 /* Page or chunk is now empty, remove it from a */
     570        2952 :                 if (apage->ischunk)
     571           0 :                     a->nchunks--;
     572             :                 else
     573        2952 :                     a->npages--;
     574        2952 :                 a->nentries--;
     575        2952 :                 if (!pagetable_delete(a->pagetable, apage->blockno))
     576           0 :                     elog(ERROR, "hash table corrupted");
     577             :             }
     578             :         }
     579             :     }
     580             : }
     581             : 
     582             : /*
     583             :  * Process one page of a during an intersection op
     584             :  *
     585             :  * Returns true if apage is now empty and should be deleted from a
     586             :  */
     587             : static bool
     588        3092 : tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
     589             : {
     590             :     const PagetableEntry *bpage;
     591             :     int         wordnum;
     592             : 
     593        3092 :     if (apage->ischunk)
     594             :     {
     595             :         /* Scan each bit in chunk, try to clear */
     596          20 :         bool        candelete = true;
     597             : 
     598         100 :         for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
     599             :         {
     600          80 :             bitmapword  w = apage->words[wordnum];
     601             : 
     602          80 :             if (w != 0)
     603             :             {
     604          72 :                 bitmapword  neww = w;
     605             :                 BlockNumber pg;
     606             :                 int         bitnum;
     607             : 
     608          72 :                 pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
     609          72 :                 bitnum = 0;
     610        4476 :                 while (w != 0)
     611             :                 {
     612        4332 :                     if (w & 1)
     613             :                     {
     614        2240 :                         if (!tbm_page_is_lossy(b, pg) &&
     615         168 :                             tbm_find_pageentry(b, pg) == NULL)
     616             :                         {
     617             :                             /* Page is not in b at all, lose lossy bit */
     618           0 :                             neww &= ~((bitmapword) 1 << bitnum);
     619             :                         }
     620             :                     }
     621        4332 :                     pg++;
     622        4332 :                     bitnum++;
     623        4332 :                     w >>= 1;
     624             :                 }
     625          72 :                 apage->words[wordnum] = neww;
     626          72 :                 if (neww != 0)
     627          72 :                     candelete = false;
     628             :             }
     629             :         }
     630          20 :         return candelete;
     631             :     }
     632        3072 :     else if (tbm_page_is_lossy(b, apage->blockno))
     633             :     {
     634             :         /*
     635             :          * Some of the tuples in 'a' might not satisfy the quals for 'b', but
     636             :          * because the page 'b' is lossy, we don't know which ones. Therefore
     637             :          * we mark 'a' as requiring rechecks, to indicate that at most those
     638             :          * tuples set in 'a' are matches.
     639             :          */
     640           0 :         apage->recheck = true;
     641           0 :         return false;
     642             :     }
     643             :     else
     644             :     {
     645        3072 :         bool        candelete = true;
     646             : 
     647        3072 :         bpage = tbm_find_pageentry(b, apage->blockno);
     648        3072 :         if (bpage != NULL)
     649             :         {
     650             :             /* Both pages are exact, merge at the bit level */
     651             :             Assert(!bpage->ischunk);
     652       16584 :             for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
     653             :             {
     654       13820 :                 apage->words[wordnum] &= bpage->words[wordnum];
     655       13820 :                 if (apage->words[wordnum] != 0)
     656         120 :                     candelete = false;
     657             :             }
     658        2764 :             apage->recheck |= bpage->recheck;
     659             :         }
     660             :         /* If there is no matching b page, we can just delete the a page */
     661        3072 :         return candelete;
     662             :     }
     663             : }
     664             : 
     665             : /*
     666             :  * tbm_is_empty - is a TIDBitmap completely empty?
     667             :  */
     668             : bool
     669         396 : tbm_is_empty(const TIDBitmap *tbm)
     670             : {
     671         396 :     return (tbm->nentries == 0);
     672             : }
     673             : 
     674             : /*
     675             :  * tbm_begin_iterate - prepare to iterate through a TIDBitmap
     676             :  *
     677             :  * The TBMIterator struct is created in the caller's memory context.
     678             :  * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
     679             :  * okay to just allow the memory context to be released, too.  It is caller's
     680             :  * responsibility not to touch the TBMIterator anymore once the TIDBitmap
     681             :  * is freed.
     682             :  *
     683             :  * NB: after this is called, it is no longer allowed to modify the contents
     684             :  * of the bitmap.  However, you can call this multiple times to scan the
     685             :  * contents repeatedly, including parallel scans.
     686             :  */
     687             : TBMIterator *
     688       27736 : tbm_begin_iterate(TIDBitmap *tbm)
     689             : {
     690             :     TBMIterator *iterator;
     691             : 
     692             :     Assert(tbm->iterating != TBM_ITERATING_SHARED);
     693             : 
     694             :     /*
     695             :      * Create the TBMIterator struct, with enough trailing space to serve the
     696             :      * needs of the TBMIterateResult sub-struct.
     697             :      */
     698       27736 :     iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
     699             :                                       MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
     700       27736 :     iterator->tbm = tbm;
     701             : 
     702             :     /*
     703             :      * Initialize iteration pointers.
     704             :      */
     705       27736 :     iterator->spageptr = 0;
     706       27736 :     iterator->schunkptr = 0;
     707       27736 :     iterator->schunkbit = 0;
     708             : 
     709             :     /*
     710             :      * If we have a hashtable, create and fill the sorted page lists, unless
     711             :      * we already did that for a previous iterator.  Note that the lists are
     712             :      * attached to the bitmap not the iterator, so they can be used by more
     713             :      * than one iterator.
     714             :      */
     715       27736 :     if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
     716             :     {
     717             :         pagetable_iterator i;
     718             :         PagetableEntry *page;
     719             :         int         npages;
     720             :         int         nchunks;
     721             : 
     722        2764 :         if (!tbm->spages && tbm->npages > 0)
     723        1772 :             tbm->spages = (PagetableEntry **)
     724        1772 :                 MemoryContextAlloc(tbm->mcxt,
     725        1772 :                                    tbm->npages * sizeof(PagetableEntry *));
     726        2764 :         if (!tbm->schunks && tbm->nchunks > 0)
     727        1000 :             tbm->schunks = (PagetableEntry **)
     728        1000 :                 MemoryContextAlloc(tbm->mcxt,
     729        1000 :                                    tbm->nchunks * sizeof(PagetableEntry *));
     730             : 
     731        2764 :         npages = nchunks = 0;
     732        2764 :         pagetable_start_iterate(tbm->pagetable, &i);
     733       40728 :         while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
     734             :         {
     735       35200 :             if (page->ischunk)
     736        1028 :                 tbm->schunks[nchunks++] = page;
     737             :             else
     738       34172 :                 tbm->spages[npages++] = page;
     739             :         }
     740             :         Assert(npages == tbm->npages);
     741             :         Assert(nchunks == tbm->nchunks);
     742        2764 :         if (npages > 1)
     743        1772 :             qsort(tbm->spages, npages, sizeof(PagetableEntry *),
     744             :                   tbm_comparator);
     745        2764 :         if (nchunks > 1)
     746           8 :             qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
     747             :                   tbm_comparator);
     748             :     }
     749             : 
     750       27736 :     tbm->iterating = TBM_ITERATING_PRIVATE;
     751             : 
     752       27736 :     return iterator;
     753             : }
     754             : 
     755             : /*
     756             :  * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
     757             :  *
     758             :  * The necessary shared state will be allocated from the DSA passed to
     759             :  * tbm_create, so that multiple processes can attach to it and iterate jointly.
     760             :  *
     761             :  * This will convert the pagetable hash into page and chunk array of the index
     762             :  * into pagetable array.
     763             :  */
     764             : dsa_pointer
     765          88 : tbm_prepare_shared_iterate(TIDBitmap *tbm)
     766             : {
     767             :     dsa_pointer dp;
     768             :     TBMSharedIteratorState *istate;
     769          88 :     PTEntryArray *ptbase = NULL;
     770          88 :     PTIterationArray *ptpages = NULL;
     771          88 :     PTIterationArray *ptchunks = NULL;
     772             : 
     773             :     Assert(tbm->dsa != NULL);
     774             :     Assert(tbm->iterating != TBM_ITERATING_PRIVATE);
     775             : 
     776             :     /*
     777             :      * Allocate TBMSharedIteratorState from DSA to hold the shared members and
     778             :      * lock, this will also be used by multiple worker for shared iterate.
     779             :      */
     780          88 :     dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
     781          88 :     istate = dsa_get_address(tbm->dsa, dp);
     782             : 
     783             :     /*
     784             :      * If we're not already iterating, create and fill the sorted page lists.
     785             :      * (If we are, the sorted page lists are already stored in the TIDBitmap,
     786             :      * and we can just reuse them.)
     787             :      */
     788          88 :     if (tbm->iterating == TBM_NOT_ITERATING)
     789             :     {
     790             :         pagetable_iterator i;
     791             :         PagetableEntry *page;
     792             :         int         idx;
     793             :         int         npages;
     794             :         int         nchunks;
     795             : 
     796             :         /*
     797             :          * Allocate the page and chunk array memory from the DSA to share
     798             :          * across multiple processes.
     799             :          */
     800          44 :         if (tbm->npages)
     801             :         {
     802          44 :             tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
     803             :                                         tbm->npages * sizeof(int));
     804          44 :             ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
     805          44 :             pg_atomic_init_u32(&ptpages->refcount, 0);
     806             :         }
     807          44 :         if (tbm->nchunks)
     808             :         {
     809           4 :             tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
     810             :                                          tbm->nchunks * sizeof(int));
     811           4 :             ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
     812           4 :             pg_atomic_init_u32(&ptchunks->refcount, 0);
     813             :         }
     814             : 
     815             :         /*
     816             :          * If TBM status is TBM_HASH then iterate over the pagetable and
     817             :          * convert it to page and chunk arrays.  But if it's in the
     818             :          * TBM_ONE_PAGE mode then directly allocate the space for one entry
     819             :          * from the DSA.
     820             :          */
     821          44 :         npages = nchunks = 0;
     822          44 :         if (tbm->status == TBM_HASH)
     823             :         {
     824          44 :             ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
     825             : 
     826          44 :             pagetable_start_iterate(tbm->pagetable, &i);
     827       16776 :             while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
     828             :             {
     829       16688 :                 idx = page - ptbase->ptentry;
     830       16688 :                 if (page->ischunk)
     831          16 :                     ptchunks->index[nchunks++] = idx;
     832             :                 else
     833       16672 :                     ptpages->index[npages++] = idx;
     834             :             }
     835             : 
     836             :             Assert(npages == tbm->npages);
     837             :             Assert(nchunks == tbm->nchunks);
     838             :         }
     839           0 :         else if (tbm->status == TBM_ONE_PAGE)
     840             :         {
     841             :             /*
     842             :              * In one page mode allocate the space for one pagetable entry,
     843             :              * initialize it, and directly store its index (i.e. 0) in the
     844             :              * page array.
     845             :              */
     846           0 :             tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
     847             :                                              sizeof(PagetableEntry));
     848           0 :             ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
     849           0 :             memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
     850           0 :             ptpages->index[0] = 0;
     851             :         }
     852             : 
     853          44 :         if (ptbase != NULL)
     854          44 :             pg_atomic_init_u32(&ptbase->refcount, 0);
     855          44 :         if (npages > 1)
     856          44 :             qsort_arg((void *) (ptpages->index), npages, sizeof(int),
     857          44 :                       tbm_shared_comparator, (void *) ptbase->ptentry);
     858          44 :         if (nchunks > 1)
     859           4 :             qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
     860           4 :                       tbm_shared_comparator, (void *) ptbase->ptentry);
     861             :     }
     862             : 
     863             :     /*
     864             :      * Store the TBM members in the shared state so that we can share them
     865             :      * across multiple processes.
     866             :      */
     867          88 :     istate->nentries = tbm->nentries;
     868          88 :     istate->maxentries = tbm->maxentries;
     869          88 :     istate->npages = tbm->npages;
     870          88 :     istate->nchunks = tbm->nchunks;
     871          88 :     istate->pagetable = tbm->dsapagetable;
     872          88 :     istate->spages = tbm->ptpages;
     873          88 :     istate->schunks = tbm->ptchunks;
     874             : 
     875          88 :     ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
     876          88 :     ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
     877          88 :     ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
     878             : 
     879             :     /*
     880             :      * For every shared iterator, referring to pagetable and iterator array,
     881             :      * increase the refcount by 1 so that while freeing the shared iterator we
     882             :      * don't free pagetable and iterator array until its refcount becomes 0.
     883             :      */
     884          88 :     if (ptbase != NULL)
     885          88 :         pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
     886          88 :     if (ptpages != NULL)
     887          88 :         pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
     888          88 :     if (ptchunks != NULL)
     889           8 :         pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
     890             : 
     891             :     /* Initialize the iterator lock */
     892          88 :     LWLockInitialize(&istate->lock, LWTRANCHE_TBM);
     893             : 
     894             :     /* Initialize the shared iterator state */
     895          88 :     istate->schunkbit = 0;
     896          88 :     istate->schunkptr = 0;
     897          88 :     istate->spageptr = 0;
     898             : 
     899          88 :     tbm->iterating = TBM_ITERATING_SHARED;
     900             : 
     901          88 :     return dp;
     902             : }
     903             : 
     904             : /*
     905             :  * tbm_extract_page_tuple - extract the tuple offsets from a page
     906             :  *
     907             :  * The extracted offsets are stored into TBMIterateResult.
     908             :  */
     909             : static inline int
     910      109088 : tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
     911             : {
     912             :     int         wordnum;
     913      109088 :     int         ntuples = 0;
     914             : 
     915      654528 :     for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
     916             :     {
     917      545440 :         bitmapword  w = page->words[wordnum];
     918             : 
     919      545440 :         if (w != 0)
     920             :         {
     921      153488 :             int         off = wordnum * BITS_PER_BITMAPWORD + 1;
     922             : 
     923     5935682 :             while (w != 0)
     924             :             {
     925     5628706 :                 if (w & 1)
     926     3776082 :                     output->offsets[ntuples++] = (OffsetNumber) off;
     927     5628706 :                 off++;
     928     5628706 :                 w >>= 1;
     929             :             }
     930             :         }
     931             :     }
     932             : 
     933      109088 :     return ntuples;
     934             : }
     935             : 
     936             : /*
     937             :  *  tbm_advance_schunkbit - Advance the chunkbit
     938             :  */
     939             : static inline void
     940      170456 : tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
     941             : {
     942      170456 :     int         schunkbit = *schunkbitp;
     943             : 
     944      715208 :     while (schunkbit < PAGES_PER_CHUNK)
     945             :     {
     946      542664 :         int         wordnum = WORDNUM(schunkbit);
     947      542664 :         int         bitnum = BITNUM(schunkbit);
     948             : 
     949      542664 :         if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
     950      168368 :             break;
     951      374296 :         schunkbit++;
     952             :     }
     953             : 
     954      170456 :     *schunkbitp = schunkbit;
     955      170456 : }
     956             : 
     957             : /*
     958             :  * tbm_iterate - scan through next page of a TIDBitmap
     959             :  *
     960             :  * Returns a TBMIterateResult representing one page, or NULL if there are
     961             :  * no more pages to scan.  Pages are guaranteed to be delivered in numerical
     962             :  * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
     963             :  * remember the exact tuples to look at on this page --- the caller must
     964             :  * examine all tuples on the page and check if they meet the intended
     965             :  * condition.  If result->recheck is true, only the indicated tuples need
     966             :  * be examined, but the condition must be rechecked anyway.  (For ease of
     967             :  * testing, recheck is always set true when ntuples < 0.)
     968             :  */
     969             : TBMIterateResult *
     970      253662 : tbm_iterate(TBMIterator *iterator)
     971             : {
     972      253662 :     TIDBitmap  *tbm = iterator->tbm;
     973      253662 :     TBMIterateResult *output = &(iterator->output);
     974             : 
     975             :     Assert(tbm->iterating == TBM_ITERATING_PRIVATE);
     976             : 
     977             :     /*
     978             :      * If lossy chunk pages remain, make sure we've advanced schunkptr/
     979             :      * schunkbit to the next set bit.
     980             :      */
     981      509380 :     while (iterator->schunkptr < tbm->nchunks)
     982             :     {
     983      162256 :         PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
     984      162256 :         int         schunkbit = iterator->schunkbit;
     985             : 
     986      162256 :         tbm_advance_schunkbit(chunk, &schunkbit);
     987      162256 :         if (schunkbit < PAGES_PER_CHUNK)
     988             :         {
     989      160200 :             iterator->schunkbit = schunkbit;
     990      160200 :             break;
     991             :         }
     992             :         /* advance to next chunk */
     993        2056 :         iterator->schunkptr++;
     994        2056 :         iterator->schunkbit = 0;
     995             :     }
     996             : 
     997             :     /*
     998             :      * If both chunk and per-page data remain, must output the numerically
     999             :      * earlier page.
    1000             :      */
    1001      253662 :     if (iterator->schunkptr < tbm->nchunks)
    1002             :     {
    1003      160200 :         PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
    1004             :         BlockNumber chunk_blockno;
    1005             : 
    1006      160200 :         chunk_blockno = chunk->blockno + iterator->schunkbit;
    1007      172584 :         if (iterator->spageptr >= tbm->npages ||
    1008       12384 :             chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
    1009             :         {
    1010             :             /* Return a lossy page indicator from the chunk */
    1011      156096 :             output->blockno = chunk_blockno;
    1012      156096 :             output->ntuples = -1;
    1013      156096 :             output->recheck = true;
    1014      156096 :             iterator->schunkbit++;
    1015      156096 :             return output;
    1016             :         }
    1017             :     }
    1018             : 
    1019       97566 :     if (iterator->spageptr < tbm->npages)
    1020             :     {
    1021             :         PagetableEntry *page;
    1022             :         int         ntuples;
    1023             : 
    1024             :         /* In ONE_PAGE state, we don't allocate an spages[] array */
    1025       75744 :         if (tbm->status == TBM_ONE_PAGE)
    1026       11204 :             page = &tbm->entry1;
    1027             :         else
    1028       64540 :             page = tbm->spages[iterator->spageptr];
    1029             : 
    1030             :         /* scan bitmap to extract individual offset numbers */
    1031       75744 :         ntuples = tbm_extract_page_tuple(page, output);
    1032       75744 :         output->blockno = page->blockno;
    1033       75744 :         output->ntuples = ntuples;
    1034       75744 :         output->recheck = page->recheck;
    1035       75744 :         iterator->spageptr++;
    1036       75744 :         return output;
    1037             :     }
    1038             : 
    1039             :     /* Nothing more in the bitmap */
    1040       21822 :     return NULL;
    1041             : }
    1042             : 
    1043             : /*
    1044             :  *  tbm_shared_iterate - scan through next page of a TIDBitmap
    1045             :  *
    1046             :  *  As above, but this will iterate using an iterator which is shared
    1047             :  *  across multiple processes.  We need to acquire the iterator LWLock,
    1048             :  *  before accessing the shared members.
    1049             :  */
    1050             : TBMIterateResult *
    1051       37744 : tbm_shared_iterate(TBMSharedIterator *iterator)
    1052             : {
    1053       37744 :     TBMIterateResult *output = &iterator->output;
    1054       37744 :     TBMSharedIteratorState *istate = iterator->state;
    1055       37744 :     PagetableEntry *ptbase = NULL;
    1056       37744 :     int        *idxpages = NULL;
    1057       37744 :     int        *idxchunks = NULL;
    1058             : 
    1059       37744 :     if (iterator->ptbase != NULL)
    1060       37744 :         ptbase = iterator->ptbase->ptentry;
    1061       37744 :     if (iterator->ptpages != NULL)
    1062       37744 :         idxpages = iterator->ptpages->index;
    1063       37744 :     if (iterator->ptchunks != NULL)
    1064        9904 :         idxchunks = iterator->ptchunks->index;
    1065             : 
    1066             :     /* Acquire the LWLock before accessing the shared members */
    1067       37744 :     LWLockAcquire(&istate->lock, LW_EXCLUSIVE);
    1068             : 
    1069             :     /*
    1070             :      * If lossy chunk pages remain, make sure we've advanced schunkptr/
    1071             :      * schunkbit to the next set bit.
    1072             :      */
    1073       75520 :     while (istate->schunkptr < istate->nchunks)
    1074             :     {
    1075        8200 :         PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
    1076        8200 :         int         schunkbit = istate->schunkbit;
    1077             : 
    1078        8200 :         tbm_advance_schunkbit(chunk, &schunkbit);
    1079        8200 :         if (schunkbit < PAGES_PER_CHUNK)
    1080             :         {
    1081        8168 :             istate->schunkbit = schunkbit;
    1082        8168 :             break;
    1083             :         }
    1084             :         /* advance to next chunk */
    1085          32 :         istate->schunkptr++;
    1086          32 :         istate->schunkbit = 0;
    1087             :     }
    1088             : 
    1089             :     /*
    1090             :      * If both chunk and per-page data remain, must output the numerically
    1091             :      * earlier page.
    1092             :      */
    1093       37744 :     if (istate->schunkptr < istate->nchunks)
    1094             :     {
    1095        8168 :         PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
    1096             :         BlockNumber chunk_blockno;
    1097             : 
    1098        8168 :         chunk_blockno = chunk->blockno + istate->schunkbit;
    1099             : 
    1100       16336 :         if (istate->spageptr >= istate->npages ||
    1101        8168 :             chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
    1102             :         {
    1103             :             /* Return a lossy page indicator from the chunk */
    1104        4136 :             output->blockno = chunk_blockno;
    1105        4136 :             output->ntuples = -1;
    1106        4136 :             output->recheck = true;
    1107        4136 :             istate->schunkbit++;
    1108             : 
    1109        4136 :             LWLockRelease(&istate->lock);
    1110        4136 :             return output;
    1111             :         }
    1112             :     }
    1113             : 
    1114       33608 :     if (istate->spageptr < istate->npages)
    1115             :     {
    1116       33344 :         PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
    1117             :         int         ntuples;
    1118             : 
    1119             :         /* scan bitmap to extract individual offset numbers */
    1120       33344 :         ntuples = tbm_extract_page_tuple(page, output);
    1121       33344 :         output->blockno = page->blockno;
    1122       33344 :         output->ntuples = ntuples;
    1123       33344 :         output->recheck = page->recheck;
    1124       33344 :         istate->spageptr++;
    1125             : 
    1126       33344 :         LWLockRelease(&istate->lock);
    1127             : 
    1128       33344 :         return output;
    1129             :     }
    1130             : 
    1131         264 :     LWLockRelease(&istate->lock);
    1132             : 
    1133             :     /* Nothing more in the bitmap */
    1134         264 :     return NULL;
    1135             : }
    1136             : 
    1137             : /*
    1138             :  * tbm_end_iterate - finish an iteration over a TIDBitmap
    1139             :  *
    1140             :  * Currently this is just a pfree, but it might do more someday.  (For
    1141             :  * instance, it could be useful to count open iterators and allow the
    1142             :  * bitmap to return to read/write status when there are no more iterators.)
    1143             :  */
    1144             : void
    1145       27684 : tbm_end_iterate(TBMIterator *iterator)
    1146             : {
    1147       27684 :     pfree(iterator);
    1148       27684 : }
    1149             : 
    1150             : /*
    1151             :  * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
    1152             :  *
    1153             :  * This doesn't free any of the shared state associated with the iterator,
    1154             :  * just our backend-private state.
    1155             :  */
    1156             : void
    1157         440 : tbm_end_shared_iterate(TBMSharedIterator *iterator)
    1158             : {
    1159         440 :     pfree(iterator);
    1160         440 : }
    1161             : 
    1162             : /*
    1163             :  * tbm_find_pageentry - find a PagetableEntry for the pageno
    1164             :  *
    1165             :  * Returns NULL if there is no non-lossy entry for the pageno.
    1166             :  */
    1167             : static const PagetableEntry *
    1168        3240 : tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
    1169             : {
    1170             :     const PagetableEntry *page;
    1171             : 
    1172        3240 :     if (tbm->nentries == 0)      /* in case pagetable doesn't exist */
    1173           0 :         return NULL;
    1174             : 
    1175        3240 :     if (tbm->status == TBM_ONE_PAGE)
    1176             :     {
    1177           0 :         page = &tbm->entry1;
    1178           0 :         if (page->blockno != pageno)
    1179           0 :             return NULL;
    1180             :         Assert(!page->ischunk);
    1181           0 :         return page;
    1182             :     }
    1183             : 
    1184        3240 :     page = pagetable_lookup(tbm->pagetable, pageno);
    1185        3240 :     if (page == NULL)
    1186         308 :         return NULL;
    1187        2932 :     if (page->ischunk)
    1188           0 :         return NULL;            /* don't want a lossy chunk header */
    1189        2932 :     return page;
    1190             : }
    1191             : 
    1192             : /*
    1193             :  * tbm_get_pageentry - find or create a PagetableEntry for the pageno
    1194             :  *
    1195             :  * If new, the entry is marked as an exact (non-chunk) entry.
    1196             :  *
    1197             :  * This may cause the table to exceed the desired memory size.  It is
    1198             :  * up to the caller to call tbm_lossify() at the next safe point if so.
    1199             :  */
    1200             : static PagetableEntry *
    1201     2441772 : tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
    1202             : {
    1203             :     PagetableEntry *page;
    1204             :     bool        found;
    1205             : 
    1206     2441772 :     if (tbm->status == TBM_EMPTY)
    1207             :     {
    1208             :         /* Use the fixed slot */
    1209        7574 :         page = &tbm->entry1;
    1210        7574 :         found = false;
    1211        7574 :         tbm->status = TBM_ONE_PAGE;
    1212             :     }
    1213             :     else
    1214             :     {
    1215     2434198 :         if (tbm->status == TBM_ONE_PAGE)
    1216             :         {
    1217       31782 :             page = &tbm->entry1;
    1218       31782 :             if (page->blockno == pageno)
    1219       29930 :                 return page;
    1220             :             /* Time to switch from one page to a hashtable */
    1221        1852 :             tbm_create_pagetable(tbm);
    1222             :         }
    1223             : 
    1224             :         /* Look up or create an entry */
    1225     2404268 :         page = pagetable_insert(tbm->pagetable, pageno, &found);
    1226             :     }
    1227             : 
    1228             :     /* Initialize it if not present before */
    1229     2411842 :     if (!found)
    1230             :     {
    1231       70966 :         char        oldstatus = page->status;
    1232             : 
    1233       70966 :         MemSet(page, 0, sizeof(PagetableEntry));
    1234       70966 :         page->status = oldstatus;
    1235       70966 :         page->blockno = pageno;
    1236             :         /* must count it too */
    1237       70966 :         tbm->nentries++;
    1238       70966 :         tbm->npages++;
    1239             :     }
    1240             : 
    1241     2411842 :     return page;
    1242             : }
    1243             : 
    1244             : /*
    1245             :  * tbm_page_is_lossy - is the page marked as lossily stored?
    1246             :  */
    1247             : static bool
    1248     2448820 : tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
    1249             : {
    1250             :     PagetableEntry *page;
    1251             :     BlockNumber chunk_pageno;
    1252             :     int         bitno;
    1253             : 
    1254             :     /* we can skip the lookup if there are no lossy chunks */
    1255     2448820 :     if (tbm->nchunks == 0)
    1256     2368400 :         return false;
    1257             :     Assert(tbm->status == TBM_HASH);
    1258             : 
    1259       80420 :     bitno = pageno % PAGES_PER_CHUNK;
    1260       80420 :     chunk_pageno = pageno - bitno;
    1261             : 
    1262       80420 :     page = pagetable_lookup(tbm->pagetable, chunk_pageno);
    1263             : 
    1264       80420 :     if (page != NULL && page->ischunk)
    1265             :     {
    1266        8288 :         int         wordnum = WORDNUM(bitno);
    1267        8288 :         int         bitnum = BITNUM(bitno);
    1268             : 
    1269        8288 :         if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
    1270        3808 :             return true;
    1271             :     }
    1272       76612 :     return false;
    1273             : }
    1274             : 
    1275             : /*
    1276             :  * tbm_mark_page_lossy - mark the page number as lossily stored
    1277             :  *
    1278             :  * This may cause the table to exceed the desired memory size.  It is
    1279             :  * up to the caller to call tbm_lossify() at the next safe point if so.
    1280             :  */
    1281             : static void
    1282       82116 : tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
    1283             : {
    1284             :     PagetableEntry *page;
    1285             :     bool        found;
    1286             :     BlockNumber chunk_pageno;
    1287             :     int         bitno;
    1288             :     int         wordnum;
    1289             :     int         bitnum;
    1290             : 
    1291             :     /* We force the bitmap into hashtable mode whenever it's lossy */
    1292       82116 :     if (tbm->status != TBM_HASH)
    1293         992 :         tbm_create_pagetable(tbm);
    1294             : 
    1295       82116 :     bitno = pageno % PAGES_PER_CHUNK;
    1296       82116 :     chunk_pageno = pageno - bitno;
    1297             : 
    1298             :     /*
    1299             :      * Remove any extant non-lossy entry for the page.  If the page is its own
    1300             :      * chunk header, however, we skip this and handle the case below.
    1301             :      */
    1302       82116 :     if (bitno != 0)
    1303             :     {
    1304       81228 :         if (pagetable_delete(tbm->pagetable, pageno))
    1305             :         {
    1306             :             /* It was present, so adjust counts */
    1307        8208 :             tbm->nentries--;
    1308        8208 :             tbm->npages--;       /* assume it must have been non-lossy */
    1309             :         }
    1310             :     }
    1311             : 
    1312             :     /* Look up or create entry for chunk-header page */
    1313       82116 :     page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
    1314             : 
    1315             :     /* Initialize it if not present before */
    1316       82116 :     if (!found)
    1317             :     {
    1318         992 :         char        oldstatus = page->status;
    1319             : 
    1320         992 :         MemSet(page, 0, sizeof(PagetableEntry));
    1321         992 :         page->status = oldstatus;
    1322         992 :         page->blockno = chunk_pageno;
    1323         992 :         page->ischunk = true;
    1324             :         /* must count it too */
    1325         992 :         tbm->nentries++;
    1326         992 :         tbm->nchunks++;
    1327             :     }
    1328       81124 :     else if (!page->ischunk)
    1329             :     {
    1330          68 :         char        oldstatus = page->status;
    1331             : 
    1332             :         /* chunk header page was formerly non-lossy, make it lossy */
    1333          68 :         MemSet(page, 0, sizeof(PagetableEntry));
    1334          68 :         page->status = oldstatus;
    1335          68 :         page->blockno = chunk_pageno;
    1336          68 :         page->ischunk = true;
    1337             :         /* we assume it had some tuple bit(s) set, so mark it lossy */
    1338          68 :         page->words[0] = ((bitmapword) 1 << 0);
    1339             :         /* adjust counts */
    1340          68 :         tbm->nchunks++;
    1341          68 :         tbm->npages--;
    1342             :     }
    1343             : 
    1344             :     /* Now set the original target page's bit */
    1345       82116 :     wordnum = WORDNUM(bitno);
    1346       82116 :     bitnum = BITNUM(bitno);
    1347       82116 :     page->words[wordnum] |= ((bitmapword) 1 << bitnum);
    1348       82116 : }
    1349             : 
    1350             : /*
    1351             :  * tbm_lossify - lose some information to get back under the memory limit
    1352             :  */
    1353             : static void
    1354          16 : tbm_lossify(TIDBitmap *tbm)
    1355             : {
    1356             :     pagetable_iterator i;
    1357             :     PagetableEntry *page;
    1358             : 
    1359             :     /*
    1360             :      * XXX Really stupid implementation: this just lossifies pages in
    1361             :      * essentially random order.  We should be paying some attention to the
    1362             :      * number of bits set in each page, instead.
    1363             :      *
    1364             :      * Since we are called as soon as nentries exceeds maxentries, we should
    1365             :      * push nentries down to significantly less than maxentries, or else we'll
    1366             :      * just end up doing this again very soon.  We shoot for maxentries/2.
    1367             :      */
    1368             :     Assert(tbm->iterating == TBM_NOT_ITERATING);
    1369             :     Assert(tbm->status == TBM_HASH);
    1370             : 
    1371          16 :     pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
    1372          16 :     while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
    1373             :     {
    1374        8220 :         if (page->ischunk)
    1375           0 :             continue;           /* already a chunk header */
    1376             : 
    1377             :         /*
    1378             :          * If the page would become a chunk header, we won't save anything by
    1379             :          * converting it to lossy, so skip it.
    1380             :          */
    1381        8220 :         if ((page->blockno % PAGES_PER_CHUNK) == 0)
    1382          12 :             continue;
    1383             : 
    1384             :         /* This does the dirty work ... */
    1385        8208 :         tbm_mark_page_lossy(tbm, page->blockno);
    1386             : 
    1387        8208 :         if (tbm->nentries <= tbm->maxentries / 2)
    1388             :         {
    1389             :             /*
    1390             :              * We have made enough room. Remember where to start lossifying
    1391             :              * next round, so we evenly iterate over the hashtable.
    1392             :              */
    1393          16 :             tbm->lossify_start = i.cur;
    1394          16 :             break;
    1395             :         }
    1396             : 
    1397             :         /*
    1398             :          * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
    1399             :          * hashtable and may have deleted the non-lossy chunk.  We can
    1400             :          * continue the same hash table scan, since failure to visit one
    1401             :          * element or visiting the newly inserted element, isn't fatal.
    1402             :          */
    1403             :     }
    1404             : 
    1405             :     /*
    1406             :      * With a big bitmap and small work_mem, it's possible that we cannot get
    1407             :      * under maxentries.  Again, if that happens, we'd end up uselessly
    1408             :      * calling tbm_lossify over and over.  To prevent this from becoming a
    1409             :      * performance sink, force maxentries up to at least double the current
    1410             :      * number of entries.  (In essence, we're admitting inability to fit
    1411             :      * within work_mem when we do this.)  Note that this test will not fire if
    1412             :      * we broke out of the loop early; and if we didn't, the current number of
    1413             :      * entries is simply not reducible any further.
    1414             :      */
    1415          16 :     if (tbm->nentries > tbm->maxentries / 2)
    1416           0 :         tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
    1417          16 : }
    1418             : 
    1419             : /*
    1420             :  * qsort comparator to handle PagetableEntry pointers.
    1421             :  */
    1422             : static int
    1423      195812 : tbm_comparator(const void *left, const void *right)
    1424             : {
    1425      195812 :     BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
    1426      195812 :     BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
    1427             : 
    1428      195812 :     if (l < r)
    1429       91570 :         return -1;
    1430      104242 :     else if (l > r)
    1431      104242 :         return 1;
    1432           0 :     return 0;
    1433             : }
    1434             : 
    1435             : /*
    1436             :  * As above, but this will get index into PagetableEntry array.  Therefore,
    1437             :  * it needs to get actual PagetableEntry using the index before comparing the
    1438             :  * blockno.
    1439             :  */
    1440             : static int
    1441      147192 : tbm_shared_comparator(const void *left, const void *right, void *arg)
    1442             : {
    1443      147192 :     PagetableEntry *base = (PagetableEntry *) arg;
    1444      147192 :     PagetableEntry *lpage = &base[*(int *) left];
    1445      147192 :     PagetableEntry *rpage = &base[*(int *) right];
    1446             : 
    1447      147192 :     if (lpage->blockno < rpage->blockno)
    1448       66636 :         return -1;
    1449       80556 :     else if (lpage->blockno > rpage->blockno)
    1450       80556 :         return 1;
    1451           0 :     return 0;
    1452             : }
    1453             : 
    1454             : /*
    1455             :  *  tbm_attach_shared_iterate
    1456             :  *
    1457             :  *  Allocate a backend-private iterator and attach the shared iterator state
    1458             :  *  to it so that multiple processed can iterate jointly.
    1459             :  *
    1460             :  *  We also converts the DSA pointers to local pointers and store them into
    1461             :  *  our private iterator.
    1462             :  */
    1463             : TBMSharedIterator *
    1464         440 : tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
    1465             : {
    1466             :     TBMSharedIterator *iterator;
    1467             :     TBMSharedIteratorState *istate;
    1468             : 
    1469             :     /*
    1470             :      * Create the TBMSharedIterator struct, with enough trailing space to
    1471             :      * serve the needs of the TBMIterateResult sub-struct.
    1472             :      */
    1473         440 :     iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
    1474             :                                              MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
    1475             : 
    1476         440 :     istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
    1477             : 
    1478         440 :     iterator->state = istate;
    1479             : 
    1480         440 :     iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
    1481             : 
    1482         440 :     if (istate->npages)
    1483         440 :         iterator->ptpages = dsa_get_address(dsa, istate->spages);
    1484         440 :     if (istate->nchunks)
    1485          40 :         iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
    1486             : 
    1487         440 :     return iterator;
    1488             : }
    1489             : 
    1490             : /*
    1491             :  * pagetable_allocate
    1492             :  *
    1493             :  * Callback function for allocating the memory for hashtable elements.
    1494             :  * Allocate memory for hashtable elements, using DSA if available.
    1495             :  */
    1496             : static inline void *
    1497        2948 : pagetable_allocate(pagetable_hash *pagetable, Size size)
    1498             : {
    1499        2948 :     TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
    1500             :     PTEntryArray *ptbase;
    1501             : 
    1502        2948 :     if (tbm->dsa == NULL)
    1503        2852 :         return MemoryContextAllocExtended(pagetable->ctx, size,
    1504             :                                           MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
    1505             : 
    1506             :     /*
    1507             :      * Save the dsapagetable reference in dsapagetableold before allocating
    1508             :      * new memory so that pagetable_free can free the old entry.
    1509             :      */
    1510          96 :     tbm->dsapagetableold = tbm->dsapagetable;
    1511          96 :     tbm->dsapagetable = dsa_allocate_extended(tbm->dsa,
    1512             :                                               sizeof(PTEntryArray) + size,
    1513             :                                               DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
    1514          96 :     ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
    1515             : 
    1516          96 :     return ptbase->ptentry;
    1517             : }
    1518             : 
    1519             : /*
    1520             :  * pagetable_free
    1521             :  *
    1522             :  * Callback function for freeing hash table elements.
    1523             :  */
    1524             : static inline void
    1525        2948 : pagetable_free(pagetable_hash *pagetable, void *pointer)
    1526             : {
    1527        2948 :     TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
    1528             : 
    1529             :     /* pfree the input pointer if DSA is not available */
    1530        2948 :     if (tbm->dsa == NULL)
    1531        2852 :         pfree(pointer);
    1532          96 :     else if (DsaPointerIsValid(tbm->dsapagetableold))
    1533             :     {
    1534          52 :         dsa_free(tbm->dsa, tbm->dsapagetableold);
    1535          52 :         tbm->dsapagetableold = InvalidDsaPointer;
    1536             :     }
    1537        2948 : }
    1538             : 
    1539             : /*
    1540             :  * tbm_calculate_entries
    1541             :  *
    1542             :  * Estimate number of hashtable entries we can have within maxbytes.
    1543             :  */
    1544             : long
    1545      331552 : tbm_calculate_entries(double maxbytes)
    1546             : {
    1547             :     long        nbuckets;
    1548             : 
    1549             :     /*
    1550             :      * Estimate number of hashtable entries we can have within maxbytes. This
    1551             :      * estimates the hash cost as sizeof(PagetableEntry), which is good enough
    1552             :      * for our purpose.  Also count an extra Pointer per entry for the arrays
    1553             :      * created during iteration readout.
    1554             :      */
    1555      331552 :     nbuckets = maxbytes /
    1556             :         (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
    1557      331552 :     nbuckets = Min(nbuckets, INT_MAX - 1);  /* safety limit */
    1558      331552 :     nbuckets = Max(nbuckets, 16);   /* sanity limit */
    1559             : 
    1560      331552 :     return nbuckets;
    1561             : }

Generated by: LCOV version 1.13