LCOV - code coverage report
Current view: top level - src/backend/access/hash - hashpage.c (source / functions)
Test:     PostgreSQL 13devel
Date:     2019-11-16 00:06:57
Coverage: Lines: 384 of 469 (81.9 %)    Functions: 18 of 19 (94.7 %)

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * hashpage.c
       4             :  *    Hash table page management code for the Postgres hash access method
       5             :  *
       6             :  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/hash/hashpage.c
      12             :  *
      13             :  * NOTES
      14             :  *    Postgres hash pages look like ordinary relation pages.  The opaque
      15             :  *    data at high addresses includes information about the page including
      16             :  *    whether a page is an overflow page or a true bucket, the bucket
      17             :  *    number, and the block numbers of the preceding and following pages
      18             :  *    in the same bucket.
      19             :  *
      20             :  *    The first page in a hash relation, page zero, is special -- it stores
      21             :  *    information describing the hash table; it is referred to as the
      22             :  *    "meta page." Pages one and higher store the actual data.
      23             :  *
      24             :  *    There are also bitmap pages, which are not manipulated here;
      25             :  *    see hashovfl.c.
      26             :  *
      27             :  *-------------------------------------------------------------------------
      28             :  */
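
For orientation, the per-page "opaque data" described in the NOTES above looks
roughly like the sketch below; the authoritative definition lives in
src/include/access/hash.h, and the field names are the ones manipulated
throughout this file.

    typedef struct HashPageOpaqueData
    {
        BlockNumber hasho_prevblkno;    /* previous page in the bucket chain; on a
                                         * primary bucket page this instead holds
                                         * the maxbucket value as of the last
                                         * split, used to validate cached
                                         * metapage data */
        BlockNumber hasho_nextblkno;    /* next overflow page in the same bucket */
        Bucket      hasho_bucket;       /* bucket number this page belongs to */
        uint16      hasho_flag;         /* page type (LH_META_PAGE, LH_BUCKET_PAGE,
                                         * LH_OVERFLOW_PAGE, ...) plus flag bits */
        uint16      hasho_page_id;      /* HASHO_PAGE_ID, identifies hash pages */
    } HashPageOpaqueData;

    typedef HashPageOpaqueData *HashPageOpaque;
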
      29             : #include "postgres.h"
      30             : 
      31             : #include "access/hash.h"
      32             : #include "access/hash_xlog.h"
      33             : #include "miscadmin.h"
      34             : #include "storage/lmgr.h"
      35             : #include "storage/predicate.h"
      36             : #include "storage/smgr.h"
      37             : 
      38             : static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
      39             :                                 uint32 nblocks);
      40             : static void _hash_splitbucket(Relation rel, Buffer metabuf,
      41             :                               Bucket obucket, Bucket nbucket,
      42             :                               Buffer obuf,
      43             :                               Buffer nbuf,
      44             :                               HTAB *htab,
      45             :                               uint32 maxbucket,
      46             :                               uint32 highmask, uint32 lowmask);
      47             : static void log_split_page(Relation rel, Buffer buf);
      48             : 
      49             : 
      50             : /*
      51             :  *  _hash_getbuf() -- Get a buffer by block number for read or write.
      52             :  *
      53             :  *      'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.
      54             :  *      'flags' is a bitwise OR of the allowed page types.
      55             :  *
      56             :  *      This must be used only to fetch pages that are expected to be valid
      57             :  *      already.  _hash_checkpage() is applied using the given flags.
      58             :  *
      59             :  *      When this routine returns, the appropriate lock is set on the
      60             :  *      requested buffer and its reference count has been incremented
      61             :  *      (ie, the buffer is "locked and pinned").
      62             :  *
      63             :  *      P_NEW is disallowed because this routine can only be used
      64             :  *      to access pages that are known to be before the filesystem EOF.
      65             :  *      Extending the index should be done with _hash_getnewbuf.
      66             :  */
      67             : Buffer
      68     1115898 : _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
      69             : {
      70             :     Buffer      buf;
      71             : 
      72     1115898 :     if (blkno == P_NEW)
      73           0 :         elog(ERROR, "hash AM does not use P_NEW");
      74             : 
      75     1115898 :     buf = ReadBuffer(rel, blkno);
      76             : 
      77     1115898 :     if (access != HASH_NOLOCK)
      78      675964 :         LockBuffer(buf, access);
      79             : 
      80             :     /* ref count and lock type are correct */
      81             : 
      82     1115898 :     _hash_checkpage(rel, buf, flags);
      83             : 
      84     1115898 :     return buf;
      85             : }
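
A minimal usage sketch for _hash_getbuf(), reading the metapage under a shared
lock.  The constants and helpers (HASH_METAPAGE, HASH_READ, LH_META_PAGE,
HashPageGetMeta, _hash_relbuf) are the ones used elsewhere in this file or named
in the comment above; the wrapper function name is purely illustrative.

    static uint32
    example_read_maxbucket(Relation rel)
    {
        Buffer       metabuf;
        HashMetaPage metap;
        uint32       maxbucket;

        /* read-lock block 0 and verify it really is a hash meta page */
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
        metap = HashPageGetMeta(BufferGetPage(metabuf));
        maxbucket = metap->hashm_maxbucket;
        _hash_relbuf(rel, metabuf);     /* drops both the lock and the pin */

        return maxbucket;
    }
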
      86             : 
      87             : /*
      88             :  * _hash_getbuf_with_condlock_cleanup() -- Try to get a buffer for cleanup.
      89             :  *
      90             :  *      We read the page and try to acquire a cleanup lock.  If we get it,
      91             :  *      we return the buffer; otherwise, we return InvalidBuffer.
      92             :  */
      93             : Buffer
      94         626 : _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags)
      95             : {
      96             :     Buffer      buf;
      97             : 
      98         626 :     if (blkno == P_NEW)
      99           0 :         elog(ERROR, "hash AM does not use P_NEW");
     100             : 
     101         626 :     buf = ReadBuffer(rel, blkno);
     102             : 
     103         626 :     if (!ConditionalLockBufferForCleanup(buf))
     104             :     {
     105           0 :         ReleaseBuffer(buf);
     106           0 :         return InvalidBuffer;
     107             :     }
     108             : 
     109             :     /* ref count and lock type are correct */
     110             : 
     111         626 :     _hash_checkpage(rel, buf, flags);
     112             : 
     113         626 :     return buf;
     114             : }
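
A sketch of the expected calling pattern, mirroring what _hash_expandtable()
does further down in this file: callers must be prepared for an InvalidBuffer
result and simply give up.  The function name and blkno parameter are
illustrative only.

    static void
    example_try_cleanup(Relation rel, BlockNumber blkno)
    {
        Buffer      bucket_buf;

        bucket_buf = _hash_getbuf_with_condlock_cleanup(rel, blkno, LH_BUCKET_PAGE);
        if (!BufferIsValid(bucket_buf))
            return;             /* cleanup lock not available; give up quietly */

        /* ... we hold a cleanup lock: safe to split or garbage-collect ... */

        _hash_relbuf(rel, bucket_buf);
    }
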
     115             : 
     116             : /*
     117             :  *  _hash_getinitbuf() -- Get and initialize a buffer by block number.
     118             :  *
     119             :  *      This must be used only to fetch pages that are known to be before
     120             :  *      the index's filesystem EOF, but are to be filled from scratch.
     121             :  *      _hash_pageinit() is applied automatically.  Otherwise it has
     122             :  *      effects similar to _hash_getbuf() with access = HASH_WRITE.
     123             :  *
     124             :  *      When this routine returns, a write lock is set on the
     125             :  *      requested buffer and its reference count has been incremented
     126             :  *      (ie, the buffer is "locked and pinned").
     127             :  *
     128             :  *      P_NEW is disallowed because this routine can only be used
     129             :  *      to access pages that are known to be before the filesystem EOF.
     130             :  *      Extending the index should be done with _hash_getnewbuf.
     131             :  */
     132             : Buffer
     133          44 : _hash_getinitbuf(Relation rel, BlockNumber blkno)
     134             : {
     135             :     Buffer      buf;
     136             : 
     137          44 :     if (blkno == P_NEW)
     138           0 :         elog(ERROR, "hash AM does not use P_NEW");
     139             : 
     140          44 :     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
     141             :                              NULL);
     142             : 
     143             :     /* ref count and lock type are correct */
     144             : 
     145             :     /* initialize the page */
     146          44 :     _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
     147             : 
     148          44 :     return buf;
     149             : }
     150             : 
     151             : /*
     152             :  *  _hash_initbuf() -- Get and initialize a buffer by bucket number.
     153             :  */
     154             : void
     155        5228 : _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag,
     156             :               bool initpage)
     157             : {
     158             :     HashPageOpaque pageopaque;
     159             :     Page        page;
     160             : 
     161        5228 :     page = BufferGetPage(buf);
     162             : 
     163             :     /* initialize the page */
     164        5228 :     if (initpage)
     165           0 :         _hash_pageinit(page, BufferGetPageSize(buf));
     166             : 
     167        5228 :     pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
     168             : 
     169             :     /*
     170             :      * Set hasho_prevblkno with current hashm_maxbucket. This value will be
     171             :      * used to validate cached HashMetaPageData. See
     172             :      * _hash_getbucketbuf_from_hashkey().
     173             :      */
     174        5228 :     pageopaque->hasho_prevblkno = max_bucket;
     175        5228 :     pageopaque->hasho_nextblkno = InvalidBlockNumber;
     176        5228 :     pageopaque->hasho_bucket = num_bucket;
     177        5228 :     pageopaque->hasho_flag = flag;
     178        5228 :     pageopaque->hasho_page_id = HASHO_PAGE_ID;
     179        5228 : }
     180             : 
     181             : /*
     182             :  *  _hash_getnewbuf() -- Get a new page at the end of the index.
     183             :  *
     184             :  *      This has the same API as _hash_getinitbuf, except that we are adding
     185             :  *      a page to the index, and hence expect the page to be past the
     186             :  *      logical EOF.  (However, we have to support the case where it isn't,
     187             :  *      since a prior try might have crashed after extending the filesystem
     188             :  *      EOF but before updating the metapage to reflect the added page.)
     189             :  *
     190             :  *      It is caller's responsibility to ensure that only one process can
     191             :  *      extend the index at a time.  In practice, this function is called
     192             :  *      only while holding write lock on the metapage, because adding a page
     193             :  *      is always associated with an update of metapage data.
     194             :  */
     195             : Buffer
     196        6340 : _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
     197             : {
     198        6340 :     BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum);
     199             :     Buffer      buf;
     200             : 
     201        6340 :     if (blkno == P_NEW)
     202           0 :         elog(ERROR, "hash AM does not use P_NEW");
     203        6340 :     if (blkno > nblocks)
     204           0 :         elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
     205             :              RelationGetRelationName(rel));
     206             : 
     207             :     /* smgr insists we use P_NEW to extend the relation */
     208        6340 :     if (blkno == nblocks)
     209             :     {
     210        5714 :         buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL);
     211        5714 :         if (BufferGetBlockNumber(buf) != blkno)
     212           0 :             elog(ERROR, "unexpected hash relation size: %u, should be %u",
     213             :                  BufferGetBlockNumber(buf), blkno);
     214        5714 :         LockBuffer(buf, HASH_WRITE);
     215             :     }
     216             :     else
     217             :     {
     218         626 :         buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
     219             :                                  NULL);
     220             :     }
     221             : 
     222             :     /* ref count and lock type are correct */
     223             : 
     224             :     /* initialize the page */
     225        6340 :     _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
     226             : 
     227        6340 :     return buf;
     228             : }
     229             : 
     230             : /*
     231             :  *  _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
     232             :  *
     233             :  *      This is identical to _hash_getbuf() but also allows a buffer access
     234             :  *      strategy to be specified.  We use this for VACUUM operations.
     235             :  */
     236             : Buffer
     237         716 : _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
     238             :                            int access, int flags,
     239             :                            BufferAccessStrategy bstrategy)
     240             : {
     241             :     Buffer      buf;
     242             : 
     243         716 :     if (blkno == P_NEW)
     244           0 :         elog(ERROR, "hash AM does not use P_NEW");
     245             : 
     246         716 :     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
     247             : 
     248         716 :     if (access != HASH_NOLOCK)
     249         716 :         LockBuffer(buf, access);
     250             : 
     251             :     /* ref count and lock type are correct */
     252             : 
     253         716 :     _hash_checkpage(rel, buf, flags);
     254             : 
     255         716 :     return buf;
     256             : }
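
For illustration, a VACUUM-style caller would obtain a dedicated ring strategy
from the generic buffer manager (GetAccessStrategy/BAS_VACUUM) so that scanned
bucket and overflow pages do not crowd out the rest of shared_buffers.  The
real call sites live in the hash AM's vacuum/cleanup code, not in this file;
the function name below is hypothetical.

    static void
    example_vacuum_page(Relation rel, BlockNumber blkno)
    {
        BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_VACUUM);
        Buffer               buf;

        buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                         LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
                                         bstrategy);
        /* ... prune dead index tuples from the page ... */
        _hash_relbuf(rel, buf);
        FreeAccessStrategy(bstrategy);
    }
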
     257             : 
     258             : /*
     259             :  *  _hash_relbuf() -- release a locked buffer.
     260             :  *
     261             :  * Lock and pin (refcount) are both dropped.
     262             :  */
     263             : void
     264      648294 : _hash_relbuf(Relation rel, Buffer buf)
     265             : {
     266      648294 :     UnlockReleaseBuffer(buf);
     267      648294 : }
     268             : 
     269             : /*
     270             :  *  _hash_dropbuf() -- release an unlocked buffer.
     271             :  *
     272             :  * This is used to unpin a buffer on which we hold no lock.
     273             :  */
     274             : void
     275      475492 : _hash_dropbuf(Relation rel, Buffer buf)
     276             : {
     277      475492 :     ReleaseBuffer(buf);
     278      475492 : }
     279             : 
     280             : /*
     281             :  *  _hash_dropscanbuf() -- release buffers used in scan.
     282             :  *
     283             :  * This routine unpins the buffers used during scan on which we
     284             :  * hold no lock.
     285             :  */
     286             : void
     287         844 : _hash_dropscanbuf(Relation rel, HashScanOpaque so)
     288             : {
     289             :     /* release pin we hold on primary bucket page */
     290        1144 :     if (BufferIsValid(so->hashso_bucket_buf) &&
     291         300 :         so->hashso_bucket_buf != so->currPos.buf)
     292          42 :         _hash_dropbuf(rel, so->hashso_bucket_buf);
     293         844 :     so->hashso_bucket_buf = InvalidBuffer;
     294             : 
      295             :     /* release pin we hold on primary bucket page of bucket being split */
     296         844 :     if (BufferIsValid(so->hashso_split_bucket_buf) &&
     297           0 :         so->hashso_split_bucket_buf != so->currPos.buf)
     298           0 :         _hash_dropbuf(rel, so->hashso_split_bucket_buf);
     299         844 :     so->hashso_split_bucket_buf = InvalidBuffer;
     300             : 
     301             :     /* release any pin we still hold */
     302         844 :     if (BufferIsValid(so->currPos.buf))
     303         258 :         _hash_dropbuf(rel, so->currPos.buf);
     304         844 :     so->currPos.buf = InvalidBuffer;
     305             : 
     306             :     /* reset split scan */
     307         844 :     so->hashso_buc_populated = false;
     308         844 :     so->hashso_buc_split = false;
     309         844 : }
     310             : 
     311             : 
     312             : /*
     313             :  *  _hash_init() -- Initialize the metadata page of a hash index,
     314             :  *              the initial buckets, and the initial bitmap page.
     315             :  *
     316             :  * The initial number of buckets is dependent on num_tuples, an estimate
     317             :  * of the number of tuples to be loaded into the index initially.  The
     318             :  * chosen number of buckets is returned.
     319             :  *
     320             :  * We are fairly cavalier about locking here, since we know that no one else
     321             :  * could be accessing this index.  In particular the rule about not holding
     322             :  * multiple buffer locks is ignored.
     323             :  */
     324             : uint32
     325         176 : _hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
     326             : {
     327             :     Buffer      metabuf;
     328             :     Buffer      buf;
     329             :     Buffer      bitmapbuf;
     330             :     Page        pg;
     331             :     HashMetaPage metap;
     332             :     RegProcedure procid;
     333             :     int32       data_width;
     334             :     int32       item_width;
     335             :     int32       ffactor;
     336             :     uint32      num_buckets;
     337             :     uint32      i;
     338             :     bool        use_wal;
     339             : 
     340             :     /* safety check */
     341         176 :     if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)
     342           0 :         elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
     343             :              RelationGetRelationName(rel));
     344             : 
     345             :     /*
     346             :      * WAL log creation of pages if the relation is persistent, or this is the
     347             :      * init fork.  Init forks for unlogged relations always need to be WAL
     348             :      * logged.
     349             :      */
     350         176 :     use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM;
     351             : 
     352             :     /*
     353             :      * Determine the target fill factor (in tuples per bucket) for this index.
     354             :      * The idea is to make the fill factor correspond to pages about as full
     355             :      * as the user-settable fillfactor parameter says.  We can compute it
     356             :      * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
     357             :      */
     358         176 :     data_width = sizeof(uint32);
     359         176 :     item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
     360             :         sizeof(ItemIdData);     /* include the line pointer */
     361         176 :     ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
     362             :     /* keep to a sane range */
     363         176 :     if (ffactor < 10)
     364           0 :         ffactor = 10;
     365             : 
     366         176 :     procid = index_getprocid(rel, 1, HASHSTANDARD_PROC);
     367             : 
     368             :     /*
     369             :      * We initialize the metapage, the first N bucket pages, and the first
     370             :      * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
     371             :      * calls to occur.  This ensures that the smgr level has the right idea of
     372             :      * the physical index length.
     373             :      *
     374             :      * Critical section not required, because on error the creation of the
     375             :      * whole relation will be rolled back.
     376             :      */
     377         176 :     metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
     378         176 :     _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false);
     379         176 :     MarkBufferDirty(metabuf);
     380             : 
     381         176 :     pg = BufferGetPage(metabuf);
     382         176 :     metap = HashPageGetMeta(pg);
     383             : 
     384             :     /* XLOG stuff */
     385         176 :     if (use_wal)
     386             :     {
     387             :         xl_hash_init_meta_page xlrec;
     388             :         XLogRecPtr  recptr;
     389             : 
     390         168 :         xlrec.num_tuples = num_tuples;
     391         168 :         xlrec.procid = metap->hashm_procid;
     392         168 :         xlrec.ffactor = metap->hashm_ffactor;
     393             : 
     394         168 :         XLogBeginInsert();
     395         168 :         XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage);
     396         168 :         XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
     397             : 
     398         168 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE);
     399             : 
     400         168 :         PageSetLSN(BufferGetPage(metabuf), recptr);
     401             :     }
     402             : 
     403         176 :     num_buckets = metap->hashm_maxbucket + 1;
     404             : 
     405             :     /*
     406             :      * Release buffer lock on the metapage while we initialize buckets.
     407             :      * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
     408             :      * won't accomplish anything.  It's a bad idea to hold buffer locks for
     409             :      * long intervals in any case, since that can block the bgwriter.
     410             :      */
     411         176 :     LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     412             : 
     413             :     /*
     414             :      * Initialize and WAL Log the first N buckets
     415             :      */
     416        5404 :     for (i = 0; i < num_buckets; i++)
     417             :     {
     418             :         BlockNumber blkno;
     419             : 
     420             :         /* Allow interrupts, in case N is huge */
     421        5228 :         CHECK_FOR_INTERRUPTS();
     422             : 
     423        5228 :         blkno = BUCKET_TO_BLKNO(metap, i);
     424        5228 :         buf = _hash_getnewbuf(rel, blkno, forkNum);
     425        5228 :         _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false);
     426        5228 :         MarkBufferDirty(buf);
     427             : 
     428        5228 :         if (use_wal)
     429       10328 :             log_newpage(&rel->rd_node,
     430             :                         forkNum,
     431             :                         blkno,
     432        5164 :                         BufferGetPage(buf),
     433             :                         true);
     434        5228 :         _hash_relbuf(rel, buf);
     435             :     }
     436             : 
     437             :     /* Now reacquire buffer lock on metapage */
     438         176 :     LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
     439             : 
     440             :     /*
     441             :      * Initialize bitmap page
     442             :      */
     443         176 :     bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum);
     444         176 :     _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false);
     445         176 :     MarkBufferDirty(bitmapbuf);
     446             : 
     447             :     /* add the new bitmap page to the metapage's list of bitmaps */
     448             :     /* metapage already has a write lock */
     449         176 :     if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
     450           0 :         ereport(ERROR,
     451             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     452             :                  errmsg("out of overflow pages in hash index \"%s\"",
     453             :                         RelationGetRelationName(rel))));
     454             : 
     455         176 :     metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1;
     456             : 
     457         176 :     metap->hashm_nmaps++;
     458         176 :     MarkBufferDirty(metabuf);
     459             : 
     460             :     /* XLOG stuff */
     461         176 :     if (use_wal)
     462             :     {
     463             :         xl_hash_init_bitmap_page xlrec;
     464             :         XLogRecPtr  recptr;
     465             : 
     466         168 :         xlrec.bmsize = metap->hashm_bmsize;
     467             : 
     468         168 :         XLogBeginInsert();
     469         168 :         XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage);
     470         168 :         XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT);
     471             : 
     472             :         /*
     473             :          * This is safe only because nobody else can be modifying the index at
     474             :          * this stage; it's only visible to the transaction that is creating
     475             :          * it.
     476             :          */
     477         168 :         XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
     478             : 
     479         168 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE);
     480             : 
     481         168 :         PageSetLSN(BufferGetPage(bitmapbuf), recptr);
     482         168 :         PageSetLSN(BufferGetPage(metabuf), recptr);
     483             :     }
     484             : 
     485             :     /* all done */
     486         176 :     _hash_relbuf(rel, bitmapbuf);
     487         176 :     _hash_relbuf(rel, metabuf);
     488             : 
     489         176 :     return num_buckets;
     490             : }
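
To make the fill-factor arithmetic in _hash_init() concrete, here is the
computation under common assumptions (8 kB blocks, 8-byte MAXALIGN, and the
hash AM's default fillfactor of 75); exact numbers vary with the build
configuration.

    /*
     * item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(sizeof(uint32))
     *              + sizeof(ItemIdData)
     *            = 8 + 8 + 4 = 20 bytes
     *
     * ffactor    = RelationGetTargetPageUsage(rel, 75) / item_width
     *            = (8192 * 75 / 100) / 20 = 6144 / 20 = 307 tuples per bucket
     */
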
     491             : 
     492             : /*
     493             :  *  _hash_init_metabuffer() -- Initialize the metadata page of a hash index.
     494             :  */
     495             : void
     496         176 : _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
     497             :                       uint16 ffactor, bool initpage)
     498             : {
     499             :     HashMetaPage metap;
     500             :     HashPageOpaque pageopaque;
     501             :     Page        page;
     502             :     double      dnumbuckets;
     503             :     uint32      num_buckets;
     504             :     uint32      spare_index;
     505             :     uint32      i;
     506             : 
     507             :     /*
     508             :      * Choose the number of initial bucket pages to match the fill factor
     509             :      * given the estimated number of tuples.  We round up the result to the
     510             :      * total number of buckets which has to be allocated before using its
     511             :      * hashm_spares element. However always force at least 2 bucket pages. The
     512             :      * upper limit is determined by considerations explained in
     513             :      * _hash_expandtable().
     514             :      */
     515         176 :     dnumbuckets = num_tuples / ffactor;
     516         176 :     if (dnumbuckets <= 2.0)
     517          20 :         num_buckets = 2;
     518         156 :     else if (dnumbuckets >= (double) 0x40000000)
     519           0 :         num_buckets = 0x40000000;
     520             :     else
     521         156 :         num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets));
     522             : 
     523         176 :     spare_index = _hash_spareindex(num_buckets);
     524             :     Assert(spare_index < HASH_MAX_SPLITPOINTS);
     525             : 
     526         176 :     page = BufferGetPage(buf);
     527         176 :     if (initpage)
     528           0 :         _hash_pageinit(page, BufferGetPageSize(buf));
     529             : 
     530         176 :     pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
     531         176 :     pageopaque->hasho_prevblkno = InvalidBlockNumber;
     532         176 :     pageopaque->hasho_nextblkno = InvalidBlockNumber;
     533         176 :     pageopaque->hasho_bucket = -1;
     534         176 :     pageopaque->hasho_flag = LH_META_PAGE;
     535         176 :     pageopaque->hasho_page_id = HASHO_PAGE_ID;
     536             : 
     537         176 :     metap = HashPageGetMeta(page);
     538             : 
     539         176 :     metap->hashm_magic = HASH_MAGIC;
     540         176 :     metap->hashm_version = HASH_VERSION;
     541         176 :     metap->hashm_ntuples = 0;
     542         176 :     metap->hashm_nmaps = 0;
     543         176 :     metap->hashm_ffactor = ffactor;
     544         176 :     metap->hashm_bsize = HashGetMaxBitmapSize(page);
     545             :     /* find largest bitmap array size that will fit in page size */
     546         352 :     for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
     547             :     {
     548         352 :         if ((1 << i) <= metap->hashm_bsize)
     549         176 :             break;
     550             :     }
     551             :     Assert(i > 0);
     552         176 :     metap->hashm_bmsize = 1 << i;
     553         176 :     metap->hashm_bmshift = i + BYTE_TO_BIT;
     554             :     Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));
     555             : 
     556             :     /*
     557             :      * Label the index with its primary hash support function's OID.  This is
     558             :      * pretty useless for normal operation (in fact, hashm_procid is not used
     559             :      * anywhere), but it might be handy for forensic purposes so we keep it.
     560             :      */
     561         176 :     metap->hashm_procid = procid;
     562             : 
     563             :     /*
     564             :      * We initialize the index with N buckets, 0 .. N-1, occupying physical
     565             :      * blocks 1 to N.  The first freespace bitmap page is in block N+1.
     566             :      */
     567         176 :     metap->hashm_maxbucket = num_buckets - 1;
     568             : 
     569             :     /*
      570             :      * Set highmask to the next ((2 ^ x) - 1) value, which should be
      571             :      * sufficient to cover num_buckets.
     572             :      */
     573         176 :     metap->hashm_highmask = (1 << (_hash_log2(num_buckets + 1))) - 1;
     574         176 :     metap->hashm_lowmask = (metap->hashm_highmask >> 1);
     575             : 
     576         176 :     MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
     577         176 :     MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
     578             : 
     579             :     /* Set up mapping for one spare page after the initial splitpoints */
     580         176 :     metap->hashm_spares[spare_index] = 1;
     581         176 :     metap->hashm_ovflpoint = spare_index;
     582         176 :     metap->hashm_firstfree = 0;
     583             : 
     584             :     /*
     585             :      * Set pd_lower just past the end of the metadata.  This is essential,
     586             :      * because without doing so, metadata will be lost if xlog.c compresses
     587             :      * the page.
     588             :      */
     589         176 :     ((PageHeader) page)->pd_lower =
     590         176 :         ((char *) metap + sizeof(HashMetaPageData)) - (char *) page;
     591         176 : }
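
A small worked example of what _hash_init_metabuffer() produces in the
minimum-size case (dnumbuckets <= 2, hence num_buckets = 2), following the code
above:

    /*
     *   block 0       meta page
     *   blocks 1-2    buckets 0 and 1        (hashm_maxbucket = 1)
     *   block 3       first bitmap page      (added by _hash_init, not here)
     *
     *   hashm_highmask = (1 << _hash_log2(2 + 1)) - 1 = 3
     *   hashm_lowmask  = hashm_highmask >> 1          = 1
     */
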
     592             : 
     593             : /*
     594             :  *  _hash_pageinit() -- Initialize a new hash index page.
     595             :  */
     596             : void
     597        6516 : _hash_pageinit(Page page, Size size)
     598             : {
     599        6516 :     PageInit(page, size, sizeof(HashPageOpaqueData));
     600        6516 : }
     601             : 
     602             : /*
     603             :  * Attempt to expand the hash table by creating one new bucket.
     604             :  *
     605             :  * This will silently do nothing if we don't get cleanup lock on old or
     606             :  * new bucket.
     607             :  *
     608             :  * Complete the pending splits and remove the tuples from old bucket,
     609             :  * if there are any left over from the previous split.
     610             :  *
     611             :  * The caller must hold a pin, but no lock, on the metapage buffer.
     612             :  * The buffer is returned in the same state.
     613             :  */
     614             : void
     615         626 : _hash_expandtable(Relation rel, Buffer metabuf)
     616             : {
     617             :     HashMetaPage metap;
     618             :     Bucket      old_bucket;
     619             :     Bucket      new_bucket;
     620             :     uint32      spare_ndx;
     621             :     BlockNumber start_oblkno;
     622             :     BlockNumber start_nblkno;
     623             :     Buffer      buf_nblkno;
     624             :     Buffer      buf_oblkno;
     625             :     Page        opage;
     626             :     Page        npage;
     627             :     HashPageOpaque oopaque;
     628             :     HashPageOpaque nopaque;
     629             :     uint32      maxbucket;
     630             :     uint32      highmask;
     631             :     uint32      lowmask;
     632         626 :     bool        metap_update_masks = false;
     633         626 :     bool        metap_update_splitpoint = false;
     634             : 
     635             : restart_expand:
     636             : 
     637             :     /*
     638             :      * Write-lock the meta page.  It used to be necessary to acquire a
     639             :      * heavyweight lock to begin a split, but that is no longer required.
     640             :      */
     641         626 :     LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
     642             : 
     643         626 :     _hash_checkpage(rel, metabuf, LH_META_PAGE);
     644         626 :     metap = HashPageGetMeta(BufferGetPage(metabuf));
     645             : 
     646             :     /*
     647             :      * Check to see if split is still needed; someone else might have already
     648             :      * done one while we waited for the lock.
     649             :      *
     650             :      * Make sure this stays in sync with _hash_doinsert()
     651             :      */
     652        1252 :     if (metap->hashm_ntuples <=
     653         626 :         (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
     654           0 :         goto fail;
     655             : 
     656             :     /*
     657             :      * Can't split anymore if maxbucket has reached its maximum possible
     658             :      * value.
     659             :      *
     660             :      * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
     661             :      * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
     662             :      * to half that because of overflow looping in _hash_log2() and
     663             :      * insufficient space in hashm_spares[].  It's moot anyway because an
     664             :      * index with 2^32 buckets would certainly overflow BlockNumber and hence
     665             :      * _hash_alloc_buckets() would fail, but if we supported buckets smaller
     666             :      * than a disk block then this would be an independent constraint.
     667             :      *
     668             :      * If you change this, see also the maximum initial number of buckets in
     669             :      * _hash_init().
     670             :      */
     671         626 :     if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
     672           0 :         goto fail;
     673             : 
     674             :     /*
     675             :      * Determine which bucket is to be split, and attempt to take cleanup lock
     676             :      * on the old bucket.  If we can't get the lock, give up.
     677             :      *
     678             :      * The cleanup lock protects us not only against other backends, but
     679             :      * against our own backend as well.
     680             :      *
     681             :      * The cleanup lock is mainly to protect the split from concurrent
     682             :      * inserts. See src/backend/access/hash/README, Lock Definitions for
     683             :      * further details.  Due to this locking restriction, if there is any
     684             :      * pending scan, the split will give up which is not good, but harmless.
     685             :      */
     686         626 :     new_bucket = metap->hashm_maxbucket + 1;
     687             : 
     688         626 :     old_bucket = (new_bucket & metap->hashm_lowmask);
     689             : 
     690         626 :     start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
     691             : 
     692         626 :     buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE);
     693         626 :     if (!buf_oblkno)
     694           0 :         goto fail;
     695             : 
     696         626 :     opage = BufferGetPage(buf_oblkno);
     697         626 :     oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
     698             : 
     699             :     /*
      700             :      * We want to finish any pending split of the old bucket, as there is no
      701             :      * apparent benefit in not doing so, and leaving splits that involve
      702             :      * multiple buckets incomplete would make the code more complicated if a
      703             :      * new split also failed.  We don't need to consider the new bucket for
      704             :      * completing the split here, since a re-split of the new bucket cannot
      705             :      * start while there is still a pending split from the old bucket.
     706             :      */
     707         626 :     if (H_BUCKET_BEING_SPLIT(oopaque))
     708             :     {
     709             :         /*
     710             :          * Copy bucket mapping info now; refer the comment in code below where
     711             :          * we copy this information before calling _hash_splitbucket to see
     712             :          * why this is okay.
     713             :          */
     714           0 :         maxbucket = metap->hashm_maxbucket;
     715           0 :         highmask = metap->hashm_highmask;
     716           0 :         lowmask = metap->hashm_lowmask;
     717             : 
     718             :         /*
     719             :          * Release the lock on metapage and old_bucket, before completing the
     720             :          * split.
     721             :          */
     722           0 :         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     723           0 :         LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK);
     724             : 
     725           0 :         _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket,
     726             :                            highmask, lowmask);
     727             : 
     728             :         /* release the pin on old buffer and retry for expand. */
     729           0 :         _hash_dropbuf(rel, buf_oblkno);
     730             : 
     731           0 :         goto restart_expand;
     732             :     }
     733             : 
     734             :     /*
      735             :      * Clean up the tuples left over from the previous split.  This operation
      736             :      * requires a cleanup lock, and we already have one on the old bucket, so
      737             :      * let's do it.  We also don't want to allow further splits from the bucket
      738             :      * until the garbage from the previous split has been cleaned.  This has two
      739             :      * advantages: first, it helps avoid bloat due to the garbage; and second,
      740             :      * during cleanup of a bucket, we can always be sure that the garbage
      741             :      * tuples belong to the most recently split bucket.  By contrast, if we
      742             :      * allowed cleanup of a bucket after the meta page had been updated to show
      743             :      * the new split but before the actual split, the cleanup operation could
      744             :      * not decide whether a tuple had been moved to the newly created bucket,
      745             :      * and could end up deleting such tuples.
     746             :      */
     747         626 :     if (H_NEEDS_SPLIT_CLEANUP(oopaque))
     748             :     {
     749             :         /*
     750             :          * Copy bucket mapping info now; refer to the comment in code below
     751             :          * where we copy this information before calling _hash_splitbucket to
     752             :          * see why this is okay.
     753             :          */
     754           0 :         maxbucket = metap->hashm_maxbucket;
     755           0 :         highmask = metap->hashm_highmask;
     756           0 :         lowmask = metap->hashm_lowmask;
     757             : 
     758             :         /* Release the metapage lock. */
     759           0 :         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     760             : 
     761           0 :         hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL,
     762             :                           maxbucket, highmask, lowmask, NULL, NULL, true,
     763             :                           NULL, NULL);
     764             : 
     765           0 :         _hash_dropbuf(rel, buf_oblkno);
     766             : 
     767           0 :         goto restart_expand;
     768             :     }
     769             : 
     770             :     /*
     771             :      * There shouldn't be any active scan on new bucket.
     772             :      *
     773             :      * Note: it is safe to compute the new bucket's blkno here, even though we
     774             :      * may still need to update the BUCKET_TO_BLKNO mapping.  This is because
     775             :      * the current value of hashm_spares[hashm_ovflpoint] correctly shows
     776             :      * where we are going to put a new splitpoint's worth of buckets.
     777             :      */
     778         626 :     start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
     779             : 
     780             :     /*
     781             :      * If the split point is increasing we need to allocate a new batch of
     782             :      * bucket pages.
     783             :      */
     784         626 :     spare_ndx = _hash_spareindex(new_bucket + 1);
     785         626 :     if (spare_ndx > metap->hashm_ovflpoint)
     786             :     {
     787             :         uint32      buckets_to_add;
     788             : 
     789             :         Assert(spare_ndx == metap->hashm_ovflpoint + 1);
     790             : 
     791             :         /*
     792             :          * We treat allocation of buckets as a separate WAL-logged action.
      793             :          * Even if we fail after this operation, we won't leak bucket pages;
     794             :          * rather, the next split will consume this space. In any case, even
     795             :          * without failure we don't use all the space in one split operation.
     796             :          */
     797          30 :         buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket;
     798          30 :         if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add))
     799             :         {
     800             :             /* can't split due to BlockNumber overflow */
     801           0 :             _hash_relbuf(rel, buf_oblkno);
     802           0 :             goto fail;
     803             :         }
     804             :     }
     805             : 
     806             :     /*
     807             :      * Physically allocate the new bucket's primary page.  We want to do this
     808             :      * before changing the metapage's mapping info, in case we can't get the
     809             :      * disk space.  Ideally, we don't need to check for cleanup lock on new
     810             :      * bucket as no other backend could find this bucket unless meta page is
     811             :      * updated.  However, it is good to be consistent with old bucket locking.
     812             :      */
     813         626 :     buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM);
     814         626 :     if (!IsBufferCleanupOK(buf_nblkno))
     815             :     {
     816           0 :         _hash_relbuf(rel, buf_oblkno);
     817           0 :         _hash_relbuf(rel, buf_nblkno);
     818           0 :         goto fail;
     819             :     }
     820             : 
     821             :     /*
     822             :      * Since we are scribbling on the pages in the shared buffers, establish a
     823             :      * critical section.  Any failure in this next code leaves us with a big
     824             :      * problem: the metapage is effectively corrupt but could get written back
     825             :      * to disk.
     826             :      */
     827         626 :     START_CRIT_SECTION();
     828             : 
     829             :     /*
     830             :      * Okay to proceed with split.  Update the metapage bucket mapping info.
     831             :      */
     832         626 :     metap->hashm_maxbucket = new_bucket;
     833             : 
     834         626 :     if (new_bucket > metap->hashm_highmask)
     835             :     {
     836             :         /* Starting a new doubling */
     837          12 :         metap->hashm_lowmask = metap->hashm_highmask;
     838          12 :         metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
     839          12 :         metap_update_masks = true;
     840             :     }
     841             : 
     842             :     /*
     843             :      * If the split point is increasing we need to adjust the hashm_spares[]
     844             :      * array and hashm_ovflpoint so that future overflow pages will be created
     845             :      * beyond this new batch of bucket pages.
     846             :      */
     847         626 :     if (spare_ndx > metap->hashm_ovflpoint)
     848             :     {
     849          30 :         metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
     850          30 :         metap->hashm_ovflpoint = spare_ndx;
     851          30 :         metap_update_splitpoint = true;
     852             :     }
     853             : 
     854         626 :     MarkBufferDirty(metabuf);
     855             : 
     856             :     /*
     857             :      * Copy bucket mapping info now; this saves re-accessing the meta page
     858             :      * inside _hash_splitbucket's inner loop.  Note that once we drop the
     859             :      * split lock, other splits could begin, so these values might be out of
     860             :      * date before _hash_splitbucket finishes.  That's okay, since all it
     861             :      * needs is to tell which of these two buckets to map hashkeys into.
     862             :      */
     863         626 :     maxbucket = metap->hashm_maxbucket;
     864         626 :     highmask = metap->hashm_highmask;
     865         626 :     lowmask = metap->hashm_lowmask;
     866             : 
     867         626 :     opage = BufferGetPage(buf_oblkno);
     868         626 :     oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
     869             : 
     870             :     /*
     871             :      * Mark the old bucket to indicate that split is in progress.  (At
     872             :      * operation end, we will clear the split-in-progress flag.)  Also, for a
     873             :      * primary bucket page, hasho_prevblkno stores the number of buckets that
     874             :      * existed as of the last split, so we must update that value here.
     875             :      */
     876         626 :     oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
     877         626 :     oopaque->hasho_prevblkno = maxbucket;
     878             : 
     879         626 :     MarkBufferDirty(buf_oblkno);
     880             : 
     881         626 :     npage = BufferGetPage(buf_nblkno);
     882             : 
     883             :     /*
     884             :      * initialize the new bucket's primary page and mark it to indicate that
     885             :      * split is in progress.
     886             :      */
     887         626 :     nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
     888         626 :     nopaque->hasho_prevblkno = maxbucket;
     889         626 :     nopaque->hasho_nextblkno = InvalidBlockNumber;
     890         626 :     nopaque->hasho_bucket = new_bucket;
     891         626 :     nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
     892         626 :     nopaque->hasho_page_id = HASHO_PAGE_ID;
     893             : 
     894         626 :     MarkBufferDirty(buf_nblkno);
     895             : 
     896             :     /* XLOG stuff */
     897         626 :     if (RelationNeedsWAL(rel))
     898             :     {
     899             :         xl_hash_split_allocate_page xlrec;
     900             :         XLogRecPtr  recptr;
     901             : 
     902         626 :         xlrec.new_bucket = maxbucket;
     903         626 :         xlrec.old_bucket_flag = oopaque->hasho_flag;
     904         626 :         xlrec.new_bucket_flag = nopaque->hasho_flag;
     905         626 :         xlrec.flags = 0;
     906             : 
     907         626 :         XLogBeginInsert();
     908             : 
     909         626 :         XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD);
     910         626 :         XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT);
     911         626 :         XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD);
     912             : 
     913         626 :         if (metap_update_masks)
     914             :         {
     915          12 :             xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS;
     916          12 :             XLogRegisterBufData(2, (char *) &metap->hashm_lowmask, sizeof(uint32));
     917          12 :             XLogRegisterBufData(2, (char *) &metap->hashm_highmask, sizeof(uint32));
     918             :         }
     919             : 
     920         626 :         if (metap_update_splitpoint)
     921             :         {
     922          30 :             xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT;
     923          30 :             XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint,
     924             :                                 sizeof(uint32));
     925          30 :             XLogRegisterBufData(2,
     926          30 :                                 (char *) &metap->hashm_spares[metap->hashm_ovflpoint],
     927             :                                 sizeof(uint32));
     928             :         }
     929             : 
     930         626 :         XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage);
     931             : 
     932         626 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE);
     933             : 
     934         626 :         PageSetLSN(BufferGetPage(buf_oblkno), recptr);
     935         626 :         PageSetLSN(BufferGetPage(buf_nblkno), recptr);
     936         626 :         PageSetLSN(BufferGetPage(metabuf), recptr);
     937             :     }
     938             : 
     939         626 :     END_CRIT_SECTION();
     940             : 
     941             :     /* drop lock, but keep pin */
     942         626 :     LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     943             : 
     944             :     /* Relocate records to the new bucket */
     945         626 :     _hash_splitbucket(rel, metabuf,
     946             :                       old_bucket, new_bucket,
     947             :                       buf_oblkno, buf_nblkno, NULL,
     948             :                       maxbucket, highmask, lowmask);
     949             : 
     950             :     /* all done, now release the pins on primary buckets. */
     951         626 :     _hash_dropbuf(rel, buf_oblkno);
     952         626 :     _hash_dropbuf(rel, buf_nblkno);
     953             : 
     954         626 :     return;
     955             : 
     956             :     /* Here if decide not to split or fail to acquire old bucket lock */
     957             : fail:
     958             : 
     959             :     /* We didn't write the metapage, so just drop lock */
     960           0 :     LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     961             : }
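
The maxbucket/highmask/lowmask values copied into local variables above feed the
key-to-bucket mapping that _hash_splitbucket() and index scans rely on (the real
helper is _hash_hashkey2bucket() in hashutil.c); in rough sketch form, with an
illustrative function name:

    static Bucket
    hashkey_to_bucket_sketch(uint32 hashkey, uint32 maxbucket,
                             uint32 highmask, uint32 lowmask)
    {
        Bucket      bucket = hashkey & highmask;

        if (bucket > maxbucket)
            bucket = bucket & lowmask;  /* not yet split off; use the old home */
        return bucket;
    }

For example, right after bucket 5 is created (maxbucket = 5, highmask = 7,
lowmask = 3), a key hashing to 13 maps to 13 & 7 = 5 and so belongs in the new
bucket, while a key hashing to 9 maps to 9 & 7 = 1 and stays in old bucket 1,
which is exactly new_bucket & lowmask in the code above.
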
     962             : 
     963             : 
     964             : /*
     965             :  * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
     966             :  *
     967             :  * This does not need to initialize the new bucket pages; we'll do that as
     968             :  * each one is used by _hash_expandtable().  But we have to extend the logical
     969             :  * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
     970             :  * sync with ours, so that we don't get complaints from smgr.
     971             :  *
     972             :  * We do this by writing a page of zeroes at the end of the splitpoint range.
     973             :  * We expect that the filesystem will ensure that the intervening pages read
     974             :  * as zeroes too.  On many filesystems this "hole" will not be allocated
     975             :  * immediately, which means that the index file may end up more fragmented
     976             :  * than if we forced it all to be allocated now; but since we don't scan
     977             :  * hash indexes sequentially anyway, that probably doesn't matter.
     978             :  *
     979             :  * XXX It's annoying that this code is executed with the metapage lock held.
     980             :  * We need to interlock against _hash_addovflpage() adding a new overflow page
     981             :  * concurrently, but it'd likely be better to use LockRelationForExtension
     982             :  * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
     983             :  * so it may not be worth worrying about.
     984             :  *
     985             :  * Returns true if successful, or false if allocation failed due to
     986             :  * BlockNumber overflow.
     987             :  */
     988             : static bool
     989          30 : _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
     990             : {
     991             :     BlockNumber lastblock;
     992             :     PGAlignedBlock zerobuf;
     993             :     Page        page;
     994             :     HashPageOpaque ovflopaque;
     995             : 
     996          30 :     lastblock = firstblock + nblocks - 1;
     997             : 
     998             :     /*
      999             :      * Check for overflow in the block number calculation; if it overflows, we
     1000             :      * cannot extend the index any further.
    1001             :      */
    1002          30 :     if (lastblock < firstblock || lastblock == InvalidBlockNumber)
    1003           0 :         return false;
    1004             : 
    1005          30 :     page = (Page) zerobuf.data;
    1006             : 
    1007             :     /*
    1008             :      * Initialize the page.  Just zeroing the page won't work; see
    1009             :      * _hash_freeovflpage for similar usage.  We take care to make the special
    1010             :      * space valid for the benefit of tools such as pageinspect.
    1011             :      */
    1012          30 :     _hash_pageinit(page, BLCKSZ);
    1013             : 
    1014          30 :     ovflopaque = (HashPageOpaque) PageGetSpecialPointer(page);
    1015             : 
    1016          30 :     ovflopaque->hasho_prevblkno = InvalidBlockNumber;
    1017          30 :     ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    1018          30 :     ovflopaque->hasho_bucket = -1;
    1019          30 :     ovflopaque->hasho_flag = LH_UNUSED_PAGE;
    1020          30 :     ovflopaque->hasho_page_id = HASHO_PAGE_ID;
    1021             : 
    1022          30 :     if (RelationNeedsWAL(rel))
    1023          30 :         log_newpage(&rel->rd_node,
    1024             :                     MAIN_FORKNUM,
    1025             :                     lastblock,
    1026             :                     zerobuf.data,
    1027             :                     true);
    1028             : 
    1029          30 :     RelationOpenSmgr(rel);
    1030          30 :     PageSetChecksumInplace(page, lastblock);
    1031          30 :     smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf.data, false);
    1032             : 
    1033          30 :     return true;
    1034             : }
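
For context, the sketch below shows roughly how the caller (_hash_expandtable(),
elsewhere in this file) is expected to invoke _hash_alloc_buckets() when a split
crosses into a new splitpoint.  It is a hedged illustration only; the variable
names (spare_ndx, start_nblkno, buckets_to_add) are assumptions, not an excerpt
of the covered source.

    /* Illustrative sketch -- not part of the covered file. */
    if (spare_ndx > metap->hashm_ovflpoint)
    {
        uint32      buckets_to_add;

        /* Extend the index file out to cover the whole new splitpoint. */
        buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket;
        if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add))
            goto fail;          /* lastblock would overflow BlockNumber */
    }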
    1035             : 
    1036             : 
    1037             : /*
    1038             :  * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
    1039             :  *
     1040             :  * This routine partitions the tuples between the old and new buckets, and is
     1041             :  * also used to finish an incomplete split operation.  To finish a previously
     1042             :  * interrupted split, the caller needs to fill htab.  If htab is set, we skip
     1043             :  * moving any tuple whose TID already exists in htab; a NULL htab means that
     1044             :  * all the tuples that belong to the new bucket are moved, with no tuples
     1045             :  * skipped.
    1046             :  *
    1047             :  * We are splitting a bucket that consists of a base bucket page and zero
    1048             :  * or more overflow (bucket chain) pages.  We must relocate tuples that
    1049             :  * belong in the new bucket.
    1050             :  *
    1051             :  * The caller must hold cleanup locks on both buckets to ensure that
    1052             :  * no one else is trying to access them (see README).
    1053             :  *
    1054             :  * The caller must hold a pin, but no lock, on the metapage buffer.
    1055             :  * The buffer is returned in the same state.  (The metapage is only
    1056             :  * touched if it becomes necessary to add or remove overflow pages.)
    1057             :  *
     1058             :  * The split needs to retain pins on the primary bucket pages of both the old
     1059             :  * and new buckets until the end of the operation, to prevent vacuum from
     1060             :  * starting while the split is in progress.
    1061             :  *
    1062             :  * In addition, the caller must have created the new bucket's base page,
    1063             :  * which is passed in buffer nbuf, pinned and write-locked.  The lock will be
     1064             :  * released here, and the pin must be released by the caller.  (The API is set up
    1065             :  * this way because we must do _hash_getnewbuf() before releasing the metapage
    1066             :  * write lock.  So instead of passing the new bucket's start block number, we
    1067             :  * pass an actual buffer.)
    1068             :  */
    1069             : static void
    1070         626 : _hash_splitbucket(Relation rel,
    1071             :                   Buffer metabuf,
    1072             :                   Bucket obucket,
    1073             :                   Bucket nbucket,
    1074             :                   Buffer obuf,
    1075             :                   Buffer nbuf,
    1076             :                   HTAB *htab,
    1077             :                   uint32 maxbucket,
    1078             :                   uint32 highmask,
    1079             :                   uint32 lowmask)
    1080             : {
    1081             :     Buffer      bucket_obuf;
    1082             :     Buffer      bucket_nbuf;
    1083             :     Page        opage;
    1084             :     Page        npage;
    1085             :     HashPageOpaque oopaque;
    1086             :     HashPageOpaque nopaque;
    1087             :     OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
    1088             :     IndexTuple  itups[MaxIndexTuplesPerPage];
    1089         626 :     Size        all_tups_size = 0;
    1090             :     int         i;
    1091         626 :     uint16      nitups = 0;
    1092             : 
    1093         626 :     bucket_obuf = obuf;
    1094         626 :     opage = BufferGetPage(obuf);
    1095         626 :     oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
    1096             : 
    1097         626 :     bucket_nbuf = nbuf;
    1098         626 :     npage = BufferGetPage(nbuf);
    1099         626 :     nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    1100             : 
    1101             :     /* Copy the predicate locks from old bucket to new bucket. */
    1102         626 :     PredicateLockPageSplit(rel,
    1103             :                            BufferGetBlockNumber(bucket_obuf),
    1104             :                            BufferGetBlockNumber(bucket_nbuf));
    1105             : 
    1106             :     /*
    1107             :      * Partition the tuples in the old bucket between the old bucket and the
    1108             :      * new bucket, advancing along the old bucket's overflow bucket chain and
     1109             :      * adding overflow pages to the new bucket as needed.  The outer loop
     1110             :      * iterates once per page in the old bucket.
    1111             :      */
    1112             :     for (;;)
    1113         220 :     {
    1114             :         BlockNumber oblkno;
    1115             :         OffsetNumber ooffnum;
    1116             :         OffsetNumber omaxoffnum;
    1117             : 
    1118             :         /* Scan each tuple in old page */
    1119         846 :         omaxoffnum = PageGetMaxOffsetNumber(opage);
    1120      187226 :         for (ooffnum = FirstOffsetNumber;
    1121             :              ooffnum <= omaxoffnum;
    1122      185534 :              ooffnum = OffsetNumberNext(ooffnum))
    1123             :         {
    1124             :             IndexTuple  itup;
    1125             :             Size        itemsz;
    1126             :             Bucket      bucket;
    1127      185534 :             bool        found = false;
    1128             : 
    1129             :             /* skip dead tuples */
    1130      185534 :             if (ItemIdIsDead(PageGetItemId(opage, ooffnum)))
    1131           0 :                 continue;
    1132             : 
    1133             :             /*
     1134             :              * Before inserting a tuple, probe the hash table containing TIDs
     1135             :              * of tuples belonging to the new bucket.  If we find a match, skip
     1136             :              * that tuple; otherwise fetch the item's hash key (conveniently
     1137             :              * stored in the item) and determine which bucket it now belongs
     1138             :              * in.
    1139             :              */
    1140      185534 :             itup = (IndexTuple) PageGetItem(opage,
    1141             :                                             PageGetItemId(opage, ooffnum));
    1142             : 
    1143      185534 :             if (htab)
    1144           0 :                 (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found);
    1145             : 
    1146      185534 :             if (found)
    1147           0 :                 continue;
    1148             : 
    1149      185534 :             bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
    1150             :                                           maxbucket, highmask, lowmask);
    1151             : 
    1152      185534 :             if (bucket == nbucket)
    1153             :             {
    1154             :                 IndexTuple  new_itup;
    1155             : 
    1156             :                 /*
    1157             :                  * make a copy of index tuple as we have to scribble on it.
    1158             :                  */
    1159       74224 :                 new_itup = CopyIndexTuple(itup);
    1160             : 
    1161             :                 /*
     1162             :                  * mark the index tuple as moved by split; such tuples are
     1163             :                  * skipped by scans if a split is in progress for the bucket.
    1164             :                  */
    1165       74224 :                 new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK;
    1166             : 
    1167             :                 /*
    1168             :                  * insert the tuple into the new bucket.  if it doesn't fit on
    1169             :                  * the current page in the new bucket, we must allocate a new
    1170             :                  * overflow page and place the tuple on that page instead.
    1171             :                  */
    1172       74224 :                 itemsz = IndexTupleSize(new_itup);
    1173       74224 :                 itemsz = MAXALIGN(itemsz);
    1174             : 
    1175       74224 :                 if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz))
    1176             :                 {
    1177             :                     /*
     1178             :                      * Change the shared buffer state in a critical section;
     1179             :                      * otherwise any error could leave it unrecoverable.
    1180             :                      */
    1181          52 :                     START_CRIT_SECTION();
    1182             : 
    1183          52 :                     _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
    1184          52 :                     MarkBufferDirty(nbuf);
    1185             :                     /* log the split operation before releasing the lock */
    1186          52 :                     log_split_page(rel, nbuf);
    1187             : 
    1188          52 :                     END_CRIT_SECTION();
    1189             : 
    1190             :                     /* drop lock, but keep pin */
    1191          52 :                     LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
    1192             : 
    1193             :                     /* be tidy */
    1194       21216 :                     for (i = 0; i < nitups; i++)
    1195       21164 :                         pfree(itups[i]);
    1196          52 :                     nitups = 0;
    1197          52 :                     all_tups_size = 0;
    1198             : 
    1199             :                     /* chain to a new overflow page */
    1200          52 :                     nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false);
    1201          52 :                     npage = BufferGetPage(nbuf);
    1202          52 :                     nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    1203             :                 }
    1204             : 
    1205       74224 :                 itups[nitups++] = new_itup;
    1206       74224 :                 all_tups_size += itemsz;
    1207             :             }
    1208             :             else
    1209             :             {
    1210             :                 /*
    1211             :                  * the tuple stays on this page, so nothing to do.
    1212             :                  */
    1213             :                 Assert(bucket == obucket);
    1214             :             }
    1215             :         }
    1216             : 
    1217         846 :         oblkno = oopaque->hasho_nextblkno;
    1218             : 
    1219             :         /* retain the pin on the old primary bucket */
    1220         846 :         if (obuf == bucket_obuf)
    1221         626 :             LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    1222             :         else
    1223         220 :             _hash_relbuf(rel, obuf);
    1224             : 
    1225             :         /* Exit loop if no more overflow pages in old bucket */
    1226         846 :         if (!BlockNumberIsValid(oblkno))
    1227             :         {
    1228             :             /*
     1229             :              * Change the shared buffer state in a critical section; otherwise
     1230             :              * any error could leave it unrecoverable.
    1231             :              */
    1232         626 :             START_CRIT_SECTION();
    1233             : 
    1234         626 :             _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
    1235         626 :             MarkBufferDirty(nbuf);
    1236             :             /* log the split operation before releasing the lock */
    1237         626 :             log_split_page(rel, nbuf);
    1238             : 
    1239         626 :             END_CRIT_SECTION();
    1240             : 
    1241         626 :             if (nbuf == bucket_nbuf)
    1242         622 :                 LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
    1243             :             else
    1244           4 :                 _hash_relbuf(rel, nbuf);
    1245             : 
    1246             :             /* be tidy */
    1247       53686 :             for (i = 0; i < nitups; i++)
    1248       53060 :                 pfree(itups[i]);
    1249         626 :             break;
    1250             :         }
    1251             : 
    1252             :         /* Else, advance to next old page */
    1253         220 :         obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE);
    1254         220 :         opage = BufferGetPage(obuf);
    1255         220 :         oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
    1256             :     }
    1257             : 
    1258             :     /*
    1259             :      * We're at the end of the old bucket chain, so we're done partitioning
     1260             :      * the tuples.  Mark the old and new buckets to indicate that the split is
     1261             :      * finished.
    1262             :      *
    1263             :      * To avoid deadlocks due to locking order of buckets, first lock the old
    1264             :      * bucket and then the new bucket.
    1265             :      */
    1266         626 :     LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE);
    1267         626 :     opage = BufferGetPage(bucket_obuf);
    1268         626 :     oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
    1269             : 
    1270         626 :     LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE);
    1271         626 :     npage = BufferGetPage(bucket_nbuf);
    1272         626 :     nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    1273             : 
    1274         626 :     START_CRIT_SECTION();
    1275             : 
    1276         626 :     oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT;
    1277         626 :     nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED;
    1278             : 
    1279             :     /*
    1280             :      * After the split is finished, mark the old bucket to indicate that it
     1281             :      * contains deletable tuples.  We will clear the split-cleanup flag after
     1282             :      * deleting such tuples, either at the end of this split, at the next split
     1283             :      * from the old bucket, or during vacuum.
    1284             :      */
    1285         626 :     oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP;
    1286             : 
    1287             :     /*
     1288             :      * now write the buffers; we don't release the locks here, as the caller
     1289             :      * is responsible for releasing them.
    1290             :      */
    1291         626 :     MarkBufferDirty(bucket_obuf);
    1292         626 :     MarkBufferDirty(bucket_nbuf);
    1293             : 
    1294         626 :     if (RelationNeedsWAL(rel))
    1295             :     {
    1296             :         XLogRecPtr  recptr;
    1297             :         xl_hash_split_complete xlrec;
    1298             : 
    1299         626 :         xlrec.old_bucket_flag = oopaque->hasho_flag;
    1300         626 :         xlrec.new_bucket_flag = nopaque->hasho_flag;
    1301             : 
    1302         626 :         XLogBeginInsert();
    1303             : 
    1304         626 :         XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete);
    1305             : 
    1306         626 :         XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD);
    1307         626 :         XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD);
    1308             : 
    1309         626 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE);
    1310             : 
    1311         626 :         PageSetLSN(BufferGetPage(bucket_obuf), recptr);
    1312         626 :         PageSetLSN(BufferGetPage(bucket_nbuf), recptr);
    1313             :     }
    1314             : 
    1315         626 :     END_CRIT_SECTION();
    1316             : 
    1317             :     /*
    1318             :      * If possible, clean up the old bucket.  We might not be able to do this
     1319             :      * if someone else has a pin on it, but if no one does, we can go ahead.  This
    1320             :      * isn't absolutely necessary, but it reduces bloat; if we don't do it
    1321             :      * now, VACUUM will do it eventually, but maybe not until new overflow
    1322             :      * pages have been allocated.  Note that there's no need to clean up the
    1323             :      * new bucket.
    1324             :      */
    1325         626 :     if (IsBufferCleanupOK(bucket_obuf))
    1326             :     {
    1327         626 :         LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
    1328         626 :         hashbucketcleanup(rel, obucket, bucket_obuf,
    1329             :                           BufferGetBlockNumber(bucket_obuf), NULL,
    1330             :                           maxbucket, highmask, lowmask, NULL, NULL, true,
    1331             :                           NULL, NULL);
    1332             :     }
    1333             :     else
    1334             :     {
    1335           0 :         LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
    1336           0 :         LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK);
    1337             :     }
    1338         626 : }
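
The partitioning above hinges on _hash_hashkey2bucket() (defined in hashutil.c,
not in this file): with the enlarged maxbucket and masks in effect, a tuple maps
to nbucket only if the wider mask selects a bucket that now exists.  Below is a
minimal sketch of that masking scheme, under the assumption that it matches the
real helper.

    /* Illustrative sketch of the expected masking scheme -- an assumption,
     * not part of the covered file. */
    static uint32
    sketch_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
                          uint32 highmask, uint32 lowmask)
    {
        uint32      bucket = hashkey & highmask;    /* try the larger table */

        if (bucket > maxbucket)
            bucket = bucket & lowmask;              /* bucket not created yet */

        return bucket;
    }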
    1339             : 
    1340             : /*
    1341             :  *  _hash_finish_split() -- Finish the previously interrupted split operation
    1342             :  *
     1343             :  * To complete the split operation, we build a hash table of the TIDs already
     1344             :  * present in the new bucket; the split operation then uses it to skip tuples
     1345             :  * that were already moved before the split was interrupted.
    1346             :  *
    1347             :  * The caller must hold a pin, but no lock, on the metapage and old bucket's
    1348             :  * primary page buffer.  The buffers are returned in the same state.  (The
    1349             :  * metapage is only touched if it becomes necessary to add or remove overflow
    1350             :  * pages.)
    1351             :  */
    1352             : void
    1353           0 : _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
    1354             :                    uint32 maxbucket, uint32 highmask, uint32 lowmask)
    1355             : {
    1356             :     HASHCTL     hash_ctl;
    1357             :     HTAB       *tidhtab;
    1358           0 :     Buffer      bucket_nbuf = InvalidBuffer;
    1359             :     Buffer      nbuf;
    1360             :     Page        npage;
    1361             :     BlockNumber nblkno;
    1362             :     BlockNumber bucket_nblkno;
    1363             :     HashPageOpaque npageopaque;
    1364             :     Bucket      nbucket;
    1365             :     bool        found;
    1366             : 
     1367             :     /* Initialize the hash table used to track TIDs */
    1368           0 :     memset(&hash_ctl, 0, sizeof(hash_ctl));
    1369           0 :     hash_ctl.keysize = sizeof(ItemPointerData);
    1370           0 :     hash_ctl.entrysize = sizeof(ItemPointerData);
    1371           0 :     hash_ctl.hcxt = CurrentMemoryContext;
    1372             : 
    1373           0 :     tidhtab =
    1374             :         hash_create("bucket ctids",
    1375             :                     256,        /* arbitrary initial size */
    1376             :                     &hash_ctl,
    1377             :                     HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
    1378             : 
    1379           0 :     bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket);
    1380             : 
    1381             :     /*
    1382             :      * Scan the new bucket and build hash table of TIDs
    1383             :      */
    1384             :     for (;;)
    1385           0 :     {
    1386             :         OffsetNumber noffnum;
    1387             :         OffsetNumber nmaxoffnum;
    1388             : 
    1389           0 :         nbuf = _hash_getbuf(rel, nblkno, HASH_READ,
    1390             :                             LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    1391             : 
     1392             :         /* remember the primary bucket buffer so we can acquire a cleanup lock on it. */
    1393           0 :         if (nblkno == bucket_nblkno)
    1394           0 :             bucket_nbuf = nbuf;
    1395             : 
    1396           0 :         npage = BufferGetPage(nbuf);
    1397           0 :         npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    1398             : 
    1399             :         /* Scan each tuple in new page */
    1400           0 :         nmaxoffnum = PageGetMaxOffsetNumber(npage);
    1401           0 :         for (noffnum = FirstOffsetNumber;
    1402             :              noffnum <= nmaxoffnum;
    1403           0 :              noffnum = OffsetNumberNext(noffnum))
    1404             :         {
    1405             :             IndexTuple  itup;
    1406             : 
    1407             :             /* Fetch the item's TID and insert it in hash table. */
    1408           0 :             itup = (IndexTuple) PageGetItem(npage,
    1409             :                                             PageGetItemId(npage, noffnum));
    1410             : 
    1411           0 :             (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found);
    1412             : 
    1413             :             Assert(!found);
    1414             :         }
    1415             : 
    1416           0 :         nblkno = npageopaque->hasho_nextblkno;
    1417             : 
    1418             :         /*
     1419             :          * release our lock without modifying the buffer, and make sure to
     1420             :          * retain the pin on the primary bucket.
    1421             :          */
    1422           0 :         if (nbuf == bucket_nbuf)
    1423           0 :             LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
    1424             :         else
    1425           0 :             _hash_relbuf(rel, nbuf);
    1426             : 
    1427             :         /* Exit loop if no more overflow pages in new bucket */
    1428           0 :         if (!BlockNumberIsValid(nblkno))
    1429           0 :             break;
    1430             :     }
    1431             : 
    1432             :     /*
    1433             :      * Conditionally get the cleanup lock on old and new buckets to perform
     1434             :      * the split operation.  If we don't get the cleanup locks, silently give
     1435             :      * up; the next insertion into the old bucket will try again to complete
     1436             :      * the split.
    1437             :      */
    1438           0 :     if (!ConditionalLockBufferForCleanup(obuf))
    1439             :     {
    1440           0 :         hash_destroy(tidhtab);
    1441           0 :         return;
    1442             :     }
    1443           0 :     if (!ConditionalLockBufferForCleanup(bucket_nbuf))
    1444             :     {
    1445           0 :         LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    1446           0 :         hash_destroy(tidhtab);
    1447           0 :         return;
    1448             :     }
    1449             : 
    1450           0 :     npage = BufferGetPage(bucket_nbuf);
    1451           0 :     npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    1452           0 :     nbucket = npageopaque->hasho_bucket;
    1453             : 
    1454           0 :     _hash_splitbucket(rel, metabuf, obucket,
    1455             :                       nbucket, obuf, bucket_nbuf, tidhtab,
    1456             :                       maxbucket, highmask, lowmask);
    1457             : 
    1458           0 :     _hash_dropbuf(rel, bucket_nbuf);
    1459           0 :     hash_destroy(tidhtab);
    1460             : }
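
The TID table above is an ordinary dynahash table keyed by ItemPointerData.  The
short sketch below pulls together the two probe modes involved: HASH_ENTER while
scanning the new bucket here, and HASH_FIND later inside _hash_splitbucket().
Variable names mirror the listing but the fragment itself is illustrative.

    /* Illustrative sketch -- not part of the covered file. */
    bool        found;

    /* While scanning the new bucket: record every TID already moved there. */
    (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found);
    Assert(!found);             /* each TID should be entered only once */

    /* Later, while re-scanning the old bucket: skip TIDs already moved. */
    (void) hash_search(tidhtab, &itup->t_tid, HASH_FIND, &found);
    if (found)
        continue;               /* already relocated before the interruption */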
    1461             : 
    1462             : /*
    1463             :  *  log_split_page() -- Log the split operation
    1464             :  *
     1465             :  *  We log the split operation when the new page in the new bucket gets full,
     1466             :  *  so we log the entire page image.
     1467             :  *
     1468             :  *  'buf' must be locked by the caller, which is also responsible for
     1469             :  *  unlocking it.
    1470             :  */
    1471             : static void
    1472         678 : log_split_page(Relation rel, Buffer buf)
    1473             : {
    1474         678 :     if (RelationNeedsWAL(rel))
    1475             :     {
    1476             :         XLogRecPtr  recptr;
    1477             : 
    1478         678 :         XLogBeginInsert();
    1479             : 
    1480         678 :         XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
    1481             : 
    1482         678 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE);
    1483             : 
    1484         678 :         PageSetLSN(BufferGetPage(buf), recptr);
    1485             :     }
    1486         678 : }
    1487             : 
    1488             : /*
    1489             :  *  _hash_getcachedmetap() -- Returns cached metapage data.
    1490             :  *
     1491             :  *  If metabuf is not InvalidBuffer, the caller must hold a pin, but no lock,
     1492             :  *  on the metapage.  If it is InvalidBuffer, we'll set it before returning if
     1493             :  *  we have to refresh the cache, and return with a pin but no lock on it; the
     1494             :  *  caller is responsible for releasing the pin.
    1495             :  *
    1496             :  *  We refresh the cache if it's not initialized yet or force_refresh is true.
    1497             :  */
    1498             : HashMetaPage
    1499      440608 : _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
    1500             : {
    1501             :     Page        page;
    1502             : 
    1503             :     Assert(metabuf);
    1504      440608 :     if (force_refresh || rel->rd_amcache == NULL)
    1505             :     {
    1506         712 :         char       *cache = NULL;
    1507             : 
    1508             :         /*
    1509             :          * It's important that we don't set rd_amcache to an invalid value.
    1510             :          * Either MemoryContextAlloc or _hash_getbuf could fail, so don't
    1511             :          * install a pointer to the newly-allocated storage in the actual
    1512             :          * relcache entry until both have succeeded.
    1513             :          */
    1514         712 :         if (rel->rd_amcache == NULL)
    1515         340 :             cache = MemoryContextAlloc(rel->rd_indexcxt,
    1516             :                                        sizeof(HashMetaPageData));
    1517             : 
    1518             :         /* Read the metapage. */
    1519         712 :         if (BufferIsValid(*metabuf))
    1520           2 :             LockBuffer(*metabuf, BUFFER_LOCK_SHARE);
    1521             :         else
    1522         710 :             *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
    1523             :                                     LH_META_PAGE);
    1524         712 :         page = BufferGetPage(*metabuf);
    1525             : 
    1526             :         /* Populate the cache. */
    1527         712 :         if (rel->rd_amcache == NULL)
    1528         340 :             rel->rd_amcache = cache;
    1529         712 :         memcpy(rel->rd_amcache, HashPageGetMeta(page),
    1530             :                sizeof(HashMetaPageData));
    1531             : 
    1532             :         /* Release metapage lock, but keep the pin. */
    1533         712 :         LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);
    1534             :     }
    1535             : 
    1536      440608 :     return (HashMetaPage) rel->rd_amcache;
    1537             : }
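
A hedged sketch of the expected caller pattern follows: pass InvalidBuffer, use
the cached metapage data, and drop the pin only if the call actually had to read
the metapage.  The fragment is illustrative, not an excerpt of the covered
source.

    /* Illustrative caller sketch -- not part of the covered file. */
    Buffer      metabuf = InvalidBuffer;
    HashMetaPage metap;

    metap = _hash_getcachedmetap(rel, &metabuf, false);
    /* ... consult metap->hashm_maxbucket, hashm_highmask, hashm_lowmask ... */

    if (BufferIsValid(metabuf))     /* set only if the metapage was read */
        _hash_dropbuf(rel, metabuf);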
    1538             : 
    1539             : /*
    1540             :  *  _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
    1541             :  *                                       hashkey.
    1542             :  *
     1543             :  *  Bucket pages do not move or get removed once they are allocated.  This
     1544             :  *  gives us an opportunity to use the previously saved metapage contents to
     1545             :  *  reach the target bucket buffer, instead of reading from the metapage every
     1546             :  *  time.  This saves one buffer access every time we want to reach the target
     1547             :  *  bucket buffer, which is a substantial saving in bufmgr traffic and contention.
    1548             :  *
    1549             :  *  The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
    1550             :  *  bucket buffer has to be locked for reading or writing.
    1551             :  *
     1552             :  *  The out parameter cachedmetap is set to the metapage contents used for the
     1553             :  *  hashkey-to-bucket-buffer mapping.  Some callers need this info to reach the
     1554             :  *  old bucket in case of a bucket split; see _hash_doinsert().
    1555             :  */
    1556             : Buffer
    1557      440234 : _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
    1558             :                                 HashMetaPage *cachedmetap)
    1559             : {
    1560             :     HashMetaPage metap;
    1561             :     Buffer      buf;
    1562      440234 :     Buffer      metabuf = InvalidBuffer;
    1563             :     Page        page;
    1564             :     Bucket      bucket;
    1565             :     BlockNumber blkno;
    1566             :     HashPageOpaque opaque;
    1567             : 
     1568             :     /* We read from the target bucket buffer, hence locking is a must. */
    1569             :     Assert(access == HASH_READ || access == HASH_WRITE);
    1570             : 
    1571      440234 :     metap = _hash_getcachedmetap(rel, &metabuf, false);
    1572             :     Assert(metap != NULL);
    1573             : 
    1574             :     /*
    1575             :      * Loop until we get a lock on the correct target bucket.
    1576             :      */
    1577             :     for (;;)
    1578             :     {
    1579             :         /*
    1580             :          * Compute the target bucket number, and convert to block number.
    1581             :          */
    1582      440974 :         bucket = _hash_hashkey2bucket(hashkey,
    1583             :                                       metap->hashm_maxbucket,
    1584             :                                       metap->hashm_highmask,
    1585             :                                       metap->hashm_lowmask);
    1586             : 
    1587      440604 :         blkno = BUCKET_TO_BLKNO(metap, bucket);
    1588             : 
    1589             :         /* Fetch the primary bucket page for the bucket */
    1590      440604 :         buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
    1591      440604 :         page = BufferGetPage(buf);
    1592      440604 :         opaque = (HashPageOpaque) PageGetSpecialPointer(page);
    1593             :         Assert(opaque->hasho_bucket == bucket);
    1594             :         Assert(opaque->hasho_prevblkno != InvalidBlockNumber);
    1595             : 
    1596             :         /*
    1597             :          * If this bucket hasn't been split, we're done.
    1598             :          */
    1599      440604 :         if (opaque->hasho_prevblkno <= metap->hashm_maxbucket)
    1600      440234 :             break;
    1601             : 
    1602             :         /* Drop lock on this buffer, update cached metapage, and retry. */
    1603         370 :         _hash_relbuf(rel, buf);
    1604         370 :         metap = _hash_getcachedmetap(rel, &metabuf, true);
    1605             :         Assert(metap != NULL);
    1606             :     }
    1607             : 
    1608      440234 :     if (BufferIsValid(metabuf))
    1609         710 :         _hash_dropbuf(rel, metabuf);
    1610             : 
    1611      440234 :     if (cachedmetap)
    1612      439932 :         *cachedmetap = metap;
    1613             : 
    1614      440234 :     return buf;
    1615             : }
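
A hedged sketch of how a caller such as _hash_doinsert() might use this function
and its cachedmetap out parameter; the surrounding names are assumptions rather
than an excerpt of the covered source.

    /* Illustrative caller sketch -- not part of the covered file. */
    HashMetaPage usedmetap = NULL;
    Buffer      buf;

    buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE, &usedmetap);

    /* ... insert into the bucket; if the bucket is flagged as being split,
     * usedmetap (maxbucket/highmask/lowmask) describes the mapping that was
     * used and can help locate the corresponding old bucket ... */

    _hash_relbuf(rel, buf);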

Generated by: LCOV version 1.13