LCOV - code coverage report
Current view: top level - src/backend/access/hash - hashpage.c (source / functions)
Test: PostgreSQL 14devel
Date: 2020-11-27 12:05:55
Coverage:   Lines: 383 / 469 (81.7 %)    Functions: 18 / 19 (94.7 %)
Legend: Lines: hit | not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * hashpage.c
       4             :  *    Hash table page management code for the Postgres hash access method
       5             :  *
       6             :  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/hash/hashpage.c
      12             :  *
      13             :  * NOTES
      14             :  *    Postgres hash pages look like ordinary relation pages.  The opaque
      15             :  *    data at high addresses includes information about the page including
      16             :  *    whether a page is an overflow page or a true bucket, the bucket
      17             :  *    number, and the block numbers of the preceding and following pages
      18             :  *    in the same bucket.
      19             :  *
      20             :  *    The first page in a hash relation, page zero, is special -- it stores
      21             :  *    information describing the hash table; it is referred to as the
      22             :  *    "meta page." Pages one and higher store the actual data.
      23             :  *
      24             :  *    There are also bitmap pages, which are not manipulated here;
      25             :  *    see hashovfl.c.
      26             :  *
      27             :  *-------------------------------------------------------------------------
      28             :  */
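To make the page layout described in the NOTES concrete, here is a minimal sketch (an editorial illustration, not part of hashpage.c; it assumes only the HashPageOpaque flag bits declared in access/hash.h) of how the opaque data at the high end of a page identifies the page type:

    #include "postgres.h"
    #include "access/hash.h"
    #include "storage/bufpage.h"

    /* Classify a hash index page from the flags in its special space. */
    static const char *
    hash_page_type(Page page)
    {
        HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        if (opaque->hasho_flag & LH_META_PAGE)
            return "meta page";            /* page zero only */
        if (opaque->hasho_flag & LH_BUCKET_PAGE)
            return "primary bucket page";
        if (opaque->hasho_flag & LH_OVERFLOW_PAGE)
            return "overflow page";
        if (opaque->hasho_flag & LH_BITMAP_PAGE)
            return "bitmap page";
        return "unused page";
    }
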
      29             : #include "postgres.h"
      30             : 
      31             : #include "access/hash.h"
      32             : #include "access/hash_xlog.h"
      33             : #include "miscadmin.h"
      34             : #include "port/pg_bitutils.h"
      35             : #include "storage/lmgr.h"
      36             : #include "storage/predicate.h"
      37             : #include "storage/smgr.h"
      38             : 
      39             : static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
      40             :                                 uint32 nblocks);
      41             : static void _hash_splitbucket(Relation rel, Buffer metabuf,
      42             :                               Bucket obucket, Bucket nbucket,
      43             :                               Buffer obuf,
      44             :                               Buffer nbuf,
      45             :                               HTAB *htab,
      46             :                               uint32 maxbucket,
      47             :                               uint32 highmask, uint32 lowmask);
      48             : static void log_split_page(Relation rel, Buffer buf);
      49             : 
      50             : 
      51             : /*
      52             :  *  _hash_getbuf() -- Get a buffer by block number for read or write.
      53             :  *
      54             :  *      'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.
      55             :  *      'flags' is a bitwise OR of the allowed page types.
      56             :  *
      57             :  *      This must be used only to fetch pages that are expected to be valid
      58             :  *      already.  _hash_checkpage() is applied using the given flags.
      59             :  *
      60             :  *      When this routine returns, the appropriate lock is set on the
      61             :  *      requested buffer and its reference count has been incremented
      62             :  *      (ie, the buffer is "locked and pinned").
      63             :  *
      64             :  *      P_NEW is disallowed because this routine can only be used
      65             :  *      to access pages that are known to be before the filesystem EOF.
      66             :  *      Extending the index should be done with _hash_getnewbuf.
      67             :  */
      68             : Buffer
      69     1140548 : _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
      70             : {
      71             :     Buffer      buf;
      72             : 
      73     1140548 :     if (blkno == P_NEW)
      74           0 :         elog(ERROR, "hash AM does not use P_NEW");
      75             : 
      76     1140548 :     buf = ReadBuffer(rel, blkno);
      77             : 
      78     1140548 :     if (access != HASH_NOLOCK)
      79      688804 :         LockBuffer(buf, access);
      80             : 
      81             :     /* ref count and lock type are correct */
      82             : 
      83     1140548 :     _hash_checkpage(rel, buf, flags);
      84             : 
      85     1140548 :     return buf;
      86             : }
      87             : 
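A caller-side sketch of the fetch/inspect/release pattern this function supports (illustrative only; the helper name peek_next_blkno is hypothetical, but the calls are the ones defined in this file and in access/hash.h):

    /* Read-lock a bucket page known to be valid, note its successor, release. */
    static BlockNumber
    peek_next_blkno(Relation rel, BlockNumber bucket_blkno)
    {
        Buffer      buf;
        HashPageOpaque opaque;
        BlockNumber next;

        buf = _hash_getbuf(rel, bucket_blkno, HASH_READ, LH_BUCKET_PAGE);
        opaque = (HashPageOpaque) PageGetSpecialPointer(BufferGetPage(buf));
        next = opaque->hasho_nextblkno;   /* next page in this bucket's chain */
        _hash_relbuf(rel, buf);           /* drops both the lock and the pin */

        return next;
    }
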
      88             : /*
      89             :  * _hash_getbuf_with_condlock_cleanup() -- Try to get a buffer for cleanup.
      90             :  *
      91             :  *      We read the page and try to acquire a cleanup lock.  If we get it,
      92             :  *      we return the buffer; otherwise, we return InvalidBuffer.
      93             :  */
      94             : Buffer
      95         644 : _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags)
      96             : {
      97             :     Buffer      buf;
      98             : 
      99         644 :     if (blkno == P_NEW)
     100           0 :         elog(ERROR, "hash AM does not use P_NEW");
     101             : 
     102         644 :     buf = ReadBuffer(rel, blkno);
     103             : 
     104         644 :     if (!ConditionalLockBufferForCleanup(buf))
     105             :     {
     106           0 :         ReleaseBuffer(buf);
     107           0 :         return InvalidBuffer;
     108             :     }
     109             : 
     110             :     /* ref count and lock type are correct */
     111             : 
     112         644 :     _hash_checkpage(rel, buf, flags);
     113             : 
     114         644 :     return buf;
     115             : }
     116             : 
     117             : /*
     118             :  *  _hash_getinitbuf() -- Get and initialize a buffer by block number.
     119             :  *
     120             :  *      This must be used only to fetch pages that are known to be before
     121             :  *      the index's filesystem EOF, but are to be filled from scratch.
     122             :  *      _hash_pageinit() is applied automatically.  Otherwise it has
     123             :  *      effects similar to _hash_getbuf() with access = HASH_WRITE.
     124             :  *
     125             :  *      When this routine returns, a write lock is set on the
     126             :  *      requested buffer and its reference count has been incremented
     127             :  *      (ie, the buffer is "locked and pinned").
     128             :  *
     129             :  *      P_NEW is disallowed because this routine can only be used
     130             :  *      to access pages that are known to be before the filesystem EOF.
     131             :  *      Extending the index should be done with _hash_getnewbuf.
     132             :  */
     133             : Buffer
     134          46 : _hash_getinitbuf(Relation rel, BlockNumber blkno)
     135             : {
     136             :     Buffer      buf;
     137             : 
     138          46 :     if (blkno == P_NEW)
     139           0 :         elog(ERROR, "hash AM does not use P_NEW");
     140             : 
     141          46 :     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
     142             :                              NULL);
     143             : 
     144             :     /* ref count and lock type are correct */
     145             : 
     146             :     /* initialize the page */
     147          46 :     _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
     148             : 
     149          46 :     return buf;
     150             : }
     151             : 
     152             : /*
     153             :  *  _hash_initbuf() -- Get and initialize a buffer by bucket number.
     154             :  */
     155             : void
     156        5276 : _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag,
     157             :               bool initpage)
     158             : {
     159             :     HashPageOpaque pageopaque;
     160             :     Page        page;
     161             : 
     162        5276 :     page = BufferGetPage(buf);
     163             : 
     164             :     /* initialize the page */
     165        5276 :     if (initpage)
     166           0 :         _hash_pageinit(page, BufferGetPageSize(buf));
     167             : 
     168        5276 :     pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
     169             : 
     170             :     /*
     171             :      * Set hasho_prevblkno with current hashm_maxbucket. This value will be
     172             :      * used to validate cached HashMetaPageData. See
     173             :      * _hash_getbucketbuf_from_hashkey().
     174             :      */
     175        5276 :     pageopaque->hasho_prevblkno = max_bucket;
     176        5276 :     pageopaque->hasho_nextblkno = InvalidBlockNumber;
     177        5276 :     pageopaque->hasho_bucket = num_bucket;
     178        5276 :     pageopaque->hasho_flag = flag;
     179        5276 :     pageopaque->hasho_page_id = HASHO_PAGE_ID;
     180        5276 : }
     181             : 
     182             : /*
     183             :  *  _hash_getnewbuf() -- Get a new page at the end of the index.
     184             :  *
     185             :  *      This has the same API as _hash_getinitbuf, except that we are adding
     186             :  *      a page to the index, and hence expect the page to be past the
     187             :  *      logical EOF.  (However, we have to support the case where it isn't,
     188             :  *      since a prior try might have crashed after extending the filesystem
     189             :  *      EOF but before updating the metapage to reflect the added page.)
     190             :  *
     191             :  *      It is caller's responsibility to ensure that only one process can
     192             :  *      extend the index at a time.  In practice, this function is called
     193             :  *      only while holding write lock on the metapage, because adding a page
     194             :  *      is always associated with an update of metapage data.
     195             :  */
     196             : Buffer
     197        6430 : _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
     198             : {
     199        6430 :     BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum);
     200             :     Buffer      buf;
     201             : 
     202        6430 :     if (blkno == P_NEW)
     203           0 :         elog(ERROR, "hash AM does not use P_NEW");
     204        6430 :     if (blkno > nblocks)
     205           0 :         elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
     206             :              RelationGetRelationName(rel));
     207             : 
     208             :     /* smgr insists we use P_NEW to extend the relation */
     209        6430 :     if (blkno == nblocks)
     210             :     {
     211        5786 :         buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL);
     212        5786 :         if (BufferGetBlockNumber(buf) != blkno)
     213           0 :             elog(ERROR, "unexpected hash relation size: %u, should be %u",
     214             :                  BufferGetBlockNumber(buf), blkno);
     215        5786 :         LockBuffer(buf, HASH_WRITE);
     216             :     }
     217             :     else
     218             :     {
     219         644 :         buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
     220             :                                  NULL);
     221             :     }
     222             : 
     223             :     /* ref count and lock type are correct */
     224             : 
     225             :     /* initialize the page */
     226        6430 :     _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
     227             : 
     228        6430 :     return buf;
     229             : }
     230             : 
     231             : /*
     232             :  *  _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
     233             :  *
     234             :  *      This is identical to _hash_getbuf() but also allows a buffer access
     235             :  *      strategy to be specified.  We use this for VACUUM operations.
     236             :  */
     237             : Buffer
     238         736 : _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
     239             :                            int access, int flags,
     240             :                            BufferAccessStrategy bstrategy)
     241             : {
     242             :     Buffer      buf;
     243             : 
     244         736 :     if (blkno == P_NEW)
     245           0 :         elog(ERROR, "hash AM does not use P_NEW");
     246             : 
     247         736 :     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
     248             : 
     249         736 :     if (access != HASH_NOLOCK)
     250         736 :         LockBuffer(buf, access);
     251             : 
     252             :     /* ref count and lock type are correct */
     253             : 
     254         736 :     _hash_checkpage(rel, buf, flags);
     255             : 
     256         736 :     return buf;
     257             : }
     258             : 
     259             : /*
     260             :  *  _hash_relbuf() -- release a locked buffer.
     261             :  *
     262             :  * Lock and pin (refcount) are both dropped.
     263             :  */
     264             : void
     265      660242 : _hash_relbuf(Relation rel, Buffer buf)
     266             : {
     267      660242 :     UnlockReleaseBuffer(buf);
     268      660242 : }
     269             : 
     270             : /*
     271             :  *  _hash_dropbuf() -- release an unlocked buffer.
     272             :  *
     273             :  * This is used to unpin a buffer on which we hold no lock.
     274             :  */
     275             : void
     276      488372 : _hash_dropbuf(Relation rel, Buffer buf)
     277             : {
     278      488372 :     ReleaseBuffer(buf);
     279      488372 : }
     280             : 
     281             : /*
     282             :  *  _hash_dropscanbuf() -- release buffers used in scan.
     283             :  *
     284             :  * This routine unpins the buffers used during scan on which we
     285             :  * hold no lock.
     286             :  */
     287             : void
     288         846 : _hash_dropscanbuf(Relation rel, HashScanOpaque so)
     289             : {
     290             :     /* release pin we hold on primary bucket page */
     291         846 :     if (BufferIsValid(so->hashso_bucket_buf) &&
     292         300 :         so->hashso_bucket_buf != so->currPos.buf)
     293          40 :         _hash_dropbuf(rel, so->hashso_bucket_buf);
     294         846 :     so->hashso_bucket_buf = InvalidBuffer;
     295             : 
      296             :     /* release pin we hold on primary bucket page of bucket being split */
     297         846 :     if (BufferIsValid(so->hashso_split_bucket_buf) &&
     298           0 :         so->hashso_split_bucket_buf != so->currPos.buf)
     299           0 :         _hash_dropbuf(rel, so->hashso_split_bucket_buf);
     300         846 :     so->hashso_split_bucket_buf = InvalidBuffer;
     301             : 
     302             :     /* release any pin we still hold */
     303         846 :     if (BufferIsValid(so->currPos.buf))
     304         260 :         _hash_dropbuf(rel, so->currPos.buf);
     305         846 :     so->currPos.buf = InvalidBuffer;
     306             : 
     307             :     /* reset split scan */
     308         846 :     so->hashso_buc_populated = false;
     309         846 :     so->hashso_buc_split = false;
     310         846 : }
     311             : 
     312             : 
     313             : /*
     314             :  *  _hash_init() -- Initialize the metadata page of a hash index,
     315             :  *              the initial buckets, and the initial bitmap page.
     316             :  *
     317             :  * The initial number of buckets is dependent on num_tuples, an estimate
     318             :  * of the number of tuples to be loaded into the index initially.  The
     319             :  * chosen number of buckets is returned.
     320             :  *
     321             :  * We are fairly cavalier about locking here, since we know that no one else
     322             :  * could be accessing this index.  In particular the rule about not holding
     323             :  * multiple buffer locks is ignored.
     324             :  */
     325             : uint32
     326         184 : _hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
     327             : {
     328             :     Buffer      metabuf;
     329             :     Buffer      buf;
     330             :     Buffer      bitmapbuf;
     331             :     Page        pg;
     332             :     HashMetaPage metap;
     333             :     RegProcedure procid;
     334             :     int32       data_width;
     335             :     int32       item_width;
     336             :     int32       ffactor;
     337             :     uint32      num_buckets;
     338             :     uint32      i;
     339             :     bool        use_wal;
     340             : 
     341             :     /* safety check */
     342         184 :     if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)
     343           0 :         elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
     344             :              RelationGetRelationName(rel));
     345             : 
     346             :     /*
     347             :      * WAL log creation of pages if the relation is persistent, or this is the
     348             :      * init fork.  Init forks for unlogged relations always need to be WAL
     349             :      * logged.
     350             :      */
     351         184 :     use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM;
     352             : 
     353             :     /*
     354             :      * Determine the target fill factor (in tuples per bucket) for this index.
     355             :      * The idea is to make the fill factor correspond to pages about as full
     356             :      * as the user-settable fillfactor parameter says.  We can compute it
     357             :      * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
     358             :      */
     359         184 :     data_width = sizeof(uint32);
     360         184 :     item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
     361             :         sizeof(ItemIdData);     /* include the line pointer */
     362         184 :     ffactor = HashGetTargetPageUsage(rel) / item_width;
     363             :     /* keep to a sane range */
     364         184 :     if (ffactor < 10)
     365           0 :         ffactor = 10;
     366             : 
     367         184 :     procid = index_getprocid(rel, 1, HASHSTANDARD_PROC);
     368             : 
     369             :     /*
     370             :      * We initialize the metapage, the first N bucket pages, and the first
     371             :      * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
     372             :      * calls to occur.  This ensures that the smgr level has the right idea of
     373             :      * the physical index length.
     374             :      *
     375             :      * Critical section not required, because on error the creation of the
     376             :      * whole relation will be rolled back.
     377             :      */
     378         184 :     metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
     379         184 :     _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false);
     380         184 :     MarkBufferDirty(metabuf);
     381             : 
     382         184 :     pg = BufferGetPage(metabuf);
     383         184 :     metap = HashPageGetMeta(pg);
     384             : 
     385             :     /* XLOG stuff */
     386         184 :     if (use_wal)
     387             :     {
     388             :         xl_hash_init_meta_page xlrec;
     389             :         XLogRecPtr  recptr;
     390             : 
     391         176 :         xlrec.num_tuples = num_tuples;
     392         176 :         xlrec.procid = metap->hashm_procid;
     393         176 :         xlrec.ffactor = metap->hashm_ffactor;
     394             : 
     395         176 :         XLogBeginInsert();
     396         176 :         XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage);
     397         176 :         XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
     398             : 
     399         176 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE);
     400             : 
     401         176 :         PageSetLSN(BufferGetPage(metabuf), recptr);
     402             :     }
     403             : 
     404         184 :     num_buckets = metap->hashm_maxbucket + 1;
     405             : 
     406             :     /*
     407             :      * Release buffer lock on the metapage while we initialize buckets.
     408             :      * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
     409             :      * won't accomplish anything.  It's a bad idea to hold buffer locks for
     410             :      * long intervals in any case, since that can block the bgwriter.
     411             :      */
     412         184 :     LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     413             : 
     414             :     /*
     415             :      * Initialize and WAL Log the first N buckets
     416             :      */
     417        5460 :     for (i = 0; i < num_buckets; i++)
     418             :     {
     419             :         BlockNumber blkno;
     420             : 
     421             :         /* Allow interrupts, in case N is huge */
     422        5276 :         CHECK_FOR_INTERRUPTS();
     423             : 
     424        5276 :         blkno = BUCKET_TO_BLKNO(metap, i);
     425        5276 :         buf = _hash_getnewbuf(rel, blkno, forkNum);
     426        5276 :         _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false);
     427        5276 :         MarkBufferDirty(buf);
     428             : 
     429        5276 :         if (use_wal)
     430       10424 :             log_newpage(&rel->rd_node,
     431             :                         forkNum,
     432             :                         blkno,
     433        5212 :                         BufferGetPage(buf),
     434             :                         true);
     435        5276 :         _hash_relbuf(rel, buf);
     436             :     }
     437             : 
     438             :     /* Now reacquire buffer lock on metapage */
     439         184 :     LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
     440             : 
     441             :     /*
     442             :      * Initialize bitmap page
     443             :      */
     444         184 :     bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum);
     445         184 :     _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false);
     446         184 :     MarkBufferDirty(bitmapbuf);
     447             : 
     448             :     /* add the new bitmap page to the metapage's list of bitmaps */
     449             :     /* metapage already has a write lock */
     450         184 :     if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
     451           0 :         ereport(ERROR,
     452             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     453             :                  errmsg("out of overflow pages in hash index \"%s\"",
     454             :                         RelationGetRelationName(rel))));
     455             : 
     456         184 :     metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1;
     457             : 
     458         184 :     metap->hashm_nmaps++;
     459         184 :     MarkBufferDirty(metabuf);
     460             : 
     461             :     /* XLOG stuff */
     462         184 :     if (use_wal)
     463             :     {
     464             :         xl_hash_init_bitmap_page xlrec;
     465             :         XLogRecPtr  recptr;
     466             : 
     467         176 :         xlrec.bmsize = metap->hashm_bmsize;
     468             : 
     469         176 :         XLogBeginInsert();
     470         176 :         XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage);
     471         176 :         XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT);
     472             : 
     473             :         /*
     474             :          * This is safe only because nobody else can be modifying the index at
     475             :          * this stage; it's only visible to the transaction that is creating
     476             :          * it.
     477             :          */
     478         176 :         XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
     479             : 
     480         176 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE);
     481             : 
     482         176 :         PageSetLSN(BufferGetPage(bitmapbuf), recptr);
     483         176 :         PageSetLSN(BufferGetPage(metabuf), recptr);
     484             :     }
     485             : 
     486             :     /* all done */
     487         184 :     _hash_relbuf(rel, bitmapbuf);
     488         184 :     _hash_relbuf(rel, metabuf);
     489             : 
     490         184 :     return num_buckets;
     491             : }
     492             : 
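As a concrete illustration of the layout _hash_init() builds (hypothetical numbers): with 1024 initial buckets, block 0 holds the metapage, blocks 1 through 1024 hold buckets 0 through 1023 (BUCKET_TO_BLKNO(metap, i) reduces to i + 1 while no overflow pages exist), and block 1025, i.e. num_buckets + 1, holds the first bitmap page registered in hashm_mapp[0] above.
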
     493             : /*
     494             :  *  _hash_init_metabuffer() -- Initialize the metadata page of a hash index.
     495             :  */
     496             : void
     497         184 : _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
     498             :                       uint16 ffactor, bool initpage)
     499             : {
     500             :     HashMetaPage metap;
     501             :     HashPageOpaque pageopaque;
     502             :     Page        page;
     503             :     double      dnumbuckets;
     504             :     uint32      num_buckets;
     505             :     uint32      spare_index;
     506             :     uint32      lshift;
     507             : 
     508             :     /*
     509             :      * Choose the number of initial bucket pages to match the fill factor
     510             :      * given the estimated number of tuples.  We round up the result to the
     511             :      * total number of buckets which has to be allocated before using its
     512             :      * hashm_spares element. However always force at least 2 bucket pages. The
     513             :      * upper limit is determined by considerations explained in
     514             :      * _hash_expandtable().
     515             :      */
     516         184 :     dnumbuckets = num_tuples / ffactor;
     517         184 :     if (dnumbuckets <= 2.0)
     518          20 :         num_buckets = 2;
     519         164 :     else if (dnumbuckets >= (double) 0x40000000)
     520           0 :         num_buckets = 0x40000000;
     521             :     else
     522         164 :         num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets));
     523             : 
     524         184 :     spare_index = _hash_spareindex(num_buckets);
     525             :     Assert(spare_index < HASH_MAX_SPLITPOINTS);
     526             : 
     527         184 :     page = BufferGetPage(buf);
     528         184 :     if (initpage)
     529           0 :         _hash_pageinit(page, BufferGetPageSize(buf));
     530             : 
     531         184 :     pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
     532         184 :     pageopaque->hasho_prevblkno = InvalidBlockNumber;
     533         184 :     pageopaque->hasho_nextblkno = InvalidBlockNumber;
     534         184 :     pageopaque->hasho_bucket = -1;
     535         184 :     pageopaque->hasho_flag = LH_META_PAGE;
     536         184 :     pageopaque->hasho_page_id = HASHO_PAGE_ID;
     537             : 
     538         184 :     metap = HashPageGetMeta(page);
     539             : 
     540         184 :     metap->hashm_magic = HASH_MAGIC;
     541         184 :     metap->hashm_version = HASH_VERSION;
     542         184 :     metap->hashm_ntuples = 0;
     543         184 :     metap->hashm_nmaps = 0;
     544         184 :     metap->hashm_ffactor = ffactor;
     545         184 :     metap->hashm_bsize = HashGetMaxBitmapSize(page);
     546             : 
     547             :     /* find largest bitmap array size that will fit in page size */
     548         184 :     lshift = pg_leftmost_one_pos32(metap->hashm_bsize);
     549             :     Assert(lshift > 0);
     550         184 :     metap->hashm_bmsize = 1 << lshift;
     551         184 :     metap->hashm_bmshift = lshift + BYTE_TO_BIT;
     552             :     Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));
     553             : 
     554             :     /*
     555             :      * Label the index with its primary hash support function's OID.  This is
     556             :      * pretty useless for normal operation (in fact, hashm_procid is not used
     557             :      * anywhere), but it might be handy for forensic purposes so we keep it.
     558             :      */
     559         184 :     metap->hashm_procid = procid;
     560             : 
     561             :     /*
     562             :      * We initialize the index with N buckets, 0 .. N-1, occupying physical
     563             :      * blocks 1 to N.  The first freespace bitmap page is in block N+1.
     564             :      */
     565         184 :     metap->hashm_maxbucket = num_buckets - 1;
     566             : 
     567             :     /*
     568             :      * Set highmask as next immediate ((2 ^ x) - 1), which should be
     569             :      * sufficient to cover num_buckets.
     570             :      */
     571         184 :     metap->hashm_highmask = pg_nextpower2_32(num_buckets + 1) - 1;
     572         184 :     metap->hashm_lowmask = (metap->hashm_highmask >> 1);
     573             : 
     574         184 :     MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
     575         184 :     MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
     576             : 
     577             :     /* Set up mapping for one spare page after the initial splitpoints */
     578         184 :     metap->hashm_spares[spare_index] = 1;
     579         184 :     metap->hashm_ovflpoint = spare_index;
     580         184 :     metap->hashm_firstfree = 0;
     581             : 
     582             :     /*
     583             :      * Set pd_lower just past the end of the metadata.  This is essential,
     584             :      * because without doing so, metadata will be lost if xlog.c compresses
     585             :      * the page.
     586             :      */
     587         184 :     ((PageHeader) page)->pd_lower =
     588         184 :         ((char *) metap + sizeof(HashMetaPageData)) - (char *) page;
     589         184 : }
     590             : 
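A worked example of the mask setup above (illustrative numbers, not taken from this report): for num_buckets = 1024, hashm_maxbucket = 1023, hashm_highmask = pg_nextpower2_32(1025) - 1 = 2047, and hashm_lowmask = 2047 >> 1 = 1023, so a hash value masked with highmask that exceeds maxbucket falls back to the lowmask range.
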
     591             : /*
     592             :  *  _hash_pageinit() -- Initialize a new hash index page.
     593             :  */
     594             : void
     595        6624 : _hash_pageinit(Page page, Size size)
     596             : {
     597        6624 :     PageInit(page, size, sizeof(HashPageOpaqueData));
     598        6624 : }
     599             : 
     600             : /*
     601             :  * Attempt to expand the hash table by creating one new bucket.
     602             :  *
     603             :  * This will silently do nothing if we don't get cleanup lock on old or
     604             :  * new bucket.
     605             :  *
     606             :  * Complete the pending splits and remove the tuples from old bucket,
     607             :  * if there are any left over from the previous split.
     608             :  *
     609             :  * The caller must hold a pin, but no lock, on the metapage buffer.
     610             :  * The buffer is returned in the same state.
     611             :  */
     612             : void
     613         644 : _hash_expandtable(Relation rel, Buffer metabuf)
     614             : {
     615             :     HashMetaPage metap;
     616             :     Bucket      old_bucket;
     617             :     Bucket      new_bucket;
     618             :     uint32      spare_ndx;
     619             :     BlockNumber start_oblkno;
     620             :     BlockNumber start_nblkno;
     621             :     Buffer      buf_nblkno;
     622             :     Buffer      buf_oblkno;
     623             :     Page        opage;
     624             :     Page        npage;
     625             :     HashPageOpaque oopaque;
     626             :     HashPageOpaque nopaque;
     627             :     uint32      maxbucket;
     628             :     uint32      highmask;
     629             :     uint32      lowmask;
     630         644 :     bool        metap_update_masks = false;
     631         644 :     bool        metap_update_splitpoint = false;
     632             : 
     633         644 : restart_expand:
     634             : 
     635             :     /*
     636             :      * Write-lock the meta page.  It used to be necessary to acquire a
     637             :      * heavyweight lock to begin a split, but that is no longer required.
     638             :      */
     639         644 :     LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
     640             : 
     641         644 :     _hash_checkpage(rel, metabuf, LH_META_PAGE);
     642         644 :     metap = HashPageGetMeta(BufferGetPage(metabuf));
     643             : 
     644             :     /*
     645             :      * Check to see if split is still needed; someone else might have already
     646             :      * done one while we waited for the lock.
     647             :      *
     648             :      * Make sure this stays in sync with _hash_doinsert()
     649             :      */
     650        1288 :     if (metap->hashm_ntuples <=
     651         644 :         (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
     652           0 :         goto fail;
     653             : 
     654             :     /*
     655             :      * Can't split anymore if maxbucket has reached its maximum possible
     656             :      * value.
     657             :      *
     658             :      * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
     659             :      * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
     660             :      * to half that to prevent failure of pg_ceil_log2_32() and insufficient
     661             :      * space in hashm_spares[].  It's moot anyway because an index with 2^32
     662             :      * buckets would certainly overflow BlockNumber and hence
     663             :      * _hash_alloc_buckets() would fail, but if we supported buckets smaller
     664             :      * than a disk block then this would be an independent constraint.
     665             :      *
     666             :      * If you change this, see also the maximum initial number of buckets in
     667             :      * _hash_init().
     668             :      */
     669         644 :     if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
     670           0 :         goto fail;
     671             : 
     672             :     /*
     673             :      * Determine which bucket is to be split, and attempt to take cleanup lock
     674             :      * on the old bucket.  If we can't get the lock, give up.
     675             :      *
     676             :      * The cleanup lock protects us not only against other backends, but
     677             :      * against our own backend as well.
     678             :      *
     679             :      * The cleanup lock is mainly to protect the split from concurrent
     680             :      * inserts. See src/backend/access/hash/README, Lock Definitions for
     681             :      * further details.  Due to this locking restriction, if there is any
      682             :      * pending scan, the split will give up, which is not good but harmless.
     683             :      */
     684         644 :     new_bucket = metap->hashm_maxbucket + 1;
     685             : 
     686         644 :     old_bucket = (new_bucket & metap->hashm_lowmask);
     687             : 
     688         644 :     start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
     689             : 
     690         644 :     buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE);
     691         644 :     if (!buf_oblkno)
     692           0 :         goto fail;
     693             : 
     694         644 :     opage = BufferGetPage(buf_oblkno);
     695         644 :     oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
     696             : 
     697             :     /*
      698             :      * We want to finish any pending split from the old bucket: there is no
      699             :      * apparent benefit in postponing it, and supporting splits that involve
      700             :      * multiple buckets (in case the new split also fails) would complicate
      701             :      * the code.  We don't need to consider the new bucket for completing the
      702             :      * split here, since a re-split of the new bucket cannot start while
      703             :      * there is still a pending split from the old bucket.
     704             :      */
     705         644 :     if (H_BUCKET_BEING_SPLIT(oopaque))
     706             :     {
     707             :         /*
      708             :          * Copy bucket mapping info now; refer to the comment in code below where
     709             :          * we copy this information before calling _hash_splitbucket to see
     710             :          * why this is okay.
     711             :          */
     712           0 :         maxbucket = metap->hashm_maxbucket;
     713           0 :         highmask = metap->hashm_highmask;
     714           0 :         lowmask = metap->hashm_lowmask;
     715             : 
     716             :         /*
     717             :          * Release the lock on metapage and old_bucket, before completing the
     718             :          * split.
     719             :          */
     720           0 :         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     721           0 :         LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK);
     722             : 
     723           0 :         _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket,
     724             :                            highmask, lowmask);
     725             : 
     726             :         /* release the pin on old buffer and retry for expand. */
     727           0 :         _hash_dropbuf(rel, buf_oblkno);
     728             : 
     729           0 :         goto restart_expand;
     730             :     }
     731             : 
     732             :     /*
      733             :      * Clean up the tuples remaining from the previous split.  This operation
      734             :      * requires a cleanup lock, and we already have one on the old bucket, so
      735             :      * let's do it.  We also don't want to allow further splits from the
      736             :      * bucket until the garbage from the previous split is cleaned.  This has
      737             :      * two advantages: first, it helps avoid bloat due to garbage; and
      738             :      * second, during cleanup of a bucket we can always be sure that the
      739             :      * garbage tuples belong to the most recently split bucket.  By contrast,
      740             :      * if we allowed cleanup of a bucket after the meta page is updated to
      741             :      * indicate the new split but before the actual split, the cleanup
      742             :      * operation could not decide whether a tuple had been moved to the
      743             :      * newly created bucket, and would end up deleting such tuples.
     744             :      */
     745         644 :     if (H_NEEDS_SPLIT_CLEANUP(oopaque))
     746             :     {
     747             :         /*
     748             :          * Copy bucket mapping info now; refer to the comment in code below
     749             :          * where we copy this information before calling _hash_splitbucket to
     750             :          * see why this is okay.
     751             :          */
     752           0 :         maxbucket = metap->hashm_maxbucket;
     753           0 :         highmask = metap->hashm_highmask;
     754           0 :         lowmask = metap->hashm_lowmask;
     755             : 
     756             :         /* Release the metapage lock. */
     757           0 :         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     758             : 
     759           0 :         hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL,
     760             :                           maxbucket, highmask, lowmask, NULL, NULL, true,
     761             :                           NULL, NULL);
     762             : 
     763           0 :         _hash_dropbuf(rel, buf_oblkno);
     764             : 
     765           0 :         goto restart_expand;
     766             :     }
     767             : 
     768             :     /*
     769             :      * There shouldn't be any active scan on new bucket.
     770             :      *
     771             :      * Note: it is safe to compute the new bucket's blkno here, even though we
     772             :      * may still need to update the BUCKET_TO_BLKNO mapping.  This is because
     773             :      * the current value of hashm_spares[hashm_ovflpoint] correctly shows
     774             :      * where we are going to put a new splitpoint's worth of buckets.
     775             :      */
     776         644 :     start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
     777             : 
     778             :     /*
     779             :      * If the split point is increasing we need to allocate a new batch of
     780             :      * bucket pages.
     781             :      */
     782         644 :     spare_ndx = _hash_spareindex(new_bucket + 1);
     783         644 :     if (spare_ndx > metap->hashm_ovflpoint)
     784             :     {
     785             :         uint32      buckets_to_add;
     786             : 
     787             :         Assert(spare_ndx == metap->hashm_ovflpoint + 1);
     788             : 
     789             :         /*
     790             :          * We treat allocation of buckets as a separate WAL-logged action.
      791             :          * Even if we fail after this operation, we won't leak bucket pages;
     792             :          * rather, the next split will consume this space. In any case, even
     793             :          * without failure we don't use all the space in one split operation.
     794             :          */
     795          36 :         buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket;
     796          36 :         if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add))
     797             :         {
     798             :             /* can't split due to BlockNumber overflow */
     799           0 :             _hash_relbuf(rel, buf_oblkno);
     800           0 :             goto fail;
     801             :         }
     802             :     }
     803             : 
     804             :     /*
     805             :      * Physically allocate the new bucket's primary page.  We want to do this
     806             :      * before changing the metapage's mapping info, in case we can't get the
     807             :      * disk space.  Ideally, we don't need to check for cleanup lock on new
     808             :      * bucket as no other backend could find this bucket unless meta page is
     809             :      * updated.  However, it is good to be consistent with old bucket locking.
     810             :      */
     811         644 :     buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM);
     812         644 :     if (!IsBufferCleanupOK(buf_nblkno))
     813             :     {
     814           0 :         _hash_relbuf(rel, buf_oblkno);
     815           0 :         _hash_relbuf(rel, buf_nblkno);
     816           0 :         goto fail;
     817             :     }
     818             : 
     819             :     /*
     820             :      * Since we are scribbling on the pages in the shared buffers, establish a
     821             :      * critical section.  Any failure in this next code leaves us with a big
     822             :      * problem: the metapage is effectively corrupt but could get written back
     823             :      * to disk.
     824             :      */
     825         644 :     START_CRIT_SECTION();
     826             : 
     827             :     /*
     828             :      * Okay to proceed with split.  Update the metapage bucket mapping info.
     829             :      */
     830         644 :     metap->hashm_maxbucket = new_bucket;
     831             : 
     832         644 :     if (new_bucket > metap->hashm_highmask)
     833             :     {
     834             :         /* Starting a new doubling */
     835          14 :         metap->hashm_lowmask = metap->hashm_highmask;
     836          14 :         metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
     837          14 :         metap_update_masks = true;
     838             :     }
     839             : 
     840             :     /*
     841             :      * If the split point is increasing we need to adjust the hashm_spares[]
     842             :      * array and hashm_ovflpoint so that future overflow pages will be created
     843             :      * beyond this new batch of bucket pages.
     844             :      */
     845         644 :     if (spare_ndx > metap->hashm_ovflpoint)
     846             :     {
     847          36 :         metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
     848          36 :         metap->hashm_ovflpoint = spare_ndx;
     849          36 :         metap_update_splitpoint = true;
     850             :     }
     851             : 
     852         644 :     MarkBufferDirty(metabuf);
     853             : 
     854             :     /*
     855             :      * Copy bucket mapping info now; this saves re-accessing the meta page
     856             :      * inside _hash_splitbucket's inner loop.  Note that once we drop the
     857             :      * split lock, other splits could begin, so these values might be out of
     858             :      * date before _hash_splitbucket finishes.  That's okay, since all it
     859             :      * needs is to tell which of these two buckets to map hashkeys into.
     860             :      */
     861         644 :     maxbucket = metap->hashm_maxbucket;
     862         644 :     highmask = metap->hashm_highmask;
     863         644 :     lowmask = metap->hashm_lowmask;
     864             : 
     865         644 :     opage = BufferGetPage(buf_oblkno);
     866         644 :     oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
     867             : 
     868             :     /*
     869             :      * Mark the old bucket to indicate that split is in progress.  (At
     870             :      * operation end, we will clear the split-in-progress flag.)  Also, for a
     871             :      * primary bucket page, hasho_prevblkno stores the number of buckets that
     872             :      * existed as of the last split, so we must update that value here.
     873             :      */
     874         644 :     oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
     875         644 :     oopaque->hasho_prevblkno = maxbucket;
     876             : 
     877         644 :     MarkBufferDirty(buf_oblkno);
     878             : 
     879         644 :     npage = BufferGetPage(buf_nblkno);
     880             : 
     881             :     /*
     882             :      * initialize the new bucket's primary page and mark it to indicate that
     883             :      * split is in progress.
     884             :      */
     885         644 :     nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
     886         644 :     nopaque->hasho_prevblkno = maxbucket;
     887         644 :     nopaque->hasho_nextblkno = InvalidBlockNumber;
     888         644 :     nopaque->hasho_bucket = new_bucket;
     889         644 :     nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
     890         644 :     nopaque->hasho_page_id = HASHO_PAGE_ID;
     891             : 
     892         644 :     MarkBufferDirty(buf_nblkno);
     893             : 
     894             :     /* XLOG stuff */
     895         644 :     if (RelationNeedsWAL(rel))
     896             :     {
     897             :         xl_hash_split_allocate_page xlrec;
     898             :         XLogRecPtr  recptr;
     899             : 
     900         644 :         xlrec.new_bucket = maxbucket;
     901         644 :         xlrec.old_bucket_flag = oopaque->hasho_flag;
     902         644 :         xlrec.new_bucket_flag = nopaque->hasho_flag;
     903         644 :         xlrec.flags = 0;
     904             : 
     905         644 :         XLogBeginInsert();
     906             : 
     907         644 :         XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD);
     908         644 :         XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT);
     909         644 :         XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD);
     910             : 
     911         644 :         if (metap_update_masks)
     912             :         {
     913          14 :             xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS;
     914          14 :             XLogRegisterBufData(2, (char *) &metap->hashm_lowmask, sizeof(uint32));
     915          14 :             XLogRegisterBufData(2, (char *) &metap->hashm_highmask, sizeof(uint32));
     916             :         }
     917             : 
     918         644 :         if (metap_update_splitpoint)
     919             :         {
     920          36 :             xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT;
     921          36 :             XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint,
     922             :                                 sizeof(uint32));
     923          36 :             XLogRegisterBufData(2,
     924          36 :                                 (char *) &metap->hashm_spares[metap->hashm_ovflpoint],
     925             :                                 sizeof(uint32));
     926             :         }
     927             : 
     928         644 :         XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage);
     929             : 
     930         644 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE);
     931             : 
     932         644 :         PageSetLSN(BufferGetPage(buf_oblkno), recptr);
     933         644 :         PageSetLSN(BufferGetPage(buf_nblkno), recptr);
     934         644 :         PageSetLSN(BufferGetPage(metabuf), recptr);
     935             :     }
     936             : 
     937         644 :     END_CRIT_SECTION();
     938             : 
     939             :     /* drop lock, but keep pin */
     940         644 :     LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     941             : 
     942             :     /* Relocate records to the new bucket */
     943         644 :     _hash_splitbucket(rel, metabuf,
     944             :                       old_bucket, new_bucket,
     945             :                       buf_oblkno, buf_nblkno, NULL,
     946             :                       maxbucket, highmask, lowmask);
     947             : 
     948             :     /* all done, now release the pins on primary buckets. */
     949         644 :     _hash_dropbuf(rel, buf_oblkno);
     950         644 :     _hash_dropbuf(rel, buf_nblkno);
     951             : 
     952         644 :     return;
     953             : 
     954             :     /* Here if decide not to split or fail to acquire old bucket lock */
     955           0 : fail:
     956             : 
     957             :     /* We didn't write the metapage, so just drop lock */
     958           0 :     LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     959             : }
     960             : 
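The maxbucket/highmask/lowmask snapshot taken above is all that is needed to decide which of the two buckets a tuple belongs to; the mapping works along these lines (a sketch mirroring _hash_hashkey2bucket() in hashutil.c, with a hypothetical name):

    /* Route a hash value to a bucket, given a snapshot of the split state. */
    static Bucket
    map_hashkey_to_bucket(uint32 hashkey, uint32 maxbucket,
                          uint32 highmask, uint32 lowmask)
    {
        Bucket      bucket = hashkey & highmask;

        /* Past the last existing bucket?  Then it maps to the unsplit bucket. */
        if (bucket > maxbucket)
            bucket = bucket & lowmask;

        return bucket;
    }
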
     961             : 
     962             : /*
     963             :  * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
     964             :  *
     965             :  * This does not need to initialize the new bucket pages; we'll do that as
     966             :  * each one is used by _hash_expandtable().  But we have to extend the logical
     967             :  * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
     968             :  * sync with ours, so that we don't get complaints from smgr.
     969             :  *
     970             :  * We do this by writing a page of zeroes at the end of the splitpoint range.
     971             :  * We expect that the filesystem will ensure that the intervening pages read
     972             :  * as zeroes too.  On many filesystems this "hole" will not be allocated
     973             :  * immediately, which means that the index file may end up more fragmented
     974             :  * than if we forced it all to be allocated now; but since we don't scan
     975             :  * hash indexes sequentially anyway, that probably doesn't matter.
     976             :  *
     977             :  * XXX It's annoying that this code is executed with the metapage lock held.
     978             :  * We need to interlock against _hash_addovflpage() adding a new overflow page
     979             :  * concurrently, but it'd likely be better to use LockRelationForExtension
     980             :  * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
     981             :  * so it may not be worth worrying about.
     982             :  *
     983             :  * Returns true if successful, or false if allocation failed due to
     984             :  * BlockNumber overflow.
     985             :  */
     986             : static bool
     987          36 : _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
     988             : {
     989             :     BlockNumber lastblock;
     990             :     PGAlignedBlock zerobuf;
     991             :     Page        page;
     992             :     HashPageOpaque ovflopaque;
     993             : 
     994          36 :     lastblock = firstblock + nblocks - 1;
     995             : 
     996             :     /*
     997             :      * Check for overflow in block number calculation; if so, we cannot extend
     998             :      * the index anymore.
     999             :      */
    1000          36 :     if (lastblock < firstblock || lastblock == InvalidBlockNumber)
    1001           0 :         return false;
    1002             : 
    1003          36 :     page = (Page) zerobuf.data;
    1004             : 
    1005             :     /*
    1006             :      * Initialize the page.  Just zeroing the page won't work; see
    1007             :      * _hash_freeovflpage for similar usage.  We take care to make the special
    1008             :      * space valid for the benefit of tools such as pageinspect.
    1009             :      */
    1010          36 :     _hash_pageinit(page, BLCKSZ);
    1011             : 
    1012          36 :     ovflopaque = (HashPageOpaque) PageGetSpecialPointer(page);
    1013             : 
    1014          36 :     ovflopaque->hasho_prevblkno = InvalidBlockNumber;
    1015          36 :     ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    1016          36 :     ovflopaque->hasho_bucket = -1;
    1017          36 :     ovflopaque->hasho_flag = LH_UNUSED_PAGE;
    1018          36 :     ovflopaque->hasho_page_id = HASHO_PAGE_ID;
    1019             : 
    1020          36 :     if (RelationNeedsWAL(rel))
    1021          36 :         log_newpage(&rel->rd_node,
    1022             :                     MAIN_FORKNUM,
    1023             :                     lastblock,
    1024             :                     zerobuf.data,
    1025             :                     true);
    1026             : 
    1027          36 :     RelationOpenSmgr(rel);
    1028          36 :     PageSetChecksumInplace(page, lastblock);
    1029          36 :     smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf.data, false);
    1030             : 
    1031          36 :     return true;
    1032             : }
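/*
 * Editorial illustration (not part of hashpage.c): a simplified, hedged
 * sketch of how a caller such as _hash_expandtable() might invoke
 * _hash_alloc_buckets() when a split crosses into a new splitpoint.  The
 * local names (new_bucket, start_nblkno, spare_ndx, metap) and the "fail"
 * label are assumptions of this sketch; the authoritative logic lives in
 * the real caller earlier in this file.
 */
#ifdef HASH_ALLOC_BUCKETS_USAGE_SKETCH
	spare_ndx = _hash_spareindex(new_bucket + 1);
	if (spare_ndx > metap->hashm_ovflpoint)
	{
		uint32		buckets_to_add;

		/*
		 * Extend the file by the new splitpoint's worth of bucket pages
		 * before the metapage is updated to advertise them.
		 */
		buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket;
		if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add))
			goto fail;			/* BlockNumber overflow: abandon the split */
	}
#endif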
    1033             : 
    1034             : 
    1035             : /*
    1036             :  * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
    1037             :  *
    1038             :  * This routine partitions the tuples between the old and new bucket, and is
    1039             :  * also used to finish an incomplete split operation.  To finish a previously
    1040             :  * interrupted split operation, the caller must fill htab.  If htab is set,
    1041             :  * we skip moving any tuple that already exists in htab; otherwise, a NULL
    1042             :  * htab means that all of the tuples belonging to the new bucket are to be
    1043             :  * moved.
    1044             :  *
    1045             :  * We are splitting a bucket that consists of a base bucket page and zero
    1046             :  * or more overflow (bucket chain) pages.  We must relocate tuples that
    1047             :  * belong in the new bucket.
    1048             :  *
    1049             :  * The caller must hold cleanup locks on both buckets to ensure that
    1050             :  * no one else is trying to access them (see README).
    1051             :  *
    1052             :  * The caller must hold a pin, but no lock, on the metapage buffer.
    1053             :  * The buffer is returned in the same state.  (The metapage is only
    1054             :  * touched if it becomes necessary to add or remove overflow pages.)
    1055             :  *
    1056             :  * The split needs to retain pins on the primary bucket pages of both the old
    1057             :  * and new buckets until the end of the operation, to prevent vacuum from
    1058             :  * starting while a split is in progress.
    1059             :  *
    1060             :  * In addition, the caller must have created the new bucket's base page,
    1061             :  * which is passed in buffer nbuf, pinned and write-locked.  The lock will be
    1062             :  * released here and pin must be released by the caller.  (The API is set up
    1063             :  * this way because we must do _hash_getnewbuf() before releasing the metapage
    1064             :  * write lock.  So instead of passing the new bucket's start block number, we
    1065             :  * pass an actual buffer.)
    1066             :  */
    1067             : static void
    1068         644 : _hash_splitbucket(Relation rel,
    1069             :                   Buffer metabuf,
    1070             :                   Bucket obucket,
    1071             :                   Bucket nbucket,
    1072             :                   Buffer obuf,
    1073             :                   Buffer nbuf,
    1074             :                   HTAB *htab,
    1075             :                   uint32 maxbucket,
    1076             :                   uint32 highmask,
    1077             :                   uint32 lowmask)
    1078             : {
    1079             :     Buffer      bucket_obuf;
    1080             :     Buffer      bucket_nbuf;
    1081             :     Page        opage;
    1082             :     Page        npage;
    1083             :     HashPageOpaque oopaque;
    1084             :     HashPageOpaque nopaque;
    1085             :     OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
    1086             :     IndexTuple  itups[MaxIndexTuplesPerPage];
    1087         644 :     Size        all_tups_size = 0;
    1088             :     int         i;
    1089         644 :     uint16      nitups = 0;
    1090             : 
    1091         644 :     bucket_obuf = obuf;
    1092         644 :     opage = BufferGetPage(obuf);
    1093         644 :     oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
    1094             : 
    1095         644 :     bucket_nbuf = nbuf;
    1096         644 :     npage = BufferGetPage(nbuf);
    1097         644 :     nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    1098             : 
    1099             :     /* Copy the predicate locks from old bucket to new bucket. */
    1100         644 :     PredicateLockPageSplit(rel,
    1101             :                            BufferGetBlockNumber(bucket_obuf),
    1102             :                            BufferGetBlockNumber(bucket_nbuf));
    1103             : 
    1104             :     /*
    1105             :      * Partition the tuples in the old bucket between the old bucket and the
    1106             :      * new bucket, advancing along the old bucket's overflow bucket chain and
    1107             :      * adding overflow pages to the new bucket as needed.  Outer loop iterates
    1108             :      * once per page in old bucket.
    1109             :      */
    1110             :     for (;;)
    1111         226 :     {
    1112             :         BlockNumber oblkno;
    1113             :         OffsetNumber ooffnum;
    1114             :         OffsetNumber omaxoffnum;
    1115             : 
    1116             :         /* Scan each tuple in old page */
    1117         870 :         omaxoffnum = PageGetMaxOffsetNumber(opage);
    1118      193330 :         for (ooffnum = FirstOffsetNumber;
    1119             :              ooffnum <= omaxoffnum;
    1120      192460 :              ooffnum = OffsetNumberNext(ooffnum))
    1121             :         {
    1122             :             IndexTuple  itup;
    1123             :             Size        itemsz;
    1124             :             Bucket      bucket;
    1125      192460 :             bool        found = false;
    1126             : 
    1127             :             /* skip dead tuples */
    1128      192460 :             if (ItemIdIsDead(PageGetItemId(opage, ooffnum)))
    1129           0 :                 continue;
    1130             : 
    1131             :             /*
    1132             :              * Before inserting a tuple, probe the hash table containing TIDs
    1133             :              * of tuples belonging to the new bucket; if we find a match, skip
    1134             :              * that tuple.  Otherwise, fetch the item's hash key (conveniently
    1135             :              * stored in the item) and determine which bucket it now belongs
    1136             :              * in.
    1137             :              */
    1138      192460 :             itup = (IndexTuple) PageGetItem(opage,
    1139             :                                             PageGetItemId(opage, ooffnum));
    1140             : 
    1141      192460 :             if (htab)
    1142           0 :                 (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found);
    1143             : 
    1144      192460 :             if (found)
    1145           0 :                 continue;
    1146             : 
    1147      192460 :             bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
    1148             :                                           maxbucket, highmask, lowmask);
    1149             : 
    1150      192460 :             if (bucket == nbucket)
    1151             :             {
    1152             :                 IndexTuple  new_itup;
    1153             : 
    1154             :                 /*
    1155             :                  * make a copy of index tuple as we have to scribble on it.
    1156             :                  */
    1157       77684 :                 new_itup = CopyIndexTuple(itup);
    1158             : 
    1159             :                 /*
    1160             :                  * mark the index tuple as moved by split; such tuples are
    1161             :                  * skipped by scans while a split is in progress for the bucket.
    1162             :                  */
    1163       77684 :                 new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK;
    1164             : 
    1165             :                 /*
    1166             :                  * insert the tuple into the new bucket.  if it doesn't fit on
    1167             :                  * the current page in the new bucket, we must allocate a new
    1168             :                  * overflow page and place the tuple on that page instead.
    1169             :                  */
    1170       77684 :                 itemsz = IndexTupleSize(new_itup);
    1171       77684 :                 itemsz = MAXALIGN(itemsz);
    1172             : 
    1173       77684 :                 if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz))
    1174             :                 {
    1175             :                     /*
    1176             :                      * Change the shared buffer state in a critical section;
    1177             :                      * otherwise any error could make it unrecoverable.
    1178             :                      */
    1179          52 :                     START_CRIT_SECTION();
    1180             : 
    1181          52 :                     _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
    1182          52 :                     MarkBufferDirty(nbuf);
    1183             :                     /* log the split operation before releasing the lock */
    1184          52 :                     log_split_page(rel, nbuf);
    1185             : 
    1186          52 :                     END_CRIT_SECTION();
    1187             : 
    1188             :                     /* drop lock, but keep pin */
    1189          52 :                     LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
    1190             : 
    1191             :                     /* be tidy */
    1192       21216 :                     for (i = 0; i < nitups; i++)
    1193       21164 :                         pfree(itups[i]);
    1194          52 :                     nitups = 0;
    1195          52 :                     all_tups_size = 0;
    1196             : 
    1197             :                     /* chain to a new overflow page */
    1198          52 :                     nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false);
    1199          52 :                     npage = BufferGetPage(nbuf);
    1200          52 :                     nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    1201             :                 }
    1202             : 
    1203       77684 :                 itups[nitups++] = new_itup;
    1204       77684 :                 all_tups_size += itemsz;
    1205             :             }
    1206             :             else
    1207             :             {
    1208             :                 /*
    1209             :                  * the tuple stays on this page, so nothing to do.
    1210             :                  */
    1211             :                 Assert(bucket == obucket);
    1212             :             }
    1213             :         }
    1214             : 
    1215         870 :         oblkno = oopaque->hasho_nextblkno;
    1216             : 
    1217             :         /* retain the pin on the old primary bucket */
    1218         870 :         if (obuf == bucket_obuf)
    1219         644 :             LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    1220             :         else
    1221         226 :             _hash_relbuf(rel, obuf);
    1222             : 
    1223             :         /* Exit loop if no more overflow pages in old bucket */
    1224         870 :         if (!BlockNumberIsValid(oblkno))
    1225             :         {
    1226             :             /*
    1227             :              * Change the shared buffer state in critical section, otherwise
    1228             :              * any error could make it unrecoverable.
    1229             :              */
    1230         644 :             START_CRIT_SECTION();
    1231             : 
    1232         644 :             _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
    1233         644 :             MarkBufferDirty(nbuf);
    1234             :             /* log the split operation before releasing the lock */
    1235         644 :             log_split_page(rel, nbuf);
    1236             : 
    1237         644 :             END_CRIT_SECTION();
    1238             : 
    1239         644 :             if (nbuf == bucket_nbuf)
    1240         640 :                 LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
    1241             :             else
    1242           4 :                 _hash_relbuf(rel, nbuf);
    1243             : 
    1244             :             /* be tidy */
    1245       57164 :             for (i = 0; i < nitups; i++)
    1246       56520 :                 pfree(itups[i]);
    1247         644 :             break;
    1248             :         }
    1249             : 
    1250             :         /* Else, advance to next old page */
    1251         226 :         obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE);
    1252         226 :         opage = BufferGetPage(obuf);
    1253         226 :         oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
    1254             :     }
    1255             : 
    1256             :     /*
    1257             :      * We're at the end of the old bucket chain, so we're done partitioning
    1258             :      * the tuples.  Mark the old and new buckets to indicate split is
    1259             :      * finished.
    1260             :      *
    1261             :      * To avoid deadlocks due to locking order of buckets, first lock the old
    1262             :      * bucket and then the new bucket.
    1263             :      */
    1264         644 :     LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE);
    1265         644 :     opage = BufferGetPage(bucket_obuf);
    1266         644 :     oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
    1267             : 
    1268         644 :     LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE);
    1269         644 :     npage = BufferGetPage(bucket_nbuf);
    1270         644 :     nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    1271             : 
    1272         644 :     START_CRIT_SECTION();
    1273             : 
    1274         644 :     oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT;
    1275         644 :     nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED;
    1276             : 
    1277             :     /*
    1278             :      * After the split is finished, mark the old bucket to indicate that it
    1279             :      * contains deletable tuples.  We will clear split-cleanup flag after
    1280             :      * deleting such tuples either at the end of split or at the next split
    1281             :      * from old bucket or at the time of vacuum.
    1282             :      */
    1283         644 :     oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP;
    1284             : 
    1285             :     /*
    1286             :      * now write the buffers; we don't release the locks here, as the caller
    1287             :      * is responsible for releasing them.
    1288             :      */
    1289         644 :     MarkBufferDirty(bucket_obuf);
    1290         644 :     MarkBufferDirty(bucket_nbuf);
    1291             : 
    1292         644 :     if (RelationNeedsWAL(rel))
    1293             :     {
    1294             :         XLogRecPtr  recptr;
    1295             :         xl_hash_split_complete xlrec;
    1296             : 
    1297         644 :         xlrec.old_bucket_flag = oopaque->hasho_flag;
    1298         644 :         xlrec.new_bucket_flag = nopaque->hasho_flag;
    1299             : 
    1300         644 :         XLogBeginInsert();
    1301             : 
    1302         644 :         XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete);
    1303             : 
    1304         644 :         XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD);
    1305         644 :         XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD);
    1306             : 
    1307         644 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE);
    1308             : 
    1309         644 :         PageSetLSN(BufferGetPage(bucket_obuf), recptr);
    1310         644 :         PageSetLSN(BufferGetPage(bucket_nbuf), recptr);
    1311             :     }
    1312             : 
    1313         644 :     END_CRIT_SECTION();
    1314             : 
    1315             :     /*
    1316             :      * If possible, clean up the old bucket.  We might not be able to do this
    1317             :      * if someone else has a pin on it, but if not then we can go ahead.  This
    1318             :      * isn't absolutely necessary, but it reduces bloat; if we don't do it
    1319             :      * now, VACUUM will do it eventually, but maybe not until new overflow
    1320             :      * pages have been allocated.  Note that there's no need to clean up the
    1321             :      * new bucket.
    1322             :      */
    1323         644 :     if (IsBufferCleanupOK(bucket_obuf))
    1324             :     {
    1325         644 :         LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
    1326         644 :         hashbucketcleanup(rel, obucket, bucket_obuf,
    1327             :                           BufferGetBlockNumber(bucket_obuf), NULL,
    1328             :                           maxbucket, highmask, lowmask, NULL, NULL, true,
    1329             :                           NULL, NULL);
    1330             :     }
    1331             :     else
    1332             :     {
    1333           0 :         LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
    1334           0 :         LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK);
    1335             :     }
    1336         644 : }
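/*
 * Editorial note (not part of hashpage.c): the partitioning above hinges on
 * _hash_hashkey2bucket(), which is defined in hashutil.c.  The version shown
 * below is believed to match the real routine but is reproduced here only
 * for reference; consult hashutil.c for the authoritative definition.  A hash
 * key maps to a bucket by masking with highmask; if that lands beyond the
 * highest bucket allocated so far, the smaller lowmask is used instead, so
 * the tuple stays in the not-yet-split half of the table.
 */
#ifdef HASH_KEY2BUCKET_REFERENCE_SKETCH
Bucket
_hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
					 uint32 highmask, uint32 lowmask)
{
	Bucket		bucket;

	bucket = hashkey & highmask;
	if (bucket > maxbucket)
		bucket = bucket & lowmask;

	return bucket;
}
#endif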
    1337             : 
    1338             : /*
    1339             :  *  _hash_finish_split() -- Finish the previously interrupted split operation
    1340             :  *
    1341             :  * To complete the split operation, we build a hash table of the TIDs already
    1342             :  * present in the new bucket; the split operation then uses it to skip tuples
    1343             :  * that were already moved before the split was interrupted.
    1344             :  *
    1345             :  * The caller must hold a pin, but no lock, on the metapage and old bucket's
    1346             :  * primary page buffer.  The buffers are returned in the same state.  (The
    1347             :  * metapage is only touched if it becomes necessary to add or remove overflow
    1348             :  * pages.)
    1349             :  */
    1350             : void
    1351           0 : _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
    1352             :                    uint32 maxbucket, uint32 highmask, uint32 lowmask)
    1353             : {
    1354             :     HASHCTL     hash_ctl;
    1355             :     HTAB       *tidhtab;
    1356           0 :     Buffer      bucket_nbuf = InvalidBuffer;
    1357             :     Buffer      nbuf;
    1358             :     Page        npage;
    1359             :     BlockNumber nblkno;
    1360             :     BlockNumber bucket_nblkno;
    1361             :     HashPageOpaque npageopaque;
    1362             :     Bucket      nbucket;
    1363             :     bool        found;
    1364             : 
    1365             :     /* Initialize hash tables used to track TIDs */
    1366           0 :     memset(&hash_ctl, 0, sizeof(hash_ctl));
    1367           0 :     hash_ctl.keysize = sizeof(ItemPointerData);
    1368           0 :     hash_ctl.entrysize = sizeof(ItemPointerData);
    1369           0 :     hash_ctl.hcxt = CurrentMemoryContext;
    1370             : 
    1371             :     tidhtab =
    1372           0 :         hash_create("bucket ctids",
    1373             :                     256,        /* arbitrary initial size */
    1374             :                     &hash_ctl,
    1375             :                     HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
    1376             : 
    1377           0 :     bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket);
    1378             : 
    1379             :     /*
    1380             :      * Scan the new bucket and build hash table of TIDs
    1381             :      */
    1382             :     for (;;)
    1383           0 :     {
    1384             :         OffsetNumber noffnum;
    1385             :         OffsetNumber nmaxoffnum;
    1386             : 
    1387           0 :         nbuf = _hash_getbuf(rel, nblkno, HASH_READ,
    1388             :                             LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    1389             : 
    1390             :         /* remember the primary bucket buffer to acquire cleanup lock on it. */
    1391           0 :         if (nblkno == bucket_nblkno)
    1392           0 :             bucket_nbuf = nbuf;
    1393             : 
    1394           0 :         npage = BufferGetPage(nbuf);
    1395           0 :         npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    1396             : 
    1397             :         /* Scan each tuple in new page */
    1398           0 :         nmaxoffnum = PageGetMaxOffsetNumber(npage);
    1399           0 :         for (noffnum = FirstOffsetNumber;
    1400             :              noffnum <= nmaxoffnum;
    1401           0 :              noffnum = OffsetNumberNext(noffnum))
    1402             :         {
    1403             :             IndexTuple  itup;
    1404             : 
    1405             :             /* Fetch the item's TID and insert it in hash table. */
    1406           0 :             itup = (IndexTuple) PageGetItem(npage,
    1407             :                                             PageGetItemId(npage, noffnum));
    1408             : 
    1409           0 :             (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found);
    1410             : 
    1411             :             Assert(!found);
    1412             :         }
    1413             : 
    1414           0 :         nblkno = npageopaque->hasho_nextblkno;
    1415             : 
    1416             :         /*
    1417             :          * release our lock without modifying the buffer, making sure to
    1418             :          * retain the pin on the primary bucket.
    1419             :          */
    1420           0 :         if (nbuf == bucket_nbuf)
    1421           0 :             LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
    1422             :         else
    1423           0 :             _hash_relbuf(rel, nbuf);
    1424             : 
    1425             :         /* Exit loop if no more overflow pages in new bucket */
    1426           0 :         if (!BlockNumberIsValid(nblkno))
    1427           0 :             break;
    1428             :     }
    1429             : 
    1430             :     /*
    1431             :      * Conditionally get the cleanup lock on old and new buckets to perform
    1432             :      * the split operation.  If we don't get the cleanup locks, silently give
    1433             :      * up; the next insertion into the old bucket will try again to complete
    1434             :      * the split.
    1435             :      */
    1436           0 :     if (!ConditionalLockBufferForCleanup(obuf))
    1437             :     {
    1438           0 :         hash_destroy(tidhtab);
    1439           0 :         return;
    1440             :     }
    1441           0 :     if (!ConditionalLockBufferForCleanup(bucket_nbuf))
    1442             :     {
    1443           0 :         LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    1444           0 :         hash_destroy(tidhtab);
    1445           0 :         return;
    1446             :     }
    1447             : 
    1448           0 :     npage = BufferGetPage(bucket_nbuf);
    1449           0 :     npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    1450           0 :     nbucket = npageopaque->hasho_bucket;
    1451             : 
    1452           0 :     _hash_splitbucket(rel, metabuf, obucket,
    1453             :                       nbucket, obuf, bucket_nbuf, tidhtab,
    1454             :                       maxbucket, highmask, lowmask);
    1455             : 
    1456           0 :     _hash_dropbuf(rel, bucket_nbuf);
    1457           0 :     hash_destroy(tidhtab);
    1458             : }
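/*
 * Editorial illustration (not part of hashpage.c): roughly how an inserter
 * such as _hash_doinsert() might notice an interrupted split on the old
 * bucket and finish it.  The local names (buf, bucket, pageopaque,
 * usedmetap) are assumptions of this sketch; see hashinsert.c for the
 * actual logic.
 */
#ifdef HASH_FINISH_SPLIT_USAGE_SKETCH
	if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
	{
		/* release the lock on the bucket buffer before completing the split */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);

		_hash_finish_split(rel, metabuf, buf, bucket,
						   usedmetap->hashm_maxbucket,
						   usedmetap->hashm_highmask,
						   usedmetap->hashm_lowmask);
	}
#endif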
    1459             : 
    1460             : /*
    1461             :  *  log_split_page() -- Log the split operation
    1462             :  *
    1463             :  *  We log the split operation when the new page in the new bucket gets full,
    1464             :  *  so we log the entire page.
    1465             :  *
    1466             :  *  'buf' must be locked by the caller which is also responsible for unlocking
    1467             :  *  it.
    1468             :  */
    1469             : static void
    1470         696 : log_split_page(Relation rel, Buffer buf)
    1471             : {
    1472         696 :     if (RelationNeedsWAL(rel))
    1473             :     {
    1474             :         XLogRecPtr  recptr;
    1475             : 
    1476         696 :         XLogBeginInsert();
    1477             : 
    1478         696 :         XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
    1479             : 
    1480         696 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE);
    1481             : 
    1482         696 :         PageSetLSN(BufferGetPage(buf), recptr);
    1483             :     }
    1484         696 : }
    1485             : 
    1486             : /*
    1487             :  *  _hash_getcachedmetap() -- Returns cached metapage data.
    1488             :  *
    1489             :  *  If metabuf is not InvalidBuffer, the caller must hold a pin, but no lock,
    1490             :  *  on the metapage.  If it is InvalidBuffer, we'll set it before returning if
    1491             :  *  we have to refresh the cache, and return with a pin but no lock on it; the
    1492             :  *  caller is responsible for releasing the pin.
    1493             :  *
    1494             :  *  We refresh the cache if it's not initialized yet or force_refresh is true.
    1495             :  */
    1496             : HashMetaPage
    1497      452436 : _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
    1498             : {
    1499             :     Page        page;
    1500             : 
    1501             :     Assert(metabuf);
    1502      452436 :     if (force_refresh || rel->rd_amcache == NULL)
    1503             :     {
    1504         742 :         char       *cache = NULL;
    1505             : 
    1506             :         /*
    1507             :          * It's important that we don't set rd_amcache to an invalid value.
    1508             :          * Either MemoryContextAlloc or _hash_getbuf could fail, so don't
    1509             :          * install a pointer to the newly-allocated storage in the actual
    1510             :          * relcache entry until both have succeeded.
    1511             :          */
    1512         742 :         if (rel->rd_amcache == NULL)
    1513         352 :             cache = MemoryContextAlloc(rel->rd_indexcxt,
    1514             :                                        sizeof(HashMetaPageData));
    1515             : 
    1516             :         /* Read the metapage. */
    1517         742 :         if (BufferIsValid(*metabuf))
    1518           2 :             LockBuffer(*metabuf, BUFFER_LOCK_SHARE);
    1519             :         else
    1520         740 :             *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
    1521             :                                     LH_META_PAGE);
    1522         742 :         page = BufferGetPage(*metabuf);
    1523             : 
    1524             :         /* Populate the cache. */
    1525         742 :         if (rel->rd_amcache == NULL)
    1526         352 :             rel->rd_amcache = cache;
    1527         742 :         memcpy(rel->rd_amcache, HashPageGetMeta(page),
    1528             :                sizeof(HashMetaPageData));
    1529             : 
    1530             :         /* Release metapage lock, but keep the pin. */
    1531         742 :         LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);
    1532             :     }
    1533             : 
    1534      452436 :     return (HashMetaPage) rel->rd_amcache;
    1535             : }
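/*
 * Editorial illustration (not part of hashpage.c): the pin discipline a
 * caller of _hash_getcachedmetap() is expected to follow, as a minimal
 * sketch assumed to run inside a function that already has a Relation rel.
 * _hash_getbucketbuf_from_hashkey() below is the real example of this
 * pattern.
 */
#ifdef HASH_GETCACHEDMETAP_USAGE_SKETCH
	Buffer		metabuf = InvalidBuffer;
	HashMetaPage metap;

	metap = _hash_getcachedmetap(rel, &metabuf, false);
	/* ... use metap->hashm_maxbucket, hashm_highmask, hashm_lowmask ... */

	/* drop the pin only if the call had to read the metapage */
	if (BufferIsValid(metabuf))
		_hash_dropbuf(rel, metabuf);
#endif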
    1536             : 
    1537             : /*
    1538             :  *  _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
    1539             :  *                                       hashkey.
    1540             :  *
    1541             :  *  Bucket pages do not move or get removed once they are allocated.  This
    1542             :  *  gives us an opportunity to use the previously saved metapage contents to
    1543             :  *  reach the target bucket buffer, instead of reading from the metapage every
    1544             :  *  time.  This saves one buffer access per lookup, which is a very helpful
    1545             :  *  saving in bufmgr traffic and contention.
    1546             :  *
    1547             :  *  The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
    1548             :  *  bucket buffer has to be locked for reading or writing.
    1549             :  *
    1550             :  *  The out parameter cachedmetap is set with metapage contents used for
    1551             :  *  hashkey to bucket buffer mapping. Some callers need this info to reach the
    1552             :  *  old bucket in case of a bucket split; see _hash_doinsert().
    1553             :  */
    1554             : Buffer
    1555      452038 : _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
    1556             :                                 HashMetaPage *cachedmetap)
    1557             : {
    1558             :     HashMetaPage metap;
    1559             :     Buffer      buf;
    1560      452038 :     Buffer      metabuf = InvalidBuffer;
    1561             :     Page        page;
    1562             :     Bucket      bucket;
    1563             :     BlockNumber blkno;
    1564             :     HashPageOpaque opaque;
    1565             : 
    1566             :     /* We read from the target bucket buffer, so it must be locked. */
    1567             :     Assert(access == HASH_READ || access == HASH_WRITE);
    1568             : 
    1569      452038 :     metap = _hash_getcachedmetap(rel, &metabuf, false);
    1570             :     Assert(metap != NULL);
    1571             : 
    1572             :     /*
    1573             :      * Loop until we get a lock on the correct target bucket.
    1574             :      */
    1575             :     for (;;)
    1576             :     {
    1577             :         /*
    1578             :          * Compute the target bucket number, and convert to block number.
    1579             :          */
    1580      452426 :         bucket = _hash_hashkey2bucket(hashkey,
    1581             :                                       metap->hashm_maxbucket,
    1582             :                                       metap->hashm_highmask,
    1583             :                                       metap->hashm_lowmask);
    1584             : 
    1585      452426 :         blkno = BUCKET_TO_BLKNO(metap, bucket);
    1586             : 
    1587             :         /* Fetch the primary bucket page for the bucket */
    1588      452426 :         buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
    1589      452426 :         page = BufferGetPage(buf);
    1590      452426 :         opaque = (HashPageOpaque) PageGetSpecialPointer(page);
    1591             :         Assert(opaque->hasho_bucket == bucket);
    1592             :         Assert(opaque->hasho_prevblkno != InvalidBlockNumber);
    1593             : 
    1594             :         /*
    1595             :          * If this bucket hasn't been split, we're done.
    1596             :          */
    1597      452426 :         if (opaque->hasho_prevblkno <= metap->hashm_maxbucket)
    1598      452038 :             break;
    1599             : 
    1600             :         /* Drop lock on this buffer, update cached metapage, and retry. */
    1601         388 :         _hash_relbuf(rel, buf);
    1602         388 :         metap = _hash_getcachedmetap(rel, &metabuf, true);
    1603             :         Assert(metap != NULL);
    1604             :     }
    1605             : 
    1606      452038 :     if (BufferIsValid(metabuf))
    1607         740 :         _hash_dropbuf(rel, metabuf);
    1608             : 
    1609      452038 :     if (cachedmetap)
    1610      451736 :         *cachedmetap = metap;
    1611             : 
    1612      452038 :     return buf;
    1613             : }
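/*
 * Editorial note (not part of hashpage.c): the retry loop above relies on a
 * primary bucket page's hasho_prevblkno holding the hashm_maxbucket value
 * that was current when the page was last initialized by a split, so
 * comparing it with the cached metapage tells us whether the cache is stale.
 * The helper below is purely hypothetical and only expresses that staleness
 * test for illustration.
 */
#ifdef HASH_CACHED_METAP_STALENESS_SKETCH
static inline bool
hash_cached_metap_is_stale(HashPageOpaque bucket_opaque, HashMetaPage cached)
{
	/* the bucket was created after our cached copy of the metapage was taken */
	return bucket_opaque->hasho_prevblkno > cached->hashm_maxbucket;
}
#endif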

Generated by: LCOV version 1.13