LCOV - code coverage report
Current view: top level - src/backend/access/hash - hashpage.c (source / functions)
Test: PostgreSQL 18devel
Date: 2024-11-21 08:14:44
                 Hit    Total    Coverage
Lines:           381      465      81.9 %
Functions:        18       19      94.7 %
Legend: Lines: hit | not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * hashpage.c
       4             :  *    Hash table page management code for the Postgres hash access method
       5             :  *
       6             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/access/hash/hashpage.c
      12             :  *
      13             :  * NOTES
      14             :  *    Postgres hash pages look like ordinary relation pages.  The opaque
      15             :  *    data at high addresses includes information about the page including
      16             :  *    whether a page is an overflow page or a true bucket, the bucket
      17             :  *    number, and the block numbers of the preceding and following pages
      18             :  *    in the same bucket.
      19             :  *
      20             :  *    The first page in a hash relation, page zero, is special -- it stores
      21             :  *    information describing the hash table; it is referred to as the
      22             :  *    "meta page." Pages one and higher store the actual data.
      23             :  *
      24             :  *    There are also bitmap pages, which are not manipulated here;
      25             :  *    see hashovfl.c.
      26             :  *
      27             :  *-------------------------------------------------------------------------
      28             :  */
      29             : #include "postgres.h"
      30             : 
      31             : #include "access/hash.h"
      32             : #include "access/hash_xlog.h"
      33             : #include "access/xloginsert.h"
      34             : #include "miscadmin.h"
      35             : #include "port/pg_bitutils.h"
      36             : #include "storage/predicate.h"
      37             : #include "storage/smgr.h"
      38             : #include "utils/rel.h"
      39             : 
      40             : static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
      41             :                                 uint32 nblocks);
      42             : static void _hash_splitbucket(Relation rel, Buffer metabuf,
      43             :                               Bucket obucket, Bucket nbucket,
      44             :                               Buffer obuf,
      45             :                               Buffer nbuf,
      46             :                               HTAB *htab,
      47             :                               uint32 maxbucket,
      48             :                               uint32 highmask, uint32 lowmask);
      49             : static void log_split_page(Relation rel, Buffer buf);
      50             : 
      51             : 
      52             : /*
      53             :  *  _hash_getbuf() -- Get a buffer by block number for read or write.
      54             :  *
      55             :  *      'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.
      56             :  *      'flags' is a bitwise OR of the allowed page types.
      57             :  *
      58             :  *      This must be used only to fetch pages that are expected to be valid
      59             :  *      already.  _hash_checkpage() is applied using the given flags.
      60             :  *
      61             :  *      When this routine returns, the appropriate lock is set on the
      62             :  *      requested buffer and its reference count has been incremented
      63             :  *      (ie, the buffer is "locked and pinned").
      64             :  *
      65             :  *      P_NEW is disallowed because this routine can only be used
      66             :  *      to access pages that are known to be before the filesystem EOF.
      67             :  *      Extending the index should be done with _hash_getnewbuf.
      68             :  */
      69             : Buffer
      70     1972342 : _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
      71             : {
      72             :     Buffer      buf;
      73             : 
      74     1972342 :     if (blkno == P_NEW)
      75           0 :         elog(ERROR, "hash AM does not use P_NEW");
      76             : 
      77     1972342 :     buf = ReadBuffer(rel, blkno);
      78             : 
      79     1972342 :     if (access != HASH_NOLOCK)
      80     1247652 :         LockBuffer(buf, access);
      81             : 
      82             :     /* ref count and lock type are correct */
      83             : 
      84     1972342 :     _hash_checkpage(rel, buf, flags);
      85             : 
      86     1972342 :     return buf;
      87             : }
      88             : 
      89             : /*
      90             :  * _hash_getbuf_with_condlock_cleanup() -- Try to get a buffer for cleanup.
      91             :  *
      92             :  *      We read the page and try to acquire a cleanup lock.  If we get it,
      93             :  *      we return the buffer; otherwise, we return InvalidBuffer.
      94             :  */
      95             : Buffer
      96        1332 : _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags)
      97             : {
      98             :     Buffer      buf;
      99             : 
     100        1332 :     if (blkno == P_NEW)
     101           0 :         elog(ERROR, "hash AM does not use P_NEW");
     102             : 
     103        1332 :     buf = ReadBuffer(rel, blkno);
     104             : 
     105        1332 :     if (!ConditionalLockBufferForCleanup(buf))
     106             :     {
     107           0 :         ReleaseBuffer(buf);
     108           0 :         return InvalidBuffer;
     109             :     }
     110             : 
     111             :     /* ref count and lock type are correct */
     112             : 
     113        1332 :     _hash_checkpage(rel, buf, flags);
     114             : 
     115        1332 :     return buf;
     116             : }
     117             : 
     118             : /*
     119             :  *  _hash_getinitbuf() -- Get and initialize a buffer by block number.
     120             :  *
     121             :  *      This must be used only to fetch pages that are known to be before
     122             :  *      the index's filesystem EOF, but are to be filled from scratch.
     123             :  *      _hash_pageinit() is applied automatically.  Otherwise it has
     124             :  *      effects similar to _hash_getbuf() with access = HASH_WRITE.
     125             :  *
     126             :  *      When this routine returns, a write lock is set on the
     127             :  *      requested buffer and its reference count has been incremented
     128             :  *      (ie, the buffer is "locked and pinned").
     129             :  *
     130             :  *      P_NEW is disallowed because this routine can only be used
     131             :  *      to access pages that are known to be before the filesystem EOF.
     132             :  *      Extending the index should be done with _hash_getnewbuf.
     133             :  */
     134             : Buffer
     135          70 : _hash_getinitbuf(Relation rel, BlockNumber blkno)
     136             : {
     137             :     Buffer      buf;
     138             : 
     139          70 :     if (blkno == P_NEW)
     140           0 :         elog(ERROR, "hash AM does not use P_NEW");
     141             : 
     142          70 :     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
     143             :                              NULL);
     144             : 
     145             :     /* ref count and lock type are correct */
     146             : 
     147             :     /* initialize the page */
     148          70 :     _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
     149             : 
     150          70 :     return buf;
     151             : }
     152             : 
     153             : /*
     154             :  *  _hash_initbuf() -- Get and initialize a buffer by bucket number.
     155             :  */
     156             : void
     157        8428 : _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag,
     158             :               bool initpage)
     159             : {
     160             :     HashPageOpaque pageopaque;
     161             :     Page        page;
     162             : 
     163        8428 :     page = BufferGetPage(buf);
     164             : 
     165             :     /* initialize the page */
     166        8428 :     if (initpage)
     167         576 :         _hash_pageinit(page, BufferGetPageSize(buf));
     168             : 
     169        8428 :     pageopaque = HashPageGetOpaque(page);
     170             : 
     171             :     /*
     172             :      * Set hasho_prevblkno with current hashm_maxbucket. This value will be
     173             :      * used to validate cached HashMetaPageData. See
     174             :      * _hash_getbucketbuf_from_hashkey().
     175             :      */
     176        8428 :     pageopaque->hasho_prevblkno = max_bucket;
     177        8428 :     pageopaque->hasho_nextblkno = InvalidBlockNumber;
     178        8428 :     pageopaque->hasho_bucket = num_bucket;
     179        8428 :     pageopaque->hasho_flag = flag;
     180        8428 :     pageopaque->hasho_page_id = HASHO_PAGE_ID;
     181        8428 : }
     182             : 
     183             : /*
     184             :  *  _hash_getnewbuf() -- Get a new page at the end of the index.
     185             :  *
     186             :  *      This has the same API as _hash_getinitbuf, except that we are adding
     187             :  *      a page to the index, and hence expect the page to be past the
     188             :  *      logical EOF.  (However, we have to support the case where it isn't,
     189             :  *      since a prior try might have crashed after extending the filesystem
     190             :  *      EOF but before updating the metapage to reflect the added page.)
     191             :  *
     192             :  *      It is caller's responsibility to ensure that only one process can
     193             :  *      extend the index at a time.  In practice, this function is called
     194             :  *      only while holding write lock on the metapage, because adding a page
     195             :  *      is always associated with an update of metapage data.
     196             :  */
     197             : Buffer
     198       10162 : _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
     199             : {
     200       10162 :     BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum);
     201             :     Buffer      buf;
     202             : 
     203       10162 :     if (blkno == P_NEW)
     204           0 :         elog(ERROR, "hash AM does not use P_NEW");
     205       10162 :     if (blkno > nblocks)
     206           0 :         elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
     207             :              RelationGetRelationName(rel));
     208             : 
     209             :     /* smgr insists we explicitly extend the relation */
     210       10162 :     if (blkno == nblocks)
     211             :     {
     212        8830 :         buf = ExtendBufferedRel(BMR_REL(rel), forkNum, NULL,
     213             :                                 EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
     214        8830 :         if (BufferGetBlockNumber(buf) != blkno)
     215           0 :             elog(ERROR, "unexpected hash relation size: %u, should be %u",
     216             :                  BufferGetBlockNumber(buf), blkno);
     217             :     }
     218             :     else
     219             :     {
     220        1332 :         buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
     221             :                                  NULL);
     222             :     }
     223             : 
     224             :     /* ref count and lock type are correct */
     225             : 
     226             :     /* initialize the page */
     227       10162 :     _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
     228             : 
     229       10162 :     return buf;
     230             : }
     231             : 
     232             : /*
     233             :  *  _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
     234             :  *
     235             :  *      This is identical to _hash_getbuf() but also allows a buffer access
     236             :  *      strategy to be specified.  We use this for VACUUM operations.
     237             :  */
     238             : Buffer
     239        1198 : _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
     240             :                            int access, int flags,
     241             :                            BufferAccessStrategy bstrategy)
     242             : {
     243             :     Buffer      buf;
     244             : 
     245        1198 :     if (blkno == P_NEW)
     246           0 :         elog(ERROR, "hash AM does not use P_NEW");
     247             : 
     248        1198 :     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
     249             : 
     250        1198 :     if (access != HASH_NOLOCK)
     251        1198 :         LockBuffer(buf, access);
     252             : 
     253             :     /* ref count and lock type are correct */
     254             : 
     255        1198 :     _hash_checkpage(rel, buf, flags);
     256             : 
     257        1198 :     return buf;
     258             : }
     259             : 
     260             : /*
     261             :  *  _hash_relbuf() -- release a locked buffer.
     262             :  *
     263             :  * Lock and pin (refcount) are both dropped.
     264             :  */
     265             : void
     266     1163256 : _hash_relbuf(Relation rel, Buffer buf)
     267             : {
     268     1163256 :     UnlockReleaseBuffer(buf);
     269     1163256 : }
     270             : 
     271             : /*
     272             :  *  _hash_dropbuf() -- release an unlocked buffer.
     273             :  *
     274             :  * This is used to unpin a buffer on which we hold no lock.
     275             :  */
     276             : void
     277      822042 : _hash_dropbuf(Relation rel, Buffer buf)
     278             : {
     279      822042 :     ReleaseBuffer(buf);
     280      822042 : }
     281             : 
     282             : /*
     283             :  *  _hash_dropscanbuf() -- release buffers used in scan.
     284             :  *
     285             :  * This routine unpins the buffers used during scan on which we
     286             :  * hold no lock.
     287             :  */
     288             : void
     289        1310 : _hash_dropscanbuf(Relation rel, HashScanOpaque so)
     290             : {
     291             :     /* release pin we hold on primary bucket page */
     292        1310 :     if (BufferIsValid(so->hashso_bucket_buf) &&
     293         532 :         so->hashso_bucket_buf != so->currPos.buf)
     294         126 :         _hash_dropbuf(rel, so->hashso_bucket_buf);
     295        1310 :     so->hashso_bucket_buf = InvalidBuffer;
     296             : 
     297             :     /* release pin we hold on primary bucket page  of bucket being split */
     298        1310 :     if (BufferIsValid(so->hashso_split_bucket_buf) &&
     299           0 :         so->hashso_split_bucket_buf != so->currPos.buf)
     300           0 :         _hash_dropbuf(rel, so->hashso_split_bucket_buf);
     301        1310 :     so->hashso_split_bucket_buf = InvalidBuffer;
     302             : 
     303             :     /* release any pin we still hold */
     304        1310 :     if (BufferIsValid(so->currPos.buf))
     305         406 :         _hash_dropbuf(rel, so->currPos.buf);
     306        1310 :     so->currPos.buf = InvalidBuffer;
     307             : 
     308             :     /* reset split scan */
     309        1310 :     so->hashso_buc_populated = false;
     310        1310 :     so->hashso_buc_split = false;
     311        1310 : }
     312             : 
     313             : 
     314             : /*
     315             :  *  _hash_init() -- Initialize the metadata page of a hash index,
     316             :  *              the initial buckets, and the initial bitmap page.
     317             :  *
     318             :  * The initial number of buckets is dependent on num_tuples, an estimate
     319             :  * of the number of tuples to be loaded into the index initially.  The
     320             :  * chosen number of buckets is returned.
     321             :  *
     322             :  * We are fairly cavalier about locking here, since we know that no one else
     323             :  * could be accessing this index.  In particular the rule about not holding
     324             :  * multiple buffer locks is ignored.
     325             :  */
     326             : uint32
     327         324 : _hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
     328             : {
     329             :     Buffer      metabuf;
     330             :     Buffer      buf;
     331             :     Buffer      bitmapbuf;
     332             :     Page        pg;
     333             :     HashMetaPage metap;
     334             :     RegProcedure procid;
     335             :     int32       data_width;
     336             :     int32       item_width;
     337             :     int32       ffactor;
     338             :     uint32      num_buckets;
     339             :     uint32      i;
     340             :     bool        use_wal;
     341             : 
     342             :     /* safety check */
     343         324 :     if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)
     344           0 :         elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
     345             :              RelationGetRelationName(rel));
     346             : 
     347             :     /*
     348             :      * WAL log creation of pages if the relation is persistent, or this is the
     349             :      * init fork.  Init forks for unlogged relations always need to be WAL
     350             :      * logged.
     351             :      */
     352         324 :     use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM;
     353             : 
     354             :     /*
     355             :      * Determine the target fill factor (in tuples per bucket) for this index.
     356             :      * The idea is to make the fill factor correspond to pages about as full
     357             :      * as the user-settable fillfactor parameter says.  We can compute it
     358             :      * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
     359             :      */
     360         324 :     data_width = sizeof(uint32);
     361         324 :     item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
     362             :         sizeof(ItemIdData);     /* include the line pointer */
     363         324 :     ffactor = HashGetTargetPageUsage(rel) / item_width;
     364             :     /* keep to a sane range */
     365         324 :     if (ffactor < 10)
     366           0 :         ffactor = 10;
     367             : 
     368         324 :     procid = index_getprocid(rel, 1, HASHSTANDARD_PROC);
     369             : 
     370             :     /*
     371             :      * We initialize the metapage, the first N bucket pages, and the first
     372             :      * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
     373             :      * calls to occur.  This ensures that the smgr level has the right idea of
     374             :      * the physical index length.
     375             :      *
     376             :      * Critical section not required, because on error the creation of the
     377             :      * whole relation will be rolled back.
     378             :      */
     379         324 :     metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
     380         324 :     _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false);
     381         324 :     MarkBufferDirty(metabuf);
     382             : 
     383         324 :     pg = BufferGetPage(metabuf);
     384         324 :     metap = HashPageGetMeta(pg);
     385             : 
     386             :     /* XLOG stuff */
     387         324 :     if (use_wal)
     388             :     {
     389             :         xl_hash_init_meta_page xlrec;
     390             :         XLogRecPtr  recptr;
     391             : 
     392         196 :         xlrec.num_tuples = num_tuples;
     393         196 :         xlrec.procid = metap->hashm_procid;
     394         196 :         xlrec.ffactor = metap->hashm_ffactor;
     395             : 
     396         196 :         XLogBeginInsert();
     397         196 :         XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage);
     398         196 :         XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
     399             : 
     400         196 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE);
     401             : 
     402         196 :         PageSetLSN(BufferGetPage(metabuf), recptr);
     403             :     }
     404             : 
     405         324 :     num_buckets = metap->hashm_maxbucket + 1;
     406             : 
     407             :     /*
     408             :      * Release buffer lock on the metapage while we initialize buckets.
     409             :      * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
     410             :      * won't accomplish anything.  It's a bad idea to hold buffer locks for
     411             :      * long intervals in any case, since that can block the bgwriter.
     412             :      */
     413         324 :     LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     414             : 
     415             :     /*
     416             :      * Initialize and WAL Log the first N buckets
     417             :      */
     418        8176 :     for (i = 0; i < num_buckets; i++)
     419             :     {
     420             :         BlockNumber blkno;
     421             : 
     422             :         /* Allow interrupts, in case N is huge */
     423        7852 :         CHECK_FOR_INTERRUPTS();
     424             : 
     425        7852 :         blkno = BUCKET_TO_BLKNO(metap, i);
     426        7852 :         buf = _hash_getnewbuf(rel, blkno, forkNum);
     427        7852 :         _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false);
     428        7852 :         MarkBufferDirty(buf);
     429             : 
     430        7852 :         if (use_wal)
     431        5304 :             log_newpage(&rel->rd_locator,
     432             :                         forkNum,
     433             :                         blkno,
     434             :                         BufferGetPage(buf),
     435             :                         true);
     436        7852 :         _hash_relbuf(rel, buf);
     437             :     }
     438             : 
     439             :     /* Now reacquire buffer lock on metapage */
     440         324 :     LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
     441             : 
     442             :     /*
     443             :      * Initialize bitmap page
     444             :      */
     445         324 :     bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum);
     446         324 :     _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false);
     447         324 :     MarkBufferDirty(bitmapbuf);
     448             : 
     449             :     /* add the new bitmap page to the metapage's list of bitmaps */
     450             :     /* metapage already has a write lock */
     451         324 :     if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
     452           0 :         ereport(ERROR,
     453             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     454             :                  errmsg("out of overflow pages in hash index \"%s\"",
     455             :                         RelationGetRelationName(rel))));
     456             : 
     457         324 :     metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1;
     458             : 
     459         324 :     metap->hashm_nmaps++;
     460         324 :     MarkBufferDirty(metabuf);
     461             : 
     462             :     /* XLOG stuff */
     463         324 :     if (use_wal)
     464             :     {
     465             :         xl_hash_init_bitmap_page xlrec;
     466             :         XLogRecPtr  recptr;
     467             : 
     468         196 :         xlrec.bmsize = metap->hashm_bmsize;
     469             : 
     470         196 :         XLogBeginInsert();
     471         196 :         XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage);
     472         196 :         XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT);
     473             : 
     474             :         /*
     475             :          * This is safe only because nobody else can be modifying the index at
     476             :          * this stage; it's only visible to the transaction that is creating
     477             :          * it.
     478             :          */
     479         196 :         XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
     480             : 
     481         196 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE);
     482             : 
     483         196 :         PageSetLSN(BufferGetPage(bitmapbuf), recptr);
     484         196 :         PageSetLSN(BufferGetPage(metabuf), recptr);
     485             :     }
     486             : 
     487             :     /* all done */
     488         324 :     _hash_relbuf(rel, bitmapbuf);
     489         324 :     _hash_relbuf(rel, metabuf);
     490             : 
     491         324 :     return num_buckets;
     492             : }
     493             : 
     494             : /*
     495             :  *  _hash_init_metabuffer() -- Initialize the metadata page of a hash index.
     496             :  */
     497             : void
     498         372 : _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
     499             :                       uint16 ffactor, bool initpage)
     500             : {
     501             :     HashMetaPage metap;
     502             :     HashPageOpaque pageopaque;
     503             :     Page        page;
     504             :     double      dnumbuckets;
     505             :     uint32      num_buckets;
     506             :     uint32      spare_index;
     507             :     uint32      lshift;
     508             : 
     509             :     /*
     510             :      * Choose the number of initial bucket pages to match the fill factor
     511             :      * given the estimated number of tuples.  We round up the result to the
     512             :      * total number of buckets which has to be allocated before using its
     513             :      * hashm_spares element. However always force at least 2 bucket pages. The
     514             :      * upper limit is determined by considerations explained in
     515             :      * _hash_expandtable().
     516             :      */
     517         372 :     dnumbuckets = num_tuples / ffactor;
     518         372 :     if (dnumbuckets <= 2.0)
     519         110 :         num_buckets = 2;
     520         262 :     else if (dnumbuckets >= (double) 0x40000000)
     521           0 :         num_buckets = 0x40000000;
     522             :     else
     523         262 :         num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets));
     524             : 
     525         372 :     spare_index = _hash_spareindex(num_buckets);
     526             :     Assert(spare_index < HASH_MAX_SPLITPOINTS);
     527             : 
     528         372 :     page = BufferGetPage(buf);
     529         372 :     if (initpage)
     530          48 :         _hash_pageinit(page, BufferGetPageSize(buf));
     531             : 
     532         372 :     pageopaque = HashPageGetOpaque(page);
     533         372 :     pageopaque->hasho_prevblkno = InvalidBlockNumber;
     534         372 :     pageopaque->hasho_nextblkno = InvalidBlockNumber;
     535         372 :     pageopaque->hasho_bucket = InvalidBucket;
     536         372 :     pageopaque->hasho_flag = LH_META_PAGE;
     537         372 :     pageopaque->hasho_page_id = HASHO_PAGE_ID;
     538             : 
     539         372 :     metap = HashPageGetMeta(page);
     540             : 
     541         372 :     metap->hashm_magic = HASH_MAGIC;
     542         372 :     metap->hashm_version = HASH_VERSION;
     543         372 :     metap->hashm_ntuples = 0;
     544         372 :     metap->hashm_nmaps = 0;
     545         372 :     metap->hashm_ffactor = ffactor;
     546         372 :     metap->hashm_bsize = HashGetMaxBitmapSize(page);
     547             : 
     548             :     /* find largest bitmap array size that will fit in page size */
     549         372 :     lshift = pg_leftmost_one_pos32(metap->hashm_bsize);
     550             :     Assert(lshift > 0);
     551         372 :     metap->hashm_bmsize = 1 << lshift;
     552         372 :     metap->hashm_bmshift = lshift + BYTE_TO_BIT;
     553             :     Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));
     554             : 
     555             :     /*
     556             :      * Label the index with its primary hash support function's OID.  This is
     557             :      * pretty useless for normal operation (in fact, hashm_procid is not used
     558             :      * anywhere), but it might be handy for forensic purposes so we keep it.
     559             :      */
     560         372 :     metap->hashm_procid = procid;
     561             : 
     562             :     /*
     563             :      * We initialize the index with N buckets, 0 .. N-1, occupying physical
     564             :      * blocks 1 to N.  The first freespace bitmap page is in block N+1.
     565             :      */
     566         372 :     metap->hashm_maxbucket = num_buckets - 1;
     567             : 
     568             :     /*
     569             :      * Set highmask as next immediate ((2 ^ x) - 1), which should be
     570             :      * sufficient to cover num_buckets.
     571             :      */
     572         372 :     metap->hashm_highmask = pg_nextpower2_32(num_buckets + 1) - 1;
     573         372 :     metap->hashm_lowmask = (metap->hashm_highmask >> 1);
     574             : 
     575         372 :     MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
     576         372 :     MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
     577             : 
     578             :     /* Set up mapping for one spare page after the initial splitpoints */
     579         372 :     metap->hashm_spares[spare_index] = 1;
     580         372 :     metap->hashm_ovflpoint = spare_index;
     581         372 :     metap->hashm_firstfree = 0;
     582             : 
     583             :     /*
     584             :      * Set pd_lower just past the end of the metadata.  This is essential,
     585             :      * because without doing so, metadata will be lost if xlog.c compresses
     586             :      * the page.
     587             :      */
     588         372 :     ((PageHeader) page)->pd_lower =
     589         372 :         ((char *) metap + sizeof(HashMetaPageData)) - (char *) page;
     590         372 : }
     591             : 
     592             : /*
     593             :  *  _hash_pageinit() -- Initialize a new hash index page.
     594             :  */
     595             : void
     596       11148 : _hash_pageinit(Page page, Size size)
     597             : {
     598       11148 :     PageInit(page, size, sizeof(HashPageOpaqueData));
     599       11148 : }
     600             : 
     601             : /*
     602             :  * Attempt to expand the hash table by creating one new bucket.
     603             :  *
     604             :  * This will silently do nothing if we don't get cleanup lock on old or
     605             :  * new bucket.
     606             :  *
     607             :  * Complete the pending splits and remove the tuples from old bucket,
     608             :  * if there are any left over from the previous split.
     609             :  *
     610             :  * The caller must hold a pin, but no lock, on the metapage buffer.
     611             :  * The buffer is returned in the same state.
     612             :  */
     613             : void
     614        1332 : _hash_expandtable(Relation rel, Buffer metabuf)
     615             : {
     616             :     HashMetaPage metap;
     617             :     Bucket      old_bucket;
     618             :     Bucket      new_bucket;
     619             :     uint32      spare_ndx;
     620             :     BlockNumber start_oblkno;
     621             :     BlockNumber start_nblkno;
     622             :     Buffer      buf_nblkno;
     623             :     Buffer      buf_oblkno;
     624             :     Page        opage;
     625             :     Page        npage;
     626             :     HashPageOpaque oopaque;
     627             :     HashPageOpaque nopaque;
     628             :     uint32      maxbucket;
     629             :     uint32      highmask;
     630             :     uint32      lowmask;
     631        1332 :     bool        metap_update_masks = false;
     632        1332 :     bool        metap_update_splitpoint = false;
     633             : 
     634        1332 : restart_expand:
     635             : 
     636             :     /*
     637             :      * Write-lock the meta page.  It used to be necessary to acquire a
     638             :      * heavyweight lock to begin a split, but that is no longer required.
     639             :      */
     640        1332 :     LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
     641             : 
     642        1332 :     _hash_checkpage(rel, metabuf, LH_META_PAGE);
     643        1332 :     metap = HashPageGetMeta(BufferGetPage(metabuf));
     644             : 
     645             :     /*
     646             :      * Check to see if split is still needed; someone else might have already
     647             :      * done one while we waited for the lock.
     648             :      *
     649             :      * Make sure this stays in sync with _hash_doinsert()
     650             :      */
     651        1332 :     if (metap->hashm_ntuples <=
     652        1332 :         (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
     653           0 :         goto fail;
     654             : 
     655             :     /*
     656             :      * Can't split anymore if maxbucket has reached its maximum possible
     657             :      * value.
     658             :      *
     659             :      * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
     660             :      * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
     661             :      * to half that to prevent failure of pg_ceil_log2_32() and insufficient
     662             :      * space in hashm_spares[].  It's moot anyway because an index with 2^32
     663             :      * buckets would certainly overflow BlockNumber and hence
     664             :      * _hash_alloc_buckets() would fail, but if we supported buckets smaller
     665             :      * than a disk block then this would be an independent constraint.
     666             :      *
     667             :      * If you change this, see also the maximum initial number of buckets in
     668             :      * _hash_init().
     669             :      */
     670        1332 :     if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
     671           0 :         goto fail;
     672             : 
     673             :     /*
     674             :      * Determine which bucket is to be split, and attempt to take cleanup lock
     675             :      * on the old bucket.  If we can't get the lock, give up.
     676             :      *
     677             :      * The cleanup lock protects us not only against other backends, but
     678             :      * against our own backend as well.
     679             :      *
     680             :      * The cleanup lock is mainly to protect the split from concurrent
     681             :      * inserts. See src/backend/access/hash/README, Lock Definitions for
     682             :      * further details.  Due to this locking restriction, if there is any
     683             :      * pending scan, the split will give up which is not good, but harmless.
     684             :      */
     685        1332 :     new_bucket = metap->hashm_maxbucket + 1;
     686             : 
     687        1332 :     old_bucket = (new_bucket & metap->hashm_lowmask);
     688             : 
     689        1332 :     start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
     690             : 
     691        1332 :     buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE);
     692        1332 :     if (!buf_oblkno)
     693           0 :         goto fail;
     694             : 
     695        1332 :     opage = BufferGetPage(buf_oblkno);
     696        1332 :     oopaque = HashPageGetOpaque(opage);
     697             : 
     698             :     /*
     699             :      * We want to finish the split from a bucket as there is no apparent
     700             :      * benefit by not doing so and it will make the code complicated to finish
     701             :      * the split that involves multiple buckets considering the case where new
     702             :      * split also fails.  We don't need to consider the new bucket for
     703             :      * completing the split here as it is not possible that a re-split of new
     704             :      * bucket starts when there is still a pending split from old bucket.
     705             :      */
     706        1332 :     if (H_BUCKET_BEING_SPLIT(oopaque))
     707             :     {
     708             :         /*
     709             :          * Copy bucket mapping info now; refer the comment in code below where
     710             :          * we copy this information before calling _hash_splitbucket to see
     711             :          * why this is okay.
     712             :          */
     713           0 :         maxbucket = metap->hashm_maxbucket;
     714           0 :         highmask = metap->hashm_highmask;
     715           0 :         lowmask = metap->hashm_lowmask;
     716             : 
     717             :         /*
     718             :          * Release the lock on metapage and old_bucket, before completing the
     719             :          * split.
     720             :          */
     721           0 :         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     722           0 :         LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK);
     723             : 
     724           0 :         _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket,
     725             :                            highmask, lowmask);
     726             : 
     727             :         /* release the pin on old buffer and retry for expand. */
     728           0 :         _hash_dropbuf(rel, buf_oblkno);
     729             : 
     730           0 :         goto restart_expand;
     731             :     }
     732             : 
     733             :     /*
     734             :      * Clean the tuples remained from the previous split.  This operation
     735             :      * requires cleanup lock and we already have one on the old bucket, so
     736             :      * let's do it. We also don't want to allow further splits from the bucket
     737             :      * till the garbage of previous split is cleaned.  This has two
     738             :      * advantages; first, it helps in avoiding the bloat due to garbage and
     739             :      * second is, during cleanup of bucket, we are always sure that the
     740             :      * garbage tuples belong to most recently split bucket.  On the contrary,
     741             :      * if we allow cleanup of bucket after meta page is updated to indicate
     742             :      * the new split and before the actual split, the cleanup operation won't
     743             :      * be able to decide whether the tuple has been moved to the newly created
     744             :      * bucket and ended up deleting such tuples.
     745             :      */
     746        1332 :     if (H_NEEDS_SPLIT_CLEANUP(oopaque))
     747             :     {
     748             :         /*
     749             :          * Copy bucket mapping info now; refer to the comment in code below
     750             :          * where we copy this information before calling _hash_splitbucket to
     751             :          * see why this is okay.
     752             :          */
     753           0 :         maxbucket = metap->hashm_maxbucket;
     754           0 :         highmask = metap->hashm_highmask;
     755           0 :         lowmask = metap->hashm_lowmask;
     756             : 
     757             :         /* Release the metapage lock. */
     758           0 :         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     759             : 
     760           0 :         hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL,
     761             :                           maxbucket, highmask, lowmask, NULL, NULL, true,
     762             :                           NULL, NULL);
     763             : 
     764           0 :         _hash_dropbuf(rel, buf_oblkno);
     765             : 
     766           0 :         goto restart_expand;
     767             :     }
     768             : 
     769             :     /*
     770             :      * There shouldn't be any active scan on new bucket.
     771             :      *
     772             :      * Note: it is safe to compute the new bucket's blkno here, even though we
     773             :      * may still need to update the BUCKET_TO_BLKNO mapping.  This is because
     774             :      * the current value of hashm_spares[hashm_ovflpoint] correctly shows
     775             :      * where we are going to put a new splitpoint's worth of buckets.
     776             :      */
     777        1332 :     start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
     778             : 
     779             :     /*
     780             :      * If the split point is increasing we need to allocate a new batch of
     781             :      * bucket pages.
     782             :      */
     783        1332 :     spare_ndx = _hash_spareindex(new_bucket + 1);
     784        1332 :     if (spare_ndx > metap->hashm_ovflpoint)
     785             :     {
     786             :         uint32      buckets_to_add;
     787             : 
     788             :         Assert(spare_ndx == metap->hashm_ovflpoint + 1);
     789             : 
     790             :         /*
     791             :          * We treat allocation of buckets as a separate WAL-logged action.
     792             :          * Even if we fail after this operation, won't leak bucket pages;
     793             :          * rather, the next split will consume this space. In any case, even
     794             :          * without failure we don't use all the space in one split operation.
     795             :          */
     796          56 :         buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket;
     797          56 :         if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add))
     798             :         {
     799             :             /* can't split due to BlockNumber overflow */
     800           0 :             _hash_relbuf(rel, buf_oblkno);
     801           0 :             goto fail;
     802             :         }
     803             :     }
     804             : 
     805             :     /*
     806             :      * Physically allocate the new bucket's primary page.  We want to do this
     807             :      * before changing the metapage's mapping info, in case we can't get the
     808             :      * disk space.
     809             :      *
     810             :      * XXX It doesn't make sense to call _hash_getnewbuf first, zeroing the
     811             :      * buffer, and then only afterwards check whether we have a cleanup lock.
     812             :      * However, since no scan can be accessing the buffer yet, any concurrent
     813             :      * accesses will just be from processes like the bgwriter or checkpointer
     814             :      * which don't care about its contents, so it doesn't really matter.
     815             :      */
     816        1332 :     buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM);
     817        1332 :     if (!IsBufferCleanupOK(buf_nblkno))
     818             :     {
     819           0 :         _hash_relbuf(rel, buf_oblkno);
     820           0 :         _hash_relbuf(rel, buf_nblkno);
     821           0 :         goto fail;
     822             :     }
     823             : 
     824             :     /*
     825             :      * Since we are scribbling on the pages in the shared buffers, establish a
     826             :      * critical section.  Any failure in this next code leaves us with a big
     827             :      * problem: the metapage is effectively corrupt but could get written back
     828             :      * to disk.
     829             :      */
     830        1332 :     START_CRIT_SECTION();
     831             : 
     832             :     /*
     833             :      * Okay to proceed with split.  Update the metapage bucket mapping info.
     834             :      */
     835        1332 :     metap->hashm_maxbucket = new_bucket;
     836             : 
     837        1332 :     if (new_bucket > metap->hashm_highmask)
     838             :     {
     839             :         /* Starting a new doubling */
     840          20 :         metap->hashm_lowmask = metap->hashm_highmask;
     841          20 :         metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
     842          20 :         metap_update_masks = true;
     843             :     }
     844             : 
     845             :     /*
     846             :      * If the split point is increasing we need to adjust the hashm_spares[]
     847             :      * array and hashm_ovflpoint so that future overflow pages will be created
     848             :      * beyond this new batch of bucket pages.
     849             :      */
     850        1332 :     if (spare_ndx > metap->hashm_ovflpoint)
     851             :     {
     852          56 :         metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
     853          56 :         metap->hashm_ovflpoint = spare_ndx;
     854          56 :         metap_update_splitpoint = true;
     855             :     }
     856             : 
     857        1332 :     MarkBufferDirty(metabuf);
     858             : 
     859             :     /*
     860             :      * Copy bucket mapping info now; this saves re-accessing the meta page
     861             :      * inside _hash_splitbucket's inner loop.  Note that once we drop the
     862             :      * split lock, other splits could begin, so these values might be out of
     863             :      * date before _hash_splitbucket finishes.  That's okay, since all it
     864             :      * needs is to tell which of these two buckets to map hashkeys into.
     865             :      */
     866        1332 :     maxbucket = metap->hashm_maxbucket;
     867        1332 :     highmask = metap->hashm_highmask;
     868        1332 :     lowmask = metap->hashm_lowmask;
     869             : 
     870        1332 :     opage = BufferGetPage(buf_oblkno);
     871        1332 :     oopaque = HashPageGetOpaque(opage);
     872             : 
     873             :     /*
     874             :      * Mark the old bucket to indicate that split is in progress.  (At
     875             :      * operation end, we will clear the split-in-progress flag.)  Also, for a
     876             :      * primary bucket page, hasho_prevblkno stores the number of buckets that
     877             :      * existed as of the last split, so we must update that value here.
     878             :      */
     879        1332 :     oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
     880        1332 :     oopaque->hasho_prevblkno = maxbucket;
     881             : 
     882        1332 :     MarkBufferDirty(buf_oblkno);
     883             : 
     884        1332 :     npage = BufferGetPage(buf_nblkno);
     885             : 
     886             :     /*
     887             :      * initialize the new bucket's primary page and mark it to indicate that
     888             :      * split is in progress.
     889             :      */
     890        1332 :     nopaque = HashPageGetOpaque(npage);
     891        1332 :     nopaque->hasho_prevblkno = maxbucket;
     892        1332 :     nopaque->hasho_nextblkno = InvalidBlockNumber;
     893        1332 :     nopaque->hasho_bucket = new_bucket;
     894        1332 :     nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
     895        1332 :     nopaque->hasho_page_id = HASHO_PAGE_ID;
     896             : 
     897        1332 :     MarkBufferDirty(buf_nblkno);
     898             : 
     899             :     /* XLOG stuff */
     900        1332 :     if (RelationNeedsWAL(rel))
     901             :     {
     902             :         xl_hash_split_allocate_page xlrec;
     903             :         XLogRecPtr  recptr;
     904             : 
     905        1080 :         xlrec.new_bucket = maxbucket;
     906        1080 :         xlrec.old_bucket_flag = oopaque->hasho_flag;
     907        1080 :         xlrec.new_bucket_flag = nopaque->hasho_flag;
     908        1080 :         xlrec.flags = 0;
     909             : 
     910        1080 :         XLogBeginInsert();
     911             : 
     912        1080 :         XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD);
     913        1080 :         XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT);
     914        1080 :         XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD);
     915             : 
     916        1080 :         if (metap_update_masks)
     917             :         {
     918          20 :             xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS;
     919          20 :             XLogRegisterBufData(2, (char *) &metap->hashm_lowmask, sizeof(uint32));
     920          20 :             XLogRegisterBufData(2, (char *) &metap->hashm_highmask, sizeof(uint32));
     921             :         }
     922             : 
     923        1080 :         if (metap_update_splitpoint)
     924             :         {
     925          48 :             xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT;
     926          48 :             XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint,
     927             :                                 sizeof(uint32));
     928          48 :             XLogRegisterBufData(2,
     929          48 :                                 (char *) &metap->hashm_spares[metap->hashm_ovflpoint],
     930             :                                 sizeof(uint32));
     931             :         }
     932             : 
     933        1080 :         XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage);
     934             : 
     935        1080 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE);
     936             : 
     937        1080 :         PageSetLSN(BufferGetPage(buf_oblkno), recptr);
     938        1080 :         PageSetLSN(BufferGetPage(buf_nblkno), recptr);
     939        1080 :         PageSetLSN(BufferGetPage(metabuf), recptr);
     940             :     }
     941             : 
     942        1332 :     END_CRIT_SECTION();
     943             : 
     944             :     /* drop lock, but keep pin */
     945        1332 :     LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     946             : 
     947             :     /* Relocate records to the new bucket */
     948        1332 :     _hash_splitbucket(rel, metabuf,
     949             :                       old_bucket, new_bucket,
     950             :                       buf_oblkno, buf_nblkno, NULL,
     951             :                       maxbucket, highmask, lowmask);
     952             : 
     953             :     /* all done, now release the pins on primary buckets. */
     954        1332 :     _hash_dropbuf(rel, buf_oblkno);
     955        1332 :     _hash_dropbuf(rel, buf_nblkno);
     956             : 
     957        1332 :     return;
     958             : 
     959             :     /* Here if decide not to split or fail to acquire old bucket lock */
     960           0 : fail:
     961             : 
     962             :     /* We didn't write the metapage, so just drop lock */
     963           0 :     LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
     964             : }
     965             : 
     966             : 
     967             : /*
     968             :  * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
     969             :  *
     970             :  * This does not need to initialize the new bucket pages; we'll do that as
     971             :  * each one is used by _hash_expandtable().  But we have to extend the logical
     972             :  * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
     973             :  * sync with ours, so that we don't get complaints from smgr.
     974             :  *
     975             :  * We do this by writing a page of zeroes at the end of the splitpoint range.
     976             :  * We expect that the filesystem will ensure that the intervening pages read
     977             :  * as zeroes too.  On many filesystems this "hole" will not be allocated
     978             :  * immediately, which means that the index file may end up more fragmented
     979             :  * than if we forced it all to be allocated now; but since we don't scan
     980             :  * hash indexes sequentially anyway, that probably doesn't matter.
     981             :  *
     982             :  * XXX It's annoying that this code is executed with the metapage lock held.
     983             :  * We need to interlock against _hash_addovflpage() adding a new overflow page
     984             :  * concurrently, but it'd likely be better to use LockRelationForExtension
     985             :  * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
     986             :  * so it may not be worth worrying about.
     987             :  *
     988             :  * Returns true if successful, or false if allocation failed due to
     989             :  * BlockNumber overflow.
     990             :  */
     991             : static bool
     992          56 : _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
     993             : {
     994             :     BlockNumber lastblock;
     995             :     PGIOAlignedBlock zerobuf;
     996             :     Page        page;
     997             :     HashPageOpaque ovflopaque;
     998             : 
     999          56 :     lastblock = firstblock + nblocks - 1;
    1000             : 
    1001             :     /*
    1002             :      * Check for overflow in block number calculation; if so, we cannot extend
    1003             :      * the index anymore.
    1004             :      */
    1005          56 :     if (lastblock < firstblock || lastblock == InvalidBlockNumber)
    1006           0 :         return false;
    1007             : 
    1008          56 :     page = (Page) zerobuf.data;
    1009             : 
    1010             :     /*
    1011             :      * Initialize the page.  Just zeroing the page won't work; see
    1012             :      * _hash_freeovflpage for similar usage.  We take care to make the special
    1013             :      * space valid for the benefit of tools such as pageinspect.
    1014             :      */
    1015          56 :     _hash_pageinit(page, BLCKSZ);
    1016             : 
    1017          56 :     ovflopaque = HashPageGetOpaque(page);
    1018             : 
    1019          56 :     ovflopaque->hasho_prevblkno = InvalidBlockNumber;
    1020          56 :     ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    1021          56 :     ovflopaque->hasho_bucket = InvalidBucket;
    1022          56 :     ovflopaque->hasho_flag = LH_UNUSED_PAGE;
    1023          56 :     ovflopaque->hasho_page_id = HASHO_PAGE_ID;
    1024             : 
    1025          56 :     if (RelationNeedsWAL(rel))
    1026          48 :         log_newpage(&rel->rd_locator,
    1027             :                     MAIN_FORKNUM,
    1028             :                     lastblock,
    1029             :                     zerobuf.data,
    1030             :                     true);
    1031             : 
    1032          56 :     PageSetChecksumInplace(page, lastblock);
    1033          56 :     smgrextend(RelationGetSmgr(rel), MAIN_FORKNUM, lastblock, zerobuf.data,
    1034             :                false);
    1035             : 
    1036          56 :     return true;
    1037             : }
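                     : 
                     : /*
                     :  * Illustrative sketch, not part of hashpage.c: the overflow guard at the top
                     :  * of _hash_alloc_buckets() can be exercised on its own.  BlockNumber is an
                     :  * unsigned 32-bit type, so a wrapped sum compares lower than its first
                     :  * operand, and InvalidBlockNumber (0xFFFFFFFF) is rejected because it can
                     :  * never name a real block.  The *_demo names below are stand-ins, not
                     :  * PostgreSQL symbols.
                     :  */
                     : #include <stdint.h>
                     : #include <stdbool.h>
                     : 
                     : typedef uint32_t BlockNumber_demo;
                     : #define InvalidBlockNumber_demo ((BlockNumber_demo) 0xFFFFFFFF)
                     : 
                     : static bool
                     : splitpoint_fits_demo(BlockNumber_demo firstblock, uint32_t nblocks)
                     : {
                     :     BlockNumber_demo lastblock = firstblock + nblocks - 1;
                     : 
                     :     /* wraparound, or landing on the invalid sentinel, means "cannot extend" */
                     :     return !(lastblock < firstblock || lastblock == InvalidBlockNumber_demo);
                     : }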
    1038             : 
    1039             : 
    1040             : /*
    1041             :  * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
    1042             :  *
    1043             :  * This routine partitions the tuples between the old and the new bucket, and
    1044             :  * is also used to finish an incomplete split operation.  To finish a
    1045             :  * previously interrupted split, the caller must supply htab.  If htab is set,
    1046             :  * we skip moving any tuple that already exists in htab; otherwise a NULL
    1047             :  * htab indicates that all the tuples belonging to the new bucket should be
    1048             :  * moved.
    1049             :  *
    1050             :  * We are splitting a bucket that consists of a base bucket page and zero
    1051             :  * or more overflow (bucket chain) pages.  We must relocate tuples that
    1052             :  * belong in the new bucket.
    1053             :  *
    1054             :  * The caller must hold cleanup locks on both buckets to ensure that
    1055             :  * no one else is trying to access them (see README).
    1056             :  *
    1057             :  * The caller must hold a pin, but no lock, on the metapage buffer.
    1058             :  * The buffer is returned in the same state.  (The metapage is only
    1059             :  * touched if it becomes necessary to add or remove overflow pages.)
    1060             :  *
    1061             :  * The split needs to retain pins on the primary bucket pages of both the old
    1062             :  * and new buckets until the end of the operation.  This prevents vacuum from
    1063             :  * starting while a split is in progress.
    1064             :  *
    1065             :  * In addition, the caller must have created the new bucket's base page,
    1066             :  * which is passed in buffer nbuf, pinned and write-locked.  The lock will be
    1067             :  * released here and pin must be released by the caller.  (The API is set up
    1068             :  * this way because we must do _hash_getnewbuf() before releasing the metapage
    1069             :  * write lock.  So instead of passing the new bucket's start block number, we
    1070             :  * pass an actual buffer.)
    1071             :  */
    1072             : static void
    1073        1332 : _hash_splitbucket(Relation rel,
    1074             :                   Buffer metabuf,
    1075             :                   Bucket obucket,
    1076             :                   Bucket nbucket,
    1077             :                   Buffer obuf,
    1078             :                   Buffer nbuf,
    1079             :                   HTAB *htab,
    1080             :                   uint32 maxbucket,
    1081             :                   uint32 highmask,
    1082             :                   uint32 lowmask)
    1083             : {
    1084             :     Buffer      bucket_obuf;
    1085             :     Buffer      bucket_nbuf;
    1086             :     Page        opage;
    1087             :     Page        npage;
    1088             :     HashPageOpaque oopaque;
    1089             :     HashPageOpaque nopaque;
    1090             :     OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
    1091             :     IndexTuple  itups[MaxIndexTuplesPerPage];
    1092        1332 :     Size        all_tups_size = 0;
    1093             :     int         i;
    1094        1332 :     uint16      nitups = 0;
    1095             : 
    1096        1332 :     bucket_obuf = obuf;
    1097        1332 :     opage = BufferGetPage(obuf);
    1098        1332 :     oopaque = HashPageGetOpaque(opage);
    1099             : 
    1100        1332 :     bucket_nbuf = nbuf;
    1101        1332 :     npage = BufferGetPage(nbuf);
    1102        1332 :     nopaque = HashPageGetOpaque(npage);
    1103             : 
    1104             :     /* Copy the predicate locks from old bucket to new bucket. */
    1105        1332 :     PredicateLockPageSplit(rel,
    1106             :                            BufferGetBlockNumber(bucket_obuf),
    1107             :                            BufferGetBlockNumber(bucket_nbuf));
    1108             : 
    1109             :     /*
    1110             :      * Partition the tuples in the old bucket between the old bucket and the
    1111             :      * new bucket, advancing along the old bucket's overflow bucket chain and
    1112             :      * adding overflow pages to the new bucket as needed.  Outer loop iterates
    1113             :      * once per page in old bucket.
    1114             :      */
    1115             :     for (;;)
    1116         338 :     {
    1117             :         BlockNumber oblkno;
    1118             :         OffsetNumber ooffnum;
    1119             :         OffsetNumber omaxoffnum;
    1120             : 
    1121             :         /* Scan each tuple in old page */
    1122        1670 :         omaxoffnum = PageGetMaxOffsetNumber(opage);
    1123      306892 :         for (ooffnum = FirstOffsetNumber;
    1124             :              ooffnum <= omaxoffnum;
    1125      305222 :              ooffnum = OffsetNumberNext(ooffnum))
    1126             :         {
    1127             :             IndexTuple  itup;
    1128             :             Size        itemsz;
    1129             :             Bucket      bucket;
    1130      305222 :             bool        found = false;
    1131             : 
    1132             :             /* skip dead tuples */
    1133      305222 :             if (ItemIdIsDead(PageGetItemId(opage, ooffnum)))
    1134           0 :                 continue;
    1135             : 
    1136             :             /*
    1137             :              * Before inserting a tuple, probe the hash table containing TIDs
    1138             :              * of tuples belonging to the new bucket; if we find a match, skip
    1139             :              * that tuple.  Otherwise, fetch the item's hash key (conveniently
    1140             :              * stored in the item) and determine which bucket it now belongs
    1141             :              * in.
    1142             :              */
    1143      305222 :             itup = (IndexTuple) PageGetItem(opage,
    1144             :                                             PageGetItemId(opage, ooffnum));
    1145             : 
    1146      305222 :             if (htab)
    1147           0 :                 (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found);
    1148             : 
    1149      305222 :             if (found)
    1150           0 :                 continue;
    1151             : 
    1152      305222 :             bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
    1153             :                                           maxbucket, highmask, lowmask);
    1154             : 
    1155      305222 :             if (bucket == nbucket)
    1156             :             {
    1157             :                 IndexTuple  new_itup;
    1158             : 
    1159             :                 /*
    1160             :                  * make a copy of index tuple as we have to scribble on it.
    1161             :                  */
    1162      124966 :                 new_itup = CopyIndexTuple(itup);
    1163             : 
    1164             :                 /*
    1165             :                  * Mark the index tuple as moved by split; such tuples are
    1166             :                  * skipped by scans while a split is in progress for the bucket.
    1167             :                  */
    1168      124966 :                 new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK;
    1169             : 
    1170             :                 /*
    1171             :                  * insert the tuple into the new bucket.  if it doesn't fit on
    1172             :                  * the current page in the new bucket, we must allocate a new
    1173             :                  * overflow page and place the tuple on that page instead.
    1174             :                  */
    1175      124966 :                 itemsz = IndexTupleSize(new_itup);
    1176      124966 :                 itemsz = MAXALIGN(itemsz);
    1177             : 
    1178      124966 :                 if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz))
    1179             :                 {
    1180             :                     /*
    1181             :                      * Change the shared buffer state in a critical section;
    1182             :                      * otherwise any error could make it unrecoverable.
    1183             :                      */
    1184          78 :                     START_CRIT_SECTION();
    1185             : 
    1186          78 :                     _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
    1187          78 :                     MarkBufferDirty(nbuf);
    1188             :                     /* log the split operation before releasing the lock */
    1189          78 :                     log_split_page(rel, nbuf);
    1190             : 
    1191          78 :                     END_CRIT_SECTION();
    1192             : 
    1193             :                     /* drop lock, but keep pin */
    1194          78 :                     LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
    1195             : 
    1196             :                     /* be tidy */
    1197       31824 :                     for (i = 0; i < nitups; i++)
    1198       31746 :                         pfree(itups[i]);
    1199          78 :                     nitups = 0;
    1200          78 :                     all_tups_size = 0;
    1201             : 
    1202             :                     /* chain to a new overflow page */
    1203          78 :                     nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf));
    1204          78 :                     npage = BufferGetPage(nbuf);
    1205          78 :                     nopaque = HashPageGetOpaque(npage);
    1206             :                 }
    1207             : 
    1208      124966 :                 itups[nitups++] = new_itup;
    1209      124966 :                 all_tups_size += itemsz;
    1210             :             }
    1211             :             else
    1212             :             {
    1213             :                 /*
    1214             :                  * the tuple stays on this page, so nothing to do.
    1215             :                  */
    1216             :                 Assert(bucket == obucket);
    1217             :             }
    1218             :         }
    1219             : 
    1220        1670 :         oblkno = oopaque->hasho_nextblkno;
    1221             : 
    1222             :         /* retain the pin on the old primary bucket */
    1223        1670 :         if (obuf == bucket_obuf)
    1224        1332 :             LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    1225             :         else
    1226         338 :             _hash_relbuf(rel, obuf);
    1227             : 
    1228             :         /* Exit loop if no more overflow pages in old bucket */
    1229        1670 :         if (!BlockNumberIsValid(oblkno))
    1230             :         {
    1231             :             /*
    1232             :              * Change the shared buffer state in a critical section; otherwise
    1233             :              * any error could make it unrecoverable.
    1234             :              */
    1235        1332 :             START_CRIT_SECTION();
    1236             : 
    1237        1332 :             _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
    1238        1332 :             MarkBufferDirty(nbuf);
    1239             :             /* log the split operation before releasing the lock */
    1240        1332 :             log_split_page(rel, nbuf);
    1241             : 
    1242        1332 :             END_CRIT_SECTION();
    1243             : 
    1244        1332 :             if (nbuf == bucket_nbuf)
    1245        1326 :                 LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
    1246             :             else
    1247           6 :                 _hash_relbuf(rel, nbuf);
    1248             : 
    1249             :             /* be tidy */
    1250       94552 :             for (i = 0; i < nitups; i++)
    1251       93220 :                 pfree(itups[i]);
    1252        1332 :             break;
    1253             :         }
    1254             : 
    1255             :         /* Else, advance to next old page */
    1256         338 :         obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE);
    1257         338 :         opage = BufferGetPage(obuf);
    1258         338 :         oopaque = HashPageGetOpaque(opage);
    1259             :     }
    1260             : 
    1261             :     /*
    1262             :      * We're at the end of the old bucket chain, so we're done partitioning
    1263             :      * the tuples.  Mark the old and new buckets to indicate split is
    1264             :      * finished.
    1265             :      *
    1266             :      * To avoid deadlocks due to locking order of buckets, first lock the old
    1267             :      * bucket and then the new bucket.
    1268             :      */
    1269        1332 :     LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE);
    1270        1332 :     opage = BufferGetPage(bucket_obuf);
    1271        1332 :     oopaque = HashPageGetOpaque(opage);
    1272             : 
    1273        1332 :     LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE);
    1274        1332 :     npage = BufferGetPage(bucket_nbuf);
    1275        1332 :     nopaque = HashPageGetOpaque(npage);
    1276             : 
    1277        1332 :     START_CRIT_SECTION();
    1278             : 
    1279        1332 :     oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT;
    1280        1332 :     nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED;
    1281             : 
    1282             :     /*
    1283             :      * After the split is finished, mark the old bucket to indicate that it
    1284             :      * contains deletable tuples.  We will clear the split-cleanup flag after
    1285             :      * deleting such tuples, either at the end of the split, at the next split
    1286             :      * from the old bucket, or at the time of vacuum.
    1287             :      */
    1288        1332 :     oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP;
    1289             : 
    1290             :     /*
    1291             :      * Now write the buffers.  We don't release the locks here; the caller is
    1292             :      * responsible for releasing them.
    1293             :      */
    1294        1332 :     MarkBufferDirty(bucket_obuf);
    1295        1332 :     MarkBufferDirty(bucket_nbuf);
    1296             : 
    1297        1332 :     if (RelationNeedsWAL(rel))
    1298             :     {
    1299             :         XLogRecPtr  recptr;
    1300             :         xl_hash_split_complete xlrec;
    1301             : 
    1302        1080 :         xlrec.old_bucket_flag = oopaque->hasho_flag;
    1303        1080 :         xlrec.new_bucket_flag = nopaque->hasho_flag;
    1304             : 
    1305        1080 :         XLogBeginInsert();
    1306             : 
    1307        1080 :         XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete);
    1308             : 
    1309        1080 :         XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD);
    1310        1080 :         XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD);
    1311             : 
    1312        1080 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE);
    1313             : 
    1314        1080 :         PageSetLSN(BufferGetPage(bucket_obuf), recptr);
    1315        1080 :         PageSetLSN(BufferGetPage(bucket_nbuf), recptr);
    1316             :     }
    1317             : 
    1318        1332 :     END_CRIT_SECTION();
    1319             : 
    1320             :     /*
    1321             :      * If possible, clean up the old bucket.  We might not be able to do this
    1322             :      * if someone else has a pin on it, but if not then we can go ahead.  This
    1323             :      * isn't absolutely necessary, but it reduces bloat; if we don't do it
    1324             :      * now, VACUUM will do it eventually, but maybe not until new overflow
    1325             :      * pages have been allocated.  Note that there's no need to clean up the
    1326             :      * new bucket.
    1327             :      */
    1328        1332 :     if (IsBufferCleanupOK(bucket_obuf))
    1329             :     {
    1330        1332 :         LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
    1331        1332 :         hashbucketcleanup(rel, obucket, bucket_obuf,
    1332             :                           BufferGetBlockNumber(bucket_obuf), NULL,
    1333             :                           maxbucket, highmask, lowmask, NULL, NULL, true,
    1334             :                           NULL, NULL);
    1335             :     }
    1336             :     else
    1337             :     {
    1338           0 :         LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
    1339           0 :         LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK);
    1340             :     }
    1341        1332 : }
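                     : 
                     : /*
                     :  * Illustrative sketch, not part of hashpage.c: the partition decision above
                     :  * hinges on _hash_hashkey2bucket() (defined in hashutil.c).  A standalone
                     :  * restatement of that mapping, with *_demo stand-in names: mask the hash key
                     :  * with the larger (post-split) mask, and if that points past the highest
                     :  * bucket that exists so far, fall back to the smaller mask.
                     :  */
                     : #include <stdint.h>
                     : 
                     : typedef uint32_t Bucket_demo;
                     : 
                     : static Bucket_demo
                     : hashkey2bucket_demo(uint32_t hashkey, uint32_t maxbucket,
                     :                     uint32_t highmask, uint32_t lowmask)
                     : {
                     :     Bucket_demo bucket = hashkey & highmask;
                     : 
                     :     /* not allocated yet under the larger mask: use the pre-split mapping */
                     :     if (bucket > maxbucket)
                     :         bucket &= lowmask;
                     : 
                     :     return bucket;
                     : }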
    1342             : 
    1343             : /*
    1344             :  *  _hash_finish_split() -- Finish the previously interrupted split operation
    1345             :  *
    1346             :  * To complete the split operation, we build a hash table of the TIDs already
    1347             :  * present in the new bucket; the split operation then uses it to skip tuples
    1348             :  * that were moved before the split was interrupted.
    1349             :  *
    1350             :  * The caller must hold a pin, but no lock, on the metapage and old bucket's
    1351             :  * primary page buffer.  The buffers are returned in the same state.  (The
    1352             :  * metapage is only touched if it becomes necessary to add or remove overflow
    1353             :  * pages.)
    1354             :  */
    1355             : void
    1356           0 : _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
    1357             :                    uint32 maxbucket, uint32 highmask, uint32 lowmask)
    1358             : {
    1359             :     HASHCTL     hash_ctl;
    1360             :     HTAB       *tidhtab;
    1361           0 :     Buffer      bucket_nbuf = InvalidBuffer;
    1362             :     Buffer      nbuf;
    1363             :     Page        npage;
    1364             :     BlockNumber nblkno;
    1365             :     BlockNumber bucket_nblkno;
    1366             :     HashPageOpaque npageopaque;
    1367             :     Bucket      nbucket;
    1368             :     bool        found;
    1369             : 
    1370             :     /* Initialize hash tables used to track TIDs */
    1371           0 :     hash_ctl.keysize = sizeof(ItemPointerData);
    1372           0 :     hash_ctl.entrysize = sizeof(ItemPointerData);
    1373           0 :     hash_ctl.hcxt = CurrentMemoryContext;
    1374             : 
    1375             :     tidhtab =
    1376           0 :         hash_create("bucket ctids",
    1377             :                     256,        /* arbitrary initial size */
    1378             :                     &hash_ctl,
    1379             :                     HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
    1380             : 
    1381           0 :     bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket);
    1382             : 
    1383             :     /*
    1384             :      * Scan the new bucket and build hash table of TIDs
    1385             :      */
    1386             :     for (;;)
    1387           0 :     {
    1388             :         OffsetNumber noffnum;
    1389             :         OffsetNumber nmaxoffnum;
    1390             : 
    1391           0 :         nbuf = _hash_getbuf(rel, nblkno, HASH_READ,
    1392             :                             LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    1393             : 
    1394             :         /* remember the primary bucket buffer to acquire cleanup lock on it. */
    1395           0 :         if (nblkno == bucket_nblkno)
    1396           0 :             bucket_nbuf = nbuf;
    1397             : 
    1398           0 :         npage = BufferGetPage(nbuf);
    1399           0 :         npageopaque = HashPageGetOpaque(npage);
    1400             : 
    1401             :         /* Scan each tuple in new page */
    1402           0 :         nmaxoffnum = PageGetMaxOffsetNumber(npage);
    1403           0 :         for (noffnum = FirstOffsetNumber;
    1404             :              noffnum <= nmaxoffnum;
    1405           0 :              noffnum = OffsetNumberNext(noffnum))
    1406             :         {
    1407             :             IndexTuple  itup;
    1408             : 
    1409             :             /* Fetch the item's TID and insert it in hash table. */
    1410           0 :             itup = (IndexTuple) PageGetItem(npage,
    1411             :                                             PageGetItemId(npage, noffnum));
    1412             : 
    1413           0 :             (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found);
    1414             : 
    1415             :             Assert(!found);
    1416             :         }
    1417             : 
    1418           0 :         nblkno = npageopaque->hasho_nextblkno;
    1419             : 
    1420             :         /*
    1421             :          * Release our lock without modifying the buffer, and make sure to
    1422             :          * retain the pin on the primary bucket.
    1423             :          */
    1424           0 :         if (nbuf == bucket_nbuf)
    1425           0 :             LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
    1426             :         else
    1427           0 :             _hash_relbuf(rel, nbuf);
    1428             : 
    1429             :         /* Exit loop if no more overflow pages in new bucket */
    1430           0 :         if (!BlockNumberIsValid(nblkno))
    1431           0 :             break;
    1432             :     }
    1433             : 
    1434             :     /*
    1435             :      * Conditionally get the cleanup lock on old and new buckets to perform
    1436             :      * the split operation.  If we can't get the cleanup locks, silently give
    1437             :      * up; the next insertion into the old bucket will try again to complete
    1438             :      * the split.
    1439             :      */
    1440           0 :     if (!ConditionalLockBufferForCleanup(obuf))
    1441             :     {
    1442           0 :         hash_destroy(tidhtab);
    1443           0 :         return;
    1444             :     }
    1445           0 :     if (!ConditionalLockBufferForCleanup(bucket_nbuf))
    1446             :     {
    1447           0 :         LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    1448           0 :         hash_destroy(tidhtab);
    1449           0 :         return;
    1450             :     }
    1451             : 
    1452           0 :     npage = BufferGetPage(bucket_nbuf);
    1453           0 :     npageopaque = HashPageGetOpaque(npage);
    1454           0 :     nbucket = npageopaque->hasho_bucket;
    1455             : 
    1456           0 :     _hash_splitbucket(rel, metabuf, obucket,
    1457             :                       nbucket, obuf, bucket_nbuf, tidhtab,
    1458             :                       maxbucket, highmask, lowmask);
    1459             : 
    1460           0 :     _hash_dropbuf(rel, bucket_nbuf);
    1461           0 :     hash_destroy(tidhtab);
    1462             : }
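                     : 
                     : /*
                     :  * Illustrative sketch, not part of hashpage.c: the dynahash table built above
                     :  * acts purely as a set of "TIDs already living in the new bucket", consulted
                     :  * so the restarted split does not move them twice.  A minimal stand-in for
                     :  * the same membership test, using a sorted array instead of dynahash (all
                     :  * *_demo names are hypothetical):
                     :  */
                     : #include <stdlib.h>
                     : #include <stdint.h>
                     : #include <stdbool.h>
                     : 
                     : struct demo_tid { uint32_t block; uint16_t offset; };
                     : 
                     : static int
                     : demo_tid_cmp(const void *a, const void *b)
                     : {
                     :     const struct demo_tid *x = a, *y = b;
                     : 
                     :     if (x->block != y->block)
                     :         return (x->block < y->block) ? -1 : 1;
                     :     if (x->offset != y->offset)
                     :         return (x->offset < y->offset) ? -1 : 1;
                     :     return 0;
                     : }
                     : 
                     : /* true if 'tid' was already moved before the split was interrupted */
                     : static bool
                     : demo_already_moved(const struct demo_tid *moved, size_t nmoved,
                     :                    const struct demo_tid *tid)
                     : {
                     :     /* 'moved' must be sorted with demo_tid_cmp, e.g. via qsort() */
                     :     return bsearch(tid, moved, nmoved, sizeof(struct demo_tid),
                     :                    demo_tid_cmp) != NULL;
                     : }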
    1463             : 
    1464             : /*
    1465             :  *  log_split_page() -- Log the split operation
    1466             :  *
    1467             :  *  We log the split operation when the new page in the new bucket gets full,
    1468             :  *  so we log the entire page.
    1469             :  *
    1470             :  *  'buf' must be locked by the caller, which is also responsible for unlocking
    1471             :  *  it.
    1472             :  */
    1473             : static void
    1474        1410 : log_split_page(Relation rel, Buffer buf)
    1475             : {
    1476        1410 :     if (RelationNeedsWAL(rel))
    1477             :     {
    1478             :         XLogRecPtr  recptr;
    1479             : 
    1480        1158 :         XLogBeginInsert();
    1481             : 
    1482        1158 :         XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
    1483             : 
    1484        1158 :         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE);
    1485             : 
    1486        1158 :         PageSetLSN(BufferGetPage(buf), recptr);
    1487             :     }
    1488        1410 : }
    1489             : 
    1490             : /*
    1491             :  *  _hash_getcachedmetap() -- Returns cached metapage data.
    1492             :  *
    1493             :  *  If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on
    1494             :  *  the metapage.  If it is InvalidBuffer, we'll set it before returning if we
    1495             :  *  have to refresh the cache, and return with a pin but no lock on it; the
    1496             :  *  caller is responsible for releasing the pin.
    1497             :  *
    1498             :  *  We refresh the cache if it's not initialized yet or force_refresh is true.
    1499             :  */
    1500             : HashMetaPage
    1501      725826 : _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
    1502             : {
    1503             :     Page        page;
    1504             : 
    1505             :     Assert(metabuf);
    1506      725826 :     if (force_refresh || rel->rd_amcache == NULL)
    1507             :     {
    1508        1110 :         char       *cache = NULL;
    1509             : 
    1510             :         /*
    1511             :          * It's important that we don't set rd_amcache to an invalid value.
    1512             :          * Either MemoryContextAlloc or _hash_getbuf could fail, so don't
    1513             :          * install a pointer to the newly-allocated storage in the actual
    1514             :          * relcache entry until both have succeeded.
    1515             :          */
    1516        1110 :         if (rel->rd_amcache == NULL)
    1517         508 :             cache = MemoryContextAlloc(rel->rd_indexcxt,
    1518             :                                        sizeof(HashMetaPageData));
    1519             : 
    1520             :         /* Read the metapage. */
    1521        1110 :         if (BufferIsValid(*metabuf))
    1522           0 :             LockBuffer(*metabuf, BUFFER_LOCK_SHARE);
    1523             :         else
    1524        1110 :             *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
    1525             :                                     LH_META_PAGE);
    1526        1110 :         page = BufferGetPage(*metabuf);
    1527             : 
    1528             :         /* Populate the cache. */
    1529        1110 :         if (rel->rd_amcache == NULL)
    1530         508 :             rel->rd_amcache = cache;
    1531        1110 :         memcpy(rel->rd_amcache, HashPageGetMeta(page),
    1532             :                sizeof(HashMetaPageData));
    1533             : 
    1534             :         /* Release metapage lock, but keep the pin. */
    1535        1110 :         LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);
    1536             :     }
    1537             : 
    1538      725826 :     return (HashMetaPage) rel->rd_amcache;
    1539             : }
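                     : 
                     : /*
                     :  * Illustrative sketch, not part of hashpage.c: the caching discipline above
                     :  * -- allocate first, perform every step that can fail, and only then install
                     :  * the pointer -- guarantees the cache slot never holds a half-initialized
                     :  * value.  A generic rendering of the same pattern with hypothetical names
                     :  * (PostgreSQL reports failures via ereport/elog rather than the bool return
                     :  * used here):
                     :  */
                     : #include <stdlib.h>
                     : #include <stdbool.h>
                     : 
                     : static bool
                     : demo_cache_fill(void **cache_slot,
                     :                 bool (*read_source)(void *dst, size_t len), size_t len)
                     : {
                     :     if (*cache_slot == NULL)
                     :     {
                     :         void *fresh = malloc(len);      /* may fail: nothing installed yet */
                     : 
                     :         if (fresh == NULL)
                     :             return false;
                     : 
                     :         if (!read_source(fresh, len))   /* may fail: still nothing installed */
                     :         {
                     :             free(fresh);
                     :             return false;
                     :         }
                     : 
                     :         *cache_slot = fresh;            /* install only after both steps succeed */
                     :     }
                     :     return true;
                     : }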
    1540             : 
    1541             : /*
    1542             :  *  _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
    1543             :  *                                       hashkey.
    1544             :  *
    1545             :  *  Bucket pages do not move or get removed once they are allocated.  This
    1546             :  *  gives us an opportunity to use the previously saved metapage contents to
    1547             :  *  reach the target bucket buffer, instead of reading from the metapage every
    1548             :  *  time.  This saves one buffer access every time we want to reach the target
    1549             :  *  bucket buffer, a considerable savings in bufmgr traffic and contention.
    1550             :  *
    1551             :  *  The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
    1552             :  *  bucket buffer has to be locked for reading or writing.
    1553             :  *
    1554             :  *  The out parameter cachedmetap is set to the metapage contents used for the
    1555             :  *  hashkey-to-bucket-buffer mapping.  Some callers need this info to reach the
    1556             :  *  old bucket in case of a bucket split; see _hash_doinsert().
    1557             :  */
    1558             : Buffer
    1559      725194 : _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
    1560             :                                 HashMetaPage *cachedmetap)
    1561             : {
    1562             :     HashMetaPage metap;
    1563             :     Buffer      buf;
    1564      725194 :     Buffer      metabuf = InvalidBuffer;
    1565             :     Page        page;
    1566             :     Bucket      bucket;
    1567             :     BlockNumber blkno;
    1568             :     HashPageOpaque opaque;
    1569             : 
    1570             :     /* We read from the target bucket buffer, hence locking is a must. */
    1571             :     Assert(access == HASH_READ || access == HASH_WRITE);
    1572             : 
    1573      725194 :     metap = _hash_getcachedmetap(rel, &metabuf, false);
    1574             :     Assert(metap != NULL);
    1575             : 
    1576             :     /*
    1577             :      * Loop until we get a lock on the correct target bucket.
    1578             :      */
    1579             :     for (;;)
    1580             :     {
    1581             :         /*
    1582             :          * Compute the target bucket number, and convert to block number.
    1583             :          */
    1584      725796 :         bucket = _hash_hashkey2bucket(hashkey,
    1585             :                                       metap->hashm_maxbucket,
    1586             :                                       metap->hashm_highmask,
    1587             :                                       metap->hashm_lowmask);
    1588             : 
    1589      725796 :         blkno = BUCKET_TO_BLKNO(metap, bucket);
    1590             : 
    1591             :         /* Fetch the primary bucket page for the bucket */
    1592      725796 :         buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
    1593      725796 :         page = BufferGetPage(buf);
    1594      725796 :         opaque = HashPageGetOpaque(page);
    1595             :         Assert(opaque->hasho_bucket == bucket);
    1596             :         Assert(opaque->hasho_prevblkno != InvalidBlockNumber);
    1597             : 
    1598             :         /*
    1599             :          * If this bucket hasn't been split, we're done.
    1600             :          */
    1601      725796 :         if (opaque->hasho_prevblkno <= metap->hashm_maxbucket)
    1602      725194 :             break;
    1603             : 
    1604             :         /* Drop lock on this buffer, update cached metapage, and retry. */
    1605         602 :         _hash_relbuf(rel, buf);
    1606         602 :         metap = _hash_getcachedmetap(rel, &metabuf, true);
    1607             :         Assert(metap != NULL);
    1608             :     }
    1609             : 
    1610      725194 :     if (BufferIsValid(metabuf))
    1611        1108 :         _hash_dropbuf(rel, metabuf);
    1612             : 
    1613      725194 :     if (cachedmetap)
    1614      724662 :         *cachedmetap = metap;
    1615             : 
    1616      725194 :     return buf;
    1617             : }

Generated by: LCOV version 1.14