LCOV - code coverage report
Current view: top level - src/backend/utils/hash - dynahash.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 412 510 80.8 %
Date: 2025-01-18 04:15:08 Functions: 34 37 91.9 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * dynahash.c
       4             :  *    dynamic chained hash tables
       5             :  *
       6             :  * dynahash.c supports both local-to-a-backend hash tables and hash tables in
       7             :  * shared memory.  For shared hash tables, it is the caller's responsibility
       8             :  * to provide appropriate access interlocking.  The simplest convention is
       9             :  * that a single LWLock protects the whole hash table.  Searches (HASH_FIND or
      10             :  * hash_seq_search) need only shared lock, but any update requires exclusive
      11             :  * lock.  For heavily-used shared tables, the single-lock approach creates a
      12             :  * concurrency bottleneck, so we also support "partitioned" locking wherein
      13             :  * there are multiple LWLocks guarding distinct subsets of the table.  To use
      14             :  * a hash table in partitioned mode, the HASH_PARTITION flag must be given
      15             :  * to hash_create.  This prevents any attempt to split buckets on-the-fly.
      16             :  * Therefore, each hash bucket chain operates independently, and no fields
      17             :  * of the hash header change after init except nentries and freeList.
      18             :  * (A partitioned table uses multiple copies of those fields, guarded by
      19             :  * spinlocks, for additional concurrency.)
      20             :  * This lets any subset of the hash buckets be treated as a separately
      21             :  * lockable partition.  We expect callers to use the low-order bits of a
      22             :  * lookup key's hash value as a partition number --- this will work because
      23             :  * of the way calc_bucket() maps hash values to bucket numbers.
      24             :  *
      25             :  * For hash tables in shared memory, the memory allocator function should
      26             :  * match malloc's semantics of returning NULL on failure.  For hash tables
      27             :  * in local memory, we typically use palloc() which will throw error on
      28             :  * failure.  The code in this file has to cope with both cases.
      29             :  *
      30             :  * dynahash.c provides support for these types of lookup keys:
      31             :  *
      32             :  * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
      33             :  * compared as though by strcmp().  This is selected by specifying the
      34             :  * HASH_STRINGS flag to hash_create.
      35             :  *
      36             :  * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
      37             :  * (Caller must ensure there are no undefined padding bits in the keys!)
      38             :  * This is selected by specifying the HASH_BLOBS flag to hash_create.
      39             :  *
      40             :  * 3. More complex key behavior can be selected by specifying user-supplied
      41             :  * hashing, comparison, and/or key-copying functions.  At least a hashing
      42             :  * function must be supplied; comparison defaults to memcmp() and key copying
      43             :  * to memcpy() when a user-defined hashing function is selected.
      44             :  *
      45             :  * Compared to simplehash, dynahash has the following benefits:
      46             :  *
      47             :  * - It supports partitioning, which is useful for shared memory access using
      48             :  *   locks.
      49             :  * - Shared memory hashes are allocated in a fixed size area at startup and
      50             :  *   are discoverable by name from other processes.
      51             :  * - Because entries don't need to be moved in the case of hash conflicts,
      52             :  *   dynahash has better performance for large entries.
      53             :  * - Guarantees stable pointers to entries.
      54             :  *
      55             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      56             :  * Portions Copyright (c) 1994, Regents of the University of California
      57             :  *
      58             :  *
      59             :  * IDENTIFICATION
      60             :  *    src/backend/utils/hash/dynahash.c
      61             :  *
      62             :  *-------------------------------------------------------------------------
      63             :  */
      64             : 
      65             : /*
      66             :  * Original comments:
      67             :  *
      68             :  * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
      69             :  * Coded into C, with minor code improvements, and with hsearch(3) interface,
      70             :  * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
      71             :  * also, hcreate/hdestroy routines added to simulate hsearch(3).
      72             :  *
      73             :  * These routines simulate hsearch(3) and family, with the important
      74             :  * difference that the hash table is dynamic - can grow indefinitely
      75             :  * beyond its original size (as supplied to hcreate()).
      76             :  *
      77             :  * Performance appears to be comparable to that of hsearch(3).
      78             :  * The 'source-code' options referred to in hsearch(3)'s 'man' page
      79             :  * are not implemented; otherwise functionality is identical.
      80             :  *
      81             :  * Compilation controls:
      82             :  * HASH_DEBUG controls some informative traces, mainly for debugging.
      83             :  * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
      84             :  * when combined with HASH_DEBUG, these are displayed by hdestroy().
      85             :  *
      86             :  * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
      87             :  * concatenation property, in probably unnecessary code 'optimization'.
      88             :  *
      89             :  * Modified margo@postgres.berkeley.edu February 1990
      90             :  *      added multiple table interface
      91             :  * Modified by sullivan@postgres.berkeley.edu April 1990
      92             :  *      changed ctl structure for shared memory
      93             :  */
      94             : 
      95             : #include "postgres.h"
      96             : 
      97             : #include <limits.h>
      98             : 
      99             : #include "access/xact.h"
     100             : #include "common/hashfn.h"
     101             : #include "port/pg_bitutils.h"
     102             : #include "storage/shmem.h"
     103             : #include "storage/spin.h"
     104             : #include "utils/dynahash.h"
     105             : #include "utils/memutils.h"
     106             : 
     107             : 
     108             : /*
     109             :  * Constants
     110             :  *
     111             :  * A hash table has a top-level "directory", each of whose entries points
     112             :  * to a "segment" of ssize bucket headers.  The maximum number of hash
     113             :  * buckets is thus dsize * ssize (but dsize may be expansible).  Of course,
     114             :  * the number of records in the table can be larger, but we don't want a
     115             :  * whole lot of records per bucket or performance goes down.
     116             :  *
     117             :  * In a hash table allocated in shared memory, the directory cannot be
     118             :  * expanded because it must stay at a fixed address.  The directory size
     119             :  * should be selected using hash_select_dirsize (and you'd better have
     120             :  * a good idea of the maximum number of entries!).  For non-shared hash
     121             :  * tables, the initial directory size can be left at the default.
     122             :  */
     123             : #define DEF_SEGSIZE            256  /* default ssize: bucket headers per segment */
     124             : #define DEF_SEGSIZE_SHIFT      8    /* must be log2(DEF_SEGSIZE) */
     125             : #define DEF_DIRSIZE            256  /* default dsize: directory entries (segment pointers) */
     126             : 
     127             : /* Number of freelists to be used for a partitioned hash table (an unpartitioned table uses only freeList[0]). */
     128             : #define NUM_FREELISTS           32  /* dimension of HASHHDR.freeList[] and the modulus in FREELIST_IDX() */
     129             : 
     130             : /* A hash bucket is a linked list of HASHELEMENTs; a HASHBUCKET points at the chain's head element */
     131             : typedef HASHELEMENT *HASHBUCKET;
     132             : 
     133             : /* A hash segment is an array of bucket headers (i.e. of HASHBUCKET chain-head pointers) */
     134             : typedef HASHBUCKET *HASHSEGMENT;
     135             : 
     136             : /*
     137             :  * Per-freelist data.
     138             :  *
     139             :  * In a partitioned hash table, each freelist is associated with a specific
     140             :  * set of hashcodes, as determined by the FREELIST_IDX() macro below.
     141             :  * nentries tracks the number of live hashtable entries having those hashcodes
     142             :  * (NOT the number of entries in the freelist, as you might expect).
     143             :  *
     144             :  * The coverage of a freelist might be more or less than one partition, so it
     145             :  * needs its own lock rather than relying on caller locking.  Relying on that
     146             :  * wouldn't work even if the coverage was the same, because of the occasional
     147             :  * need to "borrow" entries from another freelist; see get_hash_entry().
     148             :  *
     149             :  * Using an array of FreeListData instead of separate arrays of mutexes,
     150             :  * nentries and freeLists helps to reduce sharing of cache lines between
     151             :  * different mutexes.
     152             :  */
     153             : typedef struct
     154             : {
     155             :     slock_t     mutex;          /* spinlock for this freelist (not used if the table is unpartitioned) */
     156             :     long        nentries;       /* number of live entries in associated buckets (NOT the free-list length) */
     157             :     HASHELEMENT *freeList;      /* chain of free elements */
     158             : } FreeListData;
     159             : 
     160             : /*
     161             :  * Header structure for a hash table --- contains all changeable info
     162             :  *
     163             :  * In a shared-memory hash table, the HASHHDR is in shared memory, while
     164             :  * each backend has a local HTAB struct.  For a non-shared table, there isn't
     165             :  * any functional difference between HASHHDR and HTAB, but we separate them
     166             :  * anyway to share code between shared and non-shared tables.
     167             :  */
     168             : struct HASHHDR
     169             : {
     170             :     /*
     171             :      * The freelist can become a point of contention in high-concurrency hash
     172             :      * tables, so we use an array of freelists, each with its own mutex and
     173             :      * nentries count, instead of just a single one.  Although the freelists
     174             :      * normally operate independently, we will scavenge entries from freelists
     175             :      * other than a hashcode's default freelist when necessary.
     176             :      *
     177             :      * If the hash table is not partitioned, only freeList[0] is used and its
     178             :      * spinlock is not used at all; callers' locking is assumed sufficient.
     179             :      */
     180             :     FreeListData freeList[NUM_FREELISTS];
     181             : 
     182             :     /* These fields can change, but not in a partitioned table */
     183             :     /* Also, dsize can't change in a shared table, even if unpartitioned */
     184             :     long        dsize;          /* directory size (number of segment pointers) */
     185             :     long        nsegs;          /* number of allocated segments (<= dsize) */
     186             :     uint32      max_bucket;     /* ID of maximum bucket in use */
     187             :     uint32      high_mask;      /* mask to modulo into entire table */
     188             :     uint32      low_mask;       /* mask to modulo into lower half of table */
     189             : 
     190             :     /* These fields are fixed at hashtable creation */
     191             :     Size        keysize;        /* hash key length in bytes */
     192             :     Size        entrysize;      /* total user element size in bytes (key included) */
     193             :     long        num_partitions; /* # partitions (must be power of 2), or 0 if not partitioned; tested by IS_PARTITIONED() */
     194             :     long        max_dsize;      /* 'dsize' limit if directory is fixed size */
     195             :     long        ssize;          /* segment size (bucket headers per segment) --- must be power of 2 */
     196             :     int         sshift;         /* segment shift = log2(ssize) */
     197             :     int         nelem_alloc;    /* number of entries to allocate at once */
     198             : 
     199             : #ifdef HASH_STATISTICS
     200             : 
     201             :     /*
     202             :      * Count statistics here.  NB: stats code doesn't bother with mutex, so
     203             :      * counts could be corrupted a bit in a partitioned table.
     204             :      */
     205             :     long        accesses;       /* NOTE(review): presumably lookups made on this table -- confirm against the stats code */
     206             :     long        collisions;     /* NOTE(review): presumably bucket-chain collisions seen -- confirm against the stats code */
     207             : #endif
     208             : };
     209             : 
     210             : #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)  /* true iff the header records a nonzero partition count */
     211             : 
     212             : #define FREELIST_IDX(hctl, hashcode) \
     213             :     (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)  /* unpartitioned tables always map to freelist 0 */
     214             : 
     215             : /*
     216             :  * Top control structure for a hashtable --- in a shared table, each backend
     217             :  * has its own copy (OK since no fields change at runtime)
     218             :  */
     219             : struct HTAB
     220             : {
     221             :     HASHHDR    *hctl;           /* => shared control information (the HASHHDR) */
     222             :     HASHSEGMENT *dir;           /* directory of segment starts */
     223             :     HashValueFunc hash;         /* hash function */
     224             :     HashCompareFunc match;      /* key comparison function */
     225             :     HashCopyFunc keycopy;       /* key copying function */
     226             :     HashAllocFunc alloc;        /* memory allocator */
     227             :     MemoryContext hcxt;         /* memory context if default allocator used (hash_create sets NULL for shared tables) */
     228             :     char       *tabname;        /* table name (for error messages); hash_create stores the copy right after this struct */
     229             :     bool        isshared;       /* true if table is in shared memory */
     230             :     bool        isfixed;        /* if true, don't enlarge */
     231             : 
     232             :     /* freezing a shared table isn't allowed, so we can keep state here */
     233             :     bool        frozen;         /* true = no more inserts allowed */
     234             : 
     235             :     /* We keep local copies of these fixed values to reduce contention */
     236             :     Size        keysize;        /* hash key length in bytes (copy of hctl->keysize) */
     237             :     long        ssize;          /* segment size --- must be power of 2 (copy of hctl->ssize) */
     238             :     int         sshift;         /* segment shift = log2(ssize) (copy of hctl->sshift) */
     239             : };
     240             : 
     241             : /*
     242             :  * Key (also entry) part of a HASHELEMENT: it starts MAXALIGN(sizeof(HASHELEMENT)) bytes past the element header
     243             :  */
     244             : #define ELEMENTKEY(helem)  (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
     245             : 
     246             : /*
     247             :  * Obtain element pointer given pointer to key (inverse of ELEMENTKEY: steps back over the MAXALIGN'd header)
     248             :  */
     249             : #define ELEMENT_FROM_KEY(key)  \
     250             :     ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))
     251             : 
     252             : /*
     253             :  * Fast MOD arithmetic, assuming that y is a power of 2 !  (Computes x & (y-1), which equals x mod y only in that case.)
     254             :  */
     255             : #define MOD(x,y)               ((x) & ((y)-1))
     256             : 
     257             : #ifdef HASH_STATISTICS
     258             : static long hash_accesses,      /* file-level counters, compiled in only when HASH_STATISTICS is defined */
     259             :             hash_collisions,
     260             :             hash_expansions;
     261             : #endif
     262             : 
     263             : /*
     264             :  * Private function prototypes (all static, i.e. visible only within this file)
     265             :  */
     266             : static void *DynaHashAlloc(Size size);
     267             : static HASHSEGMENT seg_alloc(HTAB *hashp);
     268             : static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
     269             : static bool dir_realloc(HTAB *hashp);
     270             : static bool expand_table(HTAB *hashp);
     271             : static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
     272             : static void hdefault(HTAB *hashp);
     273             : static int  choose_nelem_alloc(Size entrysize);
     274             : static bool init_htab(HTAB *hashp, long nelem);
     275             : static void hash_corrupted(HTAB *hashp) pg_attribute_noreturn();
     276             : static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue,
     277             :                                   HASHBUCKET **bucketptr);
     278             : static long next_pow2_long(long num);
     279             : static int  next_pow2_int(long num);
     280             : static void register_seq_scan(HTAB *hashp);
     281             : static void deregister_seq_scan(HTAB *hashp);
     282             : static bool has_seq_scans(HTAB *hashp);
     283             : 
     284             : 
     285             : /*
     286             :  * memory allocation support
     287             :  */
     288             : static MemoryContext CurrentDynaHashCxt = NULL; /* context DynaHashAlloc() allocates in; hash_create() sets it before allocating */
     289             : 
     290             : static void *
     291     2579248 : DynaHashAlloc(Size size)        /* default allocator (used when HASH_ALLOC is not given): takes memory from CurrentDynaHashCxt */
     292             : {
     293             :     Assert(MemoryContextIsValid(CurrentDynaHashCxt));   /* the caller must have set the context beforehand */
     294     2579248 :     return MemoryContextAllocExtended(CurrentDynaHashCxt, size,
     295             :                                       MCXT_ALLOC_NO_OOM);   /* no-OOM flag: callers such as hash_create() test for a NULL result themselves */
     296             : }
     297             : 
     298             : 
     299             : /*
     300             :  * HashCompareFunc for string keys; returns strncmp()'s result (0 means the keys match within the compared bytes)
     301             :  *
     302             :  * Because we copy keys with strlcpy(), they will be truncated at keysize-1
     303             :  * bytes, so we can only compare that many ... hence strncmp is almost but
     304             :  * not quite the right thing.
     305             :  */
     306             : static int
     307      908862 : string_compare(const char *key1, const char *key2, Size keysize)
     308             : {
     309      908862 :     return strncmp(key1, key2, keysize - 1);    /* keysize - 1: the most bytes a stored (truncated) key can hold */
     310             : }
     311             : 
     312             : 
     313             : /************************** CREATE ROUTINES **********************/
     314             : 
     315             : /*
     316             :  * hash_create -- create a new dynamic hash table
     317             :  *
     318             :  *  tabname: a name for the table (for debugging purposes)
     319             :  *  nelem: maximum number of elements expected
     320             :  *  *info: additional table parameters, as indicated by flags
     321             :  *  flags: bitmask indicating which parameters to take from *info
     322             :  *
     323             :  * The flags value *must* include HASH_ELEM.  (Formerly, this was nominally
     324             :  * optional, but the default keysize and entrysize values were useless.)
     325             :  * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS,
     326             :  * or HASH_FUNCTION, to define the key hashing semantics (C strings,
     327             :  * binary blobs, or custom, respectively).  Callers specifying a custom
     328             :  * hash function will likely also want to use HASH_COMPARE, and perhaps
     329             :  * also HASH_KEYCOPY, to control key comparison and copying.
     330             :  * Another often-used flag is HASH_CONTEXT, to allocate the hash table
     331             :  * under info->hcxt rather than under TopMemoryContext; the default
     332             :  * behavior is only suitable for session-lifespan hash tables.
     333             :  * Other flags bits are special-purpose and seldom used, except for those
     334             :  * associated with shared-memory hash tables, for which see ShmemInitHash().
     335             :  *
     336             :  * Fields in *info are read only when the associated flags bit is set.
     337             :  * It is not necessary to initialize other fields of *info.
     338             :  * Neither tabname nor *info need persist after the hash_create() call.
     339             :  *
     340             :  * Note: It is deprecated for callers of hash_create() to explicitly specify
     341             :  * string_hash, tag_hash, uint32_hash, or oid_hash.  Just set HASH_STRINGS or
     342             :  * HASH_BLOBS.  Use HASH_FUNCTION only when you want something other than
     343             :  * one of these.
     344             :  *
     345             :  * Note: for a shared-memory hashtable, nelem needs to be a pretty good
     346             :  * estimate, since we can't expand the table on the fly.  But an unshared
     347             :  * hashtable can be expanded on-the-fly, so it's better for nelem to be
     348             :  * on the small side and let the table grow if it's exceeded.  An overly
     349             :  * large nelem will penalize hash_seq_search speed without buying much.
     350             :  */
     351             : HTAB *
     352      480650 : hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
     353             : {
     354             :     HTAB       *hashp;
     355             :     HASHHDR    *hctl;
     356             : 
     357             :     /*
     358             :      * Hash tables now allocate space for key and data, but you have to say
     359             :      * how much space to allocate.
     360             :      */
     361             :     Assert(flags & HASH_ELEM);
     362             :     Assert(info->keysize > 0);
     363             :     Assert(info->entrysize >= info->keysize);
     364             : 
     365             :     /*
     366             :      * For shared hash tables, we have a local hash header (HTAB struct) that
     367             :      * we allocate in TopMemoryContext; all else is in shared memory.
     368             :      *
     369             :      * For non-shared hash tables, everything including the hash header is in
     370             :      * a memory context created specially for the hash table --- this makes
     371             :      * hash_destroy very simple.  The memory context is made a child of either
     372             :      * a context specified by the caller, or TopMemoryContext if nothing is
     373             :      * specified.
     374             :      */
     375      480650 :     if (flags & HASH_SHARED_MEM)
     376             :     {
     377             :         /* Set up to allocate the hash header */
     378       17276 :         CurrentDynaHashCxt = TopMemoryContext;
     379             :     }
     380             :     else
     381             :     {
     382             :         /* Create the hash table's private memory context */
     383      463374 :         if (flags & HASH_CONTEXT)
     384      205450 :             CurrentDynaHashCxt = info->hcxt;
     385             :         else
     386      257924 :             CurrentDynaHashCxt = TopMemoryContext;
     387      463374 :         CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
     388             :                                                    "dynahash",
     389             :                                                    ALLOCSET_DEFAULT_SIZES);
     390             :     }
     391             : 
     392             :     /* Initialize the hash header, plus a copy of the table name */
     393      480650 :     hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1);
     394     6248450 :     MemSet(hashp, 0, sizeof(HTAB));
     395             : 
     396      480650 :     hashp->tabname = (char *) (hashp + 1);
     397      480650 :     strcpy(hashp->tabname, tabname);
     398             : 
     399             :     /* If we have a private context, label it with hashtable's name */
     400      480650 :     if (!(flags & HASH_SHARED_MEM))
     401      463374 :         MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname);
     402             : 
     403             :     /*
     404             :      * Select the appropriate hash function (see comments at head of file).
     405             :      */
     406      480650 :     if (flags & HASH_FUNCTION)
     407             :     {
     408             :         Assert(!(flags & (HASH_BLOBS | HASH_STRINGS)));
     409       22348 :         hashp->hash = info->hash;
     410             :     }
     411      458302 :     else if (flags & HASH_BLOBS)
     412             :     {
     413             :         Assert(!(flags & HASH_STRINGS));
     414             :         /* We can optimize hashing for common key sizes */
     415      364626 :         if (info->keysize == sizeof(uint32))
     416      207186 :             hashp->hash = uint32_hash;
     417             :         else
     418      157440 :             hashp->hash = tag_hash;
     419             :     }
     420             :     else
     421             :     {
     422             :         /*
     423             :          * string_hash used to be considered the default hash method, and in a
     424             :          * non-assert build it effectively still is.  But we now consider it
     425             :          * an assertion error to not say HASH_STRINGS explicitly.  To help
     426             :          * catch mistaken usage of HASH_STRINGS, we also insist on a
     427             :          * reasonably long string length: if the keysize is only 4 or 8 bytes,
     428             :          * it's almost certainly an integer or pointer not a string.
     429             :          */
     430             :         Assert(flags & HASH_STRINGS);
     431             :         Assert(info->keysize > 8);
     432             : 
     433       93676 :         hashp->hash = string_hash;
     434             :     }
     435             : 
     436             :     /*
     437             :      * If you don't specify a match function, it defaults to string_compare if
     438             :      * you used string_hash, and to memcmp otherwise.
     439             :      *
     440             :      * Note: explicitly specifying string_hash is deprecated, because this
     441             :      * might not work for callers in loadable modules on some platforms due to
     442             :      * referencing a trampoline instead of the string_hash function proper.
     443             :      * Specify HASH_STRINGS instead.
     444             :      */
     445      480650 :     if (flags & HASH_COMPARE)
     446       11058 :         hashp->match = info->match;
     447      469592 :     else if (hashp->hash == string_hash)
     448       93676 :         hashp->match = (HashCompareFunc) string_compare;
     449             :     else
     450      375916 :         hashp->match = memcmp;
     451             : 
     452             :     /*
     453             :      * Similarly, the key-copying function defaults to strlcpy or memcpy.
     454             :      */
     455      480650 :     if (flags & HASH_KEYCOPY)
     456           0 :         hashp->keycopy = info->keycopy;
     457      480650 :     else if (hashp->hash == string_hash)
     458             :     {
     459             :         /*
     460             :          * The signature of keycopy is meant for memcpy(), which returns
     461             :          * void*, but strlcpy() returns size_t.  Since we never use the return
     462             :          * value of keycopy, and size_t is pretty much always the same size as
     463             :          * void *, this should be safe.  The extra cast in the middle is to
     464             :          * avoid warnings from -Wcast-function-type.
     465             :          */
     466       93676 :         hashp->keycopy = (HashCopyFunc) (pg_funcptr_t) strlcpy;
     467             :     }
     468             :     else
     469      386974 :         hashp->keycopy = memcpy;
     470             : 
     471             :     /* And select the entry allocation function, too. */
     472      480650 :     if (flags & HASH_ALLOC)
     473       17276 :         hashp->alloc = info->alloc;
     474             :     else
     475      463374 :         hashp->alloc = DynaHashAlloc;
     476             : 
     477      480650 :     if (flags & HASH_SHARED_MEM)
     478             :     {
     479             :         /*
     480             :          * ctl structure and directory are preallocated for shared memory
     481             :          * tables.  Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
     482             :          * well.
     483             :          */
     484       17276 :         hashp->hctl = info->hctl;
     485       17276 :         hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
     486       17276 :         hashp->hcxt = NULL;
     487       17276 :         hashp->isshared = true;
     488             : 
     489             :         /* hash table already exists, we're just attaching to it */
     490       17276 :         if (flags & HASH_ATTACH)
     491             :         {
     492             :             /* make local copies of some heavily-used values */
     493           0 :             hctl = hashp->hctl;
     494           0 :             hashp->keysize = hctl->keysize;
     495           0 :             hashp->ssize = hctl->ssize;
     496           0 :             hashp->sshift = hctl->sshift;
     497             : 
     498           0 :             return hashp;
     499             :         }
     500             :     }
     501             :     else
     502             :     {
     503             :         /* setup hash table defaults */
     504      463374 :         hashp->hctl = NULL;
     505      463374 :         hashp->dir = NULL;
     506      463374 :         hashp->hcxt = CurrentDynaHashCxt;
     507      463374 :         hashp->isshared = false;
     508             :     }
     509             : 
     510      480650 :     if (!hashp->hctl)
     511             :     {
     512      463374 :         hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
     513      463374 :         if (!hashp->hctl)
     514           0 :             ereport(ERROR,
     515             :                     (errcode(ERRCODE_OUT_OF_MEMORY),
     516             :                      errmsg("out of memory")));
     517             :     }
     518             : 
     519      480650 :     hashp->frozen = false;
     520             : 
     521      480650 :     hdefault(hashp);
     522             : 
     523      480650 :     hctl = hashp->hctl;
     524             : 
     525      480650 :     if (flags & HASH_PARTITION)
     526             :     {
     527             :         /* Doesn't make sense to partition a local hash table */
     528             :         Assert(flags & HASH_SHARED_MEM);
     529             : 
     530             :         /*
     531             :          * The number of partitions had better be a power of 2. Also, it must
     532             :          * be less than INT_MAX (see init_htab()), so call the int version of
     533             :          * next_pow2.
     534             :          */
     535             :         Assert(info->num_partitions == next_pow2_int(info->num_partitions));
     536             : 
     537        9590 :         hctl->num_partitions = info->num_partitions;
     538             :     }
     539             : 
     540      480650 :     if (flags & HASH_SEGMENT)
     541             :     {
     542           0 :         hctl->ssize = info->ssize;
     543           0 :         hctl->sshift = my_log2(info->ssize);
     544             :         /* ssize had better be a power of 2 */
     545             :         Assert(hctl->ssize == (1L << hctl->sshift));
     546             :     }
     547             : 
     548             :     /*
     549             :      * SHM hash tables have fixed directory size passed by the caller.
     550             :      */
     551      480650 :     if (flags & HASH_DIRSIZE)
     552             :     {
     553       17276 :         hctl->max_dsize = info->max_dsize;
     554       17276 :         hctl->dsize = info->dsize;
     555             :     }
     556             : 
     557             :     /* remember the entry sizes, too */
     558      480650 :     hctl->keysize = info->keysize;
     559      480650 :     hctl->entrysize = info->entrysize;
     560             : 
     561             :     /* make local copies of heavily-used constant fields */
     562      480650 :     hashp->keysize = hctl->keysize;
     563      480650 :     hashp->ssize = hctl->ssize;
     564      480650 :     hashp->sshift = hctl->sshift;
     565             : 
     566             :     /* Build the hash directory structure */
     567      480650 :     if (!init_htab(hashp, nelem))
     568           0 :         elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
     569             : 
     570             :     /*
     571             :      * For a shared hash table, preallocate the requested number of elements.
     572             :      * This reduces problems with run-time out-of-shared-memory conditions.
     573             :      *
     574             :      * For a non-shared hash table, preallocate the requested number of
     575             :      * elements if it's less than our chosen nelem_alloc.  This avoids wasting
     576             :      * space if the caller correctly estimates a small table size.
     577             :      */
     578      480650 :     if ((flags & HASH_SHARED_MEM) ||
     579      463374 :         nelem < hctl->nelem_alloc)
     580             :     {
     581             :         int         i,
     582             :                     freelist_partitions,
     583             :                     nelem_alloc,
     584             :                     nelem_alloc_first;
     585             : 
     586             :         /*
     587             :          * If hash table is partitioned, give each freelist an equal share of
     588             :          * the initial allocation.  Otherwise only freeList[0] is used.
     589             :          */
     590      221458 :         if (IS_PARTITIONED(hashp->hctl))
     591        9590 :             freelist_partitions = NUM_FREELISTS;
     592             :         else
     593      211868 :             freelist_partitions = 1;
     594             : 
     595      221458 :         nelem_alloc = nelem / freelist_partitions;
     596      221458 :         if (nelem_alloc <= 0)
     597           0 :             nelem_alloc = 1;
     598             : 
     599             :         /*
     600             :          * Make sure we'll allocate all the requested elements; freeList[0]
     601             :          * gets the excess if the request isn't divisible by NUM_FREELISTS.
     602             :          */
     603      221458 :         if (nelem_alloc * freelist_partitions < nelem)
     604          94 :             nelem_alloc_first =
     605          94 :                 nelem - nelem_alloc * (freelist_partitions - 1);
     606             :         else
     607      221364 :             nelem_alloc_first = nelem_alloc;
     608             : 
     609      740206 :         for (i = 0; i < freelist_partitions; i++)
     610             :         {
     611      518748 :             int         temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
     612             : 
     613      518748 :             if (!element_alloc(hashp, temp, i))
     614           0 :                 ereport(ERROR,
     615             :                         (errcode(ERRCODE_OUT_OF_MEMORY),
     616             :                          errmsg("out of memory")));
     617             :         }
     618             :     }
     619             : 
     620      480650 :     if (flags & HASH_FIXED_SIZE)
     621        5754 :         hashp->isfixed = true;
     622      480650 :     return hashp;
     623             : }
     624             : 
     625             : /*
     626             :  * Set default HASHHDR parameters.
     627             :  */
     628             : static void
     629      480650 : hdefault(HTAB *hashp)
     630             : {
     631      480650 :     HASHHDR    *hctl = hashp->hctl;
     632             : 
     633    51429550 :     MemSet(hctl, 0, sizeof(HASHHDR));
     634             : 
     635      480650 :     hctl->dsize = DEF_DIRSIZE;
     636      480650 :     hctl->nsegs = 0;
     637             : 
     638      480650 :     hctl->num_partitions = 0;    /* not partitioned */
     639             : 
     640             :     /* table has no fixed maximum size */
     641      480650 :     hctl->max_dsize = NO_MAX_DSIZE;
     642             : 
     643      480650 :     hctl->ssize = DEF_SEGSIZE;
     644      480650 :     hctl->sshift = DEF_SEGSIZE_SHIFT;
     645             : 
     646             : #ifdef HASH_STATISTICS
     647             :     hctl->accesses = hctl->collisions = 0;
     648             : #endif
     649      480650 : }
     650             : 
     651             : /*
     652             :  * Given the user-specified entry size, choose nelem_alloc, ie, how many
     653             :  * elements to add to the hash table when we need more.
     654             :  */
     655             : static int
     656      512758 : choose_nelem_alloc(Size entrysize)
     657             : {
     658             :     int         nelem_alloc;
     659             :     Size        elementSize;
     660             :     Size        allocSize;
     661             : 
     662             :     /* Each element has a HASHELEMENT header plus user data. */
     663             :     /* NB: this had better match element_alloc() */
     664      512758 :     elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
     665             : 
     666             :     /*
     667             :      * The idea here is to choose nelem_alloc at least 32, but round up so
     668             :      * that the allocation request will be a power of 2 or just less. This
     669             :      * makes little difference for hash tables in shared memory, but for hash
     670             :      * tables managed by palloc, the allocation request will be rounded up to
     671             :      * a power of 2 anyway.  If we fail to take this into account, we'll waste
     672             :      * as much as half the allocated space.
     673             :      */
     674      512758 :     allocSize = 32 * 4;         /* assume elementSize at least 8 */
     675             :     do
     676             :     {
     677     2103772 :         allocSize <<= 1;
     678     2103772 :         nelem_alloc = allocSize / elementSize;
     679     2103772 :     } while (nelem_alloc < 32);
     680             : 
     681      512758 :     return nelem_alloc;
     682             : }
     683             : 
     684             : /*
     685             :  * Compute derived fields of hctl and build the initial directory/segment
     686             :  * arrays
     687             :  */
     688             : static bool
     689      480650 : init_htab(HTAB *hashp, long nelem)
     690             : {
     691      480650 :     HASHHDR    *hctl = hashp->hctl;
     692             :     HASHSEGMENT *segp;
     693             :     int         nbuckets;
     694             :     int         nsegs;
     695             :     int         i;
     696             : 
     697             :     /*
     698             :      * initialize mutexes if it's a partitioned table
     699             :      */
     700      480650 :     if (IS_PARTITIONED(hctl))
     701      316470 :         for (i = 0; i < NUM_FREELISTS; i++)
     702      306880 :             SpinLockInit(&(hctl->freeList[i].mutex));
     703             : 
     704             :     /*
     705             :      * Allocate space for the next greater power of two number of buckets,
     706             :      * assuming a desired maximum load factor of 1.
     707             :      */
     708      480650 :     nbuckets = next_pow2_int(nelem);
     709             : 
     710             :     /*
     711             :      * In a partitioned table, nbuckets must be at least equal to
     712             :      * num_partitions; were it less, keys with apparently different partition
     713             :      * numbers would map to the same bucket, breaking partition independence.
     714             :      * (Normally nbuckets will be much bigger; this is just a safety check.)
     715             :      */
     716      480650 :     while (nbuckets < hctl->num_partitions)
     717           0 :         nbuckets <<= 1;
     718             : 
     719      480650 :     hctl->max_bucket = hctl->low_mask = nbuckets - 1;
     720      480650 :     hctl->high_mask = (nbuckets << 1) - 1;
     721             : 
     722             :     /*
     723             :      * Figure number of directory segments needed, round up to a power of 2
     724             :      */
     725      480650 :     nsegs = (nbuckets - 1) / hctl->ssize + 1;
     726      480650 :     nsegs = next_pow2_int(nsegs);
     727             : 
     728             :     /*
     729             :      * Make sure directory is big enough. If pre-allocated directory is too
     730             :      * small, choke (caller screwed up).
     731             :      */
     732      480650 :     if (nsegs > hctl->dsize)
     733             :     {
     734           0 :         if (!(hashp->dir))
     735           0 :             hctl->dsize = nsegs;
     736             :         else
     737           0 :             return false;
     738             :     }
     739             : 
     740             :     /* Allocate a directory */
     741      480650 :     if (!(hashp->dir))
     742             :     {
     743      463374 :         CurrentDynaHashCxt = hashp->hcxt;
     744      463374 :         hashp->dir = (HASHSEGMENT *)
     745      463374 :             hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
     746      463374 :         if (!hashp->dir)
     747           0 :             return false;
     748             :     }
     749             : 
     750             :     /* Allocate initial segments */
     751     1491482 :     for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
     752             :     {
     753     1010832 :         *segp = seg_alloc(hashp);
     754     1010832 :         if (*segp == NULL)
     755           0 :             return false;
     756             :     }
     757             : 
     758             :     /* Choose number of entries to allocate at a time */
     759      480650 :     hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
     760             : 
     761             : #ifdef HASH_DEBUG
     762             :     fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
     763             :             "TABLE POINTER   ", hashp,
     764             :             "DIRECTORY SIZE  ", hctl->dsize,
     765             :             "SEGMENT SIZE    ", hctl->ssize,
     766             :             "SEGMENT SHIFT   ", hctl->sshift,
     767             :             "MAX BUCKET      ", hctl->max_bucket,
     768             :             "HIGH MASK       ", hctl->high_mask,
     769             :             "LOW  MASK       ", hctl->low_mask,
     770             :             "NSEGS           ", hctl->nsegs);
     771             : #endif
     772      480650 :     return true;
     773             : }
     774             : 
     775             : /*
     776             :  * Estimate the space needed for a hashtable containing the given number
     777             :  * of entries of given size.
     778             :  * NOTE: this is used to estimate the footprint of hashtables in shared
     779             :  * memory; therefore it does not count HTAB which is in local memory.
     780             :  * NB: assumes that all hash structure parameters have default values!
     781             :  */
     782             : Size
     783       32108 : hash_estimate_size(long num_entries, Size entrysize)
     784             : {
     785             :     Size        size;
     786             :     long        nBuckets,
     787             :                 nSegments,
     788             :                 nDirEntries,
     789             :                 nElementAllocs,
     790             :                 elementSize,
     791             :                 elementAllocCnt;
     792             : 
     793             :     /* estimate number of buckets wanted */
     794       32108 :     nBuckets = next_pow2_long(num_entries);
     795             :     /* # of segments needed for nBuckets */
     796       32108 :     nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
     797             :     /* directory entries */
     798       32108 :     nDirEntries = DEF_DIRSIZE;
     799       32108 :     while (nDirEntries < nSegments)
     800           0 :         nDirEntries <<= 1;        /* dir_alloc doubles dsize at each call */
     801             : 
     802             :     /* fixed control info */
     803       32108 :     size = MAXALIGN(sizeof(HASHHDR));   /* but not HTAB, per above */
     804             :     /* directory */
     805       32108 :     size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
     806             :     /* segments */
     807       32108 :     size = add_size(size, mul_size(nSegments,
     808             :                                    MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
     809             :     /* elements --- allocated in groups of choose_nelem_alloc() entries */
     810       32108 :     elementAllocCnt = choose_nelem_alloc(entrysize);
     811       32108 :     nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
     812       32108 :     elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
     813       32108 :     size = add_size(size,
     814             :                     mul_size(nElementAllocs,
     815             :                              mul_size(elementAllocCnt, elementSize)));
     816             : 
     817       32108 :     return size;
     818             : }
     819             : 
     820             : /*
     821             :  * Select an appropriate directory size for a hashtable with the given
     822             :  * maximum number of entries.
     823             :  * This is only needed for hashtables in shared memory, whose directories
     824             :  * cannot be expanded dynamically.
     825             :  * NB: assumes that all hash structure parameters have default values!
     826             :  *
     827             :  * XXX this had better agree with the behavior of init_htab()...
     828             :  */
     829             : long
     830       17276 : hash_select_dirsize(long num_entries)
     831             : {
     832             :     long        nBuckets,
     833             :                 nSegments,
     834             :                 nDirEntries;
     835             : 
     836             :     /* estimate number of buckets wanted */
     837       17276 :     nBuckets = next_pow2_long(num_entries);
     838             :     /* # of segments needed for nBuckets */
     839       17276 :     nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
     840             :     /* directory entries */
     841       17276 :     nDirEntries = DEF_DIRSIZE;
     842       17276 :     while (nDirEntries < nSegments)
     843           0 :         nDirEntries <<= 1;        /* dir_alloc doubles dsize at each call */
     844             : 
     845       17276 :     return nDirEntries;
     846             : }
     847             : 
     848             : /*
     849             :  * Compute the required initial memory allocation for a shared-memory
     850             :  * hashtable with the given parameters.  We need space for the HASHHDR
     851             :  * and for the (non expansible) directory.
     852             :  */
     853             : Size
     854       17276 : hash_get_shared_size(HASHCTL *info, int flags)
     855             : {
     856             :     Assert(flags & HASH_DIRSIZE);
     857             :     Assert(info->dsize == info->max_dsize);
     858       17276 :     return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
     859             : }
     860             : 
     861             : 
     862             : /********************** DESTROY ROUTINES ************************/
     863             : 
     864             : void
     865      115388 : hash_destroy(HTAB *hashp)
     866             : {
     867      115388 :     if (hashp != NULL)
     868             :     {
     869             :         /* allocation method must be one we know how to free, too */
     870             :         Assert(hashp->alloc == DynaHashAlloc);
     871             :         /* so this hashtable must have its own context */
     872             :         Assert(hashp->hcxt != NULL);
     873             : 
     874      115388 :         hash_stats("destroy", hashp);
     875             : 
     876             :         /*
     877             :          * Free everything by destroying the hash table's memory context.
     878             :          */
     879      115388 :         MemoryContextDelete(hashp->hcxt);
     880             :     }
     881      115388 : }
     882             : 
     883             : void
     884      115388 : hash_stats(const char *where, HTAB *hashp)
     885             : {
     886             : #ifdef HASH_STATISTICS
     887             :     fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
     888             :             where, hashp->hctl->accesses, hashp->hctl->collisions);
     889             : 
     890             :     fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
     891             :             hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
     892             :             hashp->hctl->max_bucket, hashp->hctl->nsegs);
     893             :     fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
     894             :             where, hash_accesses, hash_collisions);
     895             :     fprintf(stderr, "hash_stats: total expansions %ld\n",
     896             :             hash_expansions);
     897             : #endif
     898      115388 : }
     899             : 
     900             : /*******************************SEARCH ROUTINES *****************************/
     901             : 
     902             : 
     903             : /*
     904             :  * get_hash_value -- exported routine to calculate a key's hash value
     905             :  *
     906             :  * We export this because for partitioned tables, callers need to compute
     907             :  * the partition number (from the low-order bits of the hash value) before
     908             :  * searching.
     909             :  */
     910             : uint32
     911   145501114 : get_hash_value(HTAB *hashp, const void *keyPtr)
     912             : {
     913   145501114 :     return hashp->hash(keyPtr, hashp->keysize);
     914             : }
     915             : 
     916             : /* Convert a hash value to a bucket number */
     917             : static inline uint32
     918   331298154 : calc_bucket(HASHHDR *hctl, uint32 hash_val)
     919             : {
     920             :     uint32      bucket;
     921             : 
     922   331298154 :     bucket = hash_val & hctl->high_mask;
     923   331298154 :     if (bucket > hctl->max_bucket)
     924   155158474 :         bucket = bucket & hctl->low_mask;
     925             : 
     926   331298154 :     return bucket;
     927             : }
     928             : 
     929             : /*
     930             :  * hash_search -- look up key in table and perform action
     931             :  * hash_search_with_hash_value -- same, with key's hash value already computed
     932             :  *
     933             :  * action is one of:
     934             :  *      HASH_FIND: look up key in table
     935             :  *      HASH_ENTER: look up key in table, creating entry if not present
     936             :  *      HASH_ENTER_NULL: same, but return NULL if out of memory
     937             :  *      HASH_REMOVE: look up key in table, remove entry if present
     938             :  *
     939             :  * Return value is a pointer to the element found/entered/removed if any,
     940             :  * or NULL if no match was found.  (NB: in the case of the REMOVE action,
     941             :  * the result is a dangling pointer that shouldn't be dereferenced!)
     942             :  *
     943             :  * HASH_ENTER will normally ereport a generic "out of memory" error if
     944             :  * it is unable to create a new entry.  The HASH_ENTER_NULL operation is
     945             :  * the same except it will return NULL if out of memory.
     946             :  *
     947             :  * If foundPtr isn't NULL, then *foundPtr is set true if we found an
     948             :  * existing entry in the table, false otherwise.  This is needed in the
     949             :  * HASH_ENTER case, but is redundant with the return value otherwise.
     950             :  *
     951             :  * For hash_search_with_hash_value, the hashvalue parameter must have been
     952             :  * calculated with get_hash_value().
     953             :  */
     954             : void *
     955   197687208 : hash_search(HTAB *hashp,
     956             :             const void *keyPtr,
     957             :             HASHACTION action,
     958             :             bool *foundPtr)
     959             : {
     960   197687208 :     return hash_search_with_hash_value(hashp,
     961             :                                        keyPtr,
     962   197687208 :                                        hashp->hash(keyPtr, hashp->keysize),
     963             :                                        action,
     964             :                                        foundPtr);
     965             : }
     966             : 
/*
 * hash_search_with_hash_value -- look up keyPtr in the table and perform
 * "action" (HASH_FIND, HASH_ENTER, HASH_ENTER_NULL or HASH_REMOVE), using a
 * hash value the caller has already computed.
 *
 * Returns a pointer to the key of the element found, entered or removed, or
 * NULL if there was no match (or, for HASH_ENTER_NULL, if no new element
 * could be obtained).  If foundPtr is not NULL, *foundPtr reports whether an
 * existing entry matched.  An unrecognized action raises an error.
 */
void *
hash_search_with_hash_value(HTAB *hashp,
                            const void *keyPtr,
                            uint32 hashvalue,
                            HASHACTION action,
                            bool *foundPtr)
{
    HASHHDR    *hctl = hashp->hctl;
    /* index into hctl->freeList[] used for this hash value */
    int         freelist_idx = FREELIST_IDX(hctl, hashvalue);
    Size        keysize;
    HASHBUCKET  currBucket;
    HASHBUCKET *prevBucketPtr;
    HashCompareFunc match;

#ifdef HASH_STATISTICS
    hash_accesses++;
    hctl->accesses++;
#endif

    /*
     * If inserting, check if it is time to split a bucket.
     *
     * NOTE: failure to expand table is not a fatal error, it just means we
     * have to run at higher fill factor than we wanted.  However, if we're
     * using the palloc allocator then it will throw error anyway on
     * out-of-memory, so we must do this before modifying the table.
     */
    if (action == HASH_ENTER || action == HASH_ENTER_NULL)
    {
        /*
         * Can't split if running in partitioned mode, nor if frozen, nor if
         * table is the subject of any active hash_seq_search scans.
         */
        if (hctl->freeList[0].nentries > (long) hctl->max_bucket &&
            !IS_PARTITIONED(hctl) && !hashp->frozen &&
            !has_seq_scans(hashp))
            (void) expand_table(hashp);
    }

    /*
     * Do the initial lookup.  Only the prevBucketPtr output is used here;
     * the function's return value is deliberately ignored.
     */
    (void) hash_initial_lookup(hashp, hashvalue, &prevBucketPtr);
    currBucket = *prevBucketPtr;

    /*
     * Follow collision chain looking for matching key
     */
    match = hashp->match;        /* save one fetch in inner loop */
    keysize = hashp->keysize;    /* ditto */

    while (currBucket != NULL)
    {
        /* compare stored hash values first; call match() only if equal */
        if (currBucket->hashvalue == hashvalue &&
            match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
            break;
        /* remember the link we came through so it can be rewritten later */
        prevBucketPtr = &(currBucket->link);
        currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
        hash_collisions++;
        hctl->collisions++;
#endif
    }

    /*
     * Here currBucket is the matching element, or NULL if the chain held no
     * match; in the latter case prevBucketPtr addresses the chain's
     * terminating link.
     */
    if (foundPtr)
        *foundPtr = (bool) (currBucket != NULL);

    /*
     * OK, now what?
     */
    switch (action)
    {
        case HASH_FIND:
            if (currBucket != NULL)
                return ELEMENTKEY(currBucket);
            return NULL;

        case HASH_REMOVE:
            if (currBucket != NULL)
            {
                /* if partitioned, must lock to touch nentries and freeList */
                if (IS_PARTITIONED(hctl))
                    SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));

                /* delete the record from the appropriate nentries counter. */
                Assert(hctl->freeList[freelist_idx].nentries > 0);
                hctl->freeList[freelist_idx].nentries--;

                /* remove record from hash bucket's chain. */
                *prevBucketPtr = currBucket->link;

                /* add the record to the appropriate freelist. */
                currBucket->link = hctl->freeList[freelist_idx].freeList;
                hctl->freeList[freelist_idx].freeList = currBucket;

                if (IS_PARTITIONED(hctl))
                    SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

                /*
                 * better hope the caller is synchronizing access to this
                 * element, because someone else is going to reuse it the next
                 * time something is added to the table
                 */
                return ELEMENTKEY(currBucket);
            }
            return NULL;

        case HASH_ENTER:
        case HASH_ENTER_NULL:
            /* Return existing element if found, else create one */
            if (currBucket != NULL)
                return ELEMENTKEY(currBucket);

            /* disallow inserts if frozen */
            if (hashp->frozen)
                elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
                     hashp->tabname);

            currBucket = get_hash_entry(hashp, freelist_idx);
            if (currBucket == NULL)
            {
                /* out of memory */
                if (action == HASH_ENTER_NULL)
                    return NULL;
                /* report a generic message */
                if (hashp->isshared)
                    ereport(ERROR,
                            (errcode(ERRCODE_OUT_OF_MEMORY),
                             errmsg("out of shared memory")));
                else
                    ereport(ERROR,
                            (errcode(ERRCODE_OUT_OF_MEMORY),
                             errmsg("out of memory")));
            }

            /* link into hashbucket chain (at the end: we walked it all) */
            *prevBucketPtr = currBucket;
            currBucket->link = NULL;

            /* copy key into record */
            currBucket->hashvalue = hashvalue;
            hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);

            /*
             * Caller is expected to fill the data field on return.  DO NOT
             * insert any code that could possibly throw error here, as doing
             * so would leave the table entry incomplete and hence corrupt the
             * caller's data structure.
             */

            return ELEMENTKEY(currBucket);
    }

    /* reached only when action matched none of the cases above */
    elog(ERROR, "unrecognized hash action code: %d", (int) action);

    return NULL;                /* keep compiler quiet */
}
    1124             : 
    1125             : /*
    1126             :  * hash_update_hash_key -- change the hash key of an existing table entry
    1127             :  *
    1128             :  * This is equivalent to removing the entry, making a new entry, and copying
    1129             :  * over its data, except that the entry never goes to the table's freelist.
    1130             :  * Therefore this cannot suffer an out-of-memory failure, even if there are
    1131             :  * other processes operating in other partitions of the hashtable.
    1132             :  *
    1133             :  * Returns true if successful, false if the requested new hash key is already
    1134             :  * present.  Throws error if the specified entry pointer isn't actually a
    1135             :  * table member.
    1136             :  *
    1137             :  * NB: currently, there is no special case for old and new hash keys being
    1138             :  * identical, which means we'll report false for that situation.  This is
    1139             :  * preferable for existing uses.
    1140             :  *
    1141             :  * NB: for a partitioned hashtable, caller must hold lock on both relevant
    1142             :  * partitions, if the new hash key would belong to a different partition.
    1143             :  */
    1144             : bool
    1145        1718 : hash_update_hash_key(HTAB *hashp,
    1146             :                      void *existingEntry,
    1147             :                      const void *newKeyPtr)
    1148             : {
    1149        1718 :     HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
    1150             :     uint32      newhashvalue;
    1151             :     Size        keysize;
    1152             :     uint32      bucket;
    1153             :     uint32      newbucket;
    1154             :     HASHBUCKET  currBucket;
    1155             :     HASHBUCKET *prevBucketPtr;
    1156             :     HASHBUCKET *oldPrevPtr;
    1157             :     HashCompareFunc match;
    1158             : 
    1159             : #ifdef HASH_STATISTICS
    1160             :     hash_accesses++;
    1161             :     hctl->accesses++;
    1162             : #endif
    1163             : 
    1164             :     /* disallow updates if frozen */
    1165        1718 :     if (hashp->frozen)
    1166           0 :         elog(ERROR, "cannot update in frozen hashtable \"%s\"",
    1167             :              hashp->tabname);
    1168             : 
    1169             :     /*
    1170             :      * Lookup the existing element using its saved hash value.  We need to do
    1171             :      * this to be able to unlink it from its hash chain, but as a side benefit
    1172             :      * we can verify the validity of the passed existingEntry pointer.
    1173             :      */
    1174        1718 :     bucket = hash_initial_lookup(hashp, existingElement->hashvalue,
    1175             :                                  &prevBucketPtr);
    1176        1718 :     currBucket = *prevBucketPtr;
    1177             : 
    1178        1718 :     while (currBucket != NULL)
    1179             :     {
    1180        1718 :         if (currBucket == existingElement)
    1181        1718 :             break;
    1182           0 :         prevBucketPtr = &(currBucket->link);
    1183           0 :         currBucket = *prevBucketPtr;
    1184             :     }
    1185             : 
    1186        1718 :     if (currBucket == NULL)
    1187           0 :         elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
    1188             :              hashp->tabname);
    1189             : 
    1190        1718 :     oldPrevPtr = prevBucketPtr;
    1191             : 
    1192             :     /*
    1193             :      * Now perform the equivalent of a HASH_ENTER operation to locate the hash
    1194             :      * chain we want to put the entry into.
    1195             :      */
    1196        1718 :     newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);
    1197        1718 :     newbucket = hash_initial_lookup(hashp, newhashvalue, &prevBucketPtr);
    1198        1718 :     currBucket = *prevBucketPtr;
    1199             : 
    1200             :     /*
    1201             :      * Follow collision chain looking for matching key
    1202             :      */
    1203        1718 :     match = hashp->match;        /* save one fetch in inner loop */
    1204        1718 :     keysize = hashp->keysize;    /* ditto */
    1205             : 
    1206        1818 :     while (currBucket != NULL)
    1207             :     {
    1208         100 :         if (currBucket->hashvalue == newhashvalue &&
    1209           0 :             match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
    1210           0 :             break;
    1211         100 :         prevBucketPtr = &(currBucket->link);
    1212         100 :         currBucket = *prevBucketPtr;
    1213             : #ifdef HASH_STATISTICS
    1214             :         hash_collisions++;
    1215             :         hctl->collisions++;
    1216             : #endif
    1217             :     }
    1218             : 
    1219        1718 :     if (currBucket != NULL)
    1220           0 :         return false;           /* collision with an existing entry */
    1221             : 
    1222        1718 :     currBucket = existingElement;
    1223             : 
    1224             :     /*
    1225             :      * If old and new hash values belong to the same bucket, we need not
    1226             :      * change any chain links, and indeed should not since this simplistic
    1227             :      * update will corrupt the list if currBucket is the last element.  (We
    1228             :      * cannot fall out earlier, however, since we need to scan the bucket to
    1229             :      * check for duplicate keys.)
    1230             :      */
    1231        1718 :     if (bucket != newbucket)
    1232             :     {
    1233             :         /* OK to remove record from old hash bucket's chain. */
    1234        1620 :         *oldPrevPtr = currBucket->link;
    1235             : 
    1236             :         /* link into new hashbucket chain */
    1237        1620 :         *prevBucketPtr = currBucket;
    1238        1620 :         currBucket->link = NULL;
    1239             :     }
    1240             : 
    1241             :     /* copy new key into record */
    1242        1718 :     currBucket->hashvalue = newhashvalue;
    1243        1718 :     hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);
    1244             : 
    1245             :     /* rest of record is untouched */
    1246             : 
    1247        1718 :     return true;
    1248             : }
    1249             : 
    1250             : /*
    1251             :  * Allocate a new hashtable entry if possible; return NULL if out of memory.
    1252             :  * (Or, if the underlying space allocator throws error for out-of-memory,
    1253             :  * we won't return at all.)
    1254             :  */
    1255             : static HASHBUCKET
    1256    53324462 : get_hash_entry(HTAB *hashp, int freelist_idx)
    1257             : {
    1258    53324462 :     HASHHDR    *hctl = hashp->hctl;
    1259             :     HASHBUCKET  newElement;
    1260             : 
    1261             :     for (;;)
    1262             :     {
    1263             :         /* if partitioned, must lock to touch nentries and freeList */
    1264    53744752 :         if (IS_PARTITIONED(hctl))
    1265    10764516 :             SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
    1266             : 
    1267             :         /* try to get an entry from the freelist */
    1268    53744752 :         newElement = hctl->freeList[freelist_idx].freeList;
    1269             : 
    1270    53744752 :         if (newElement != NULL)
    1271    53324462 :             break;
    1272             : 
    1273      420290 :         if (IS_PARTITIONED(hctl))
    1274        2808 :             SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
    1275             : 
    1276             :         /*
    1277             :          * No free elements in this freelist.  In a partitioned table, there
    1278             :          * might be entries in other freelists, but to reduce contention we
    1279             :          * prefer to first try to get another chunk of buckets from the main
    1280             :          * shmem allocator.  If that fails, though, we *MUST* root through all
    1281             :          * the other freelists before giving up.  There are multiple callers
    1282             :          * that assume that they can allocate every element in the initially
    1283             :          * requested table size, or that deleting an element guarantees they
    1284             :          * can insert a new element, even if shared memory is entirely full.
    1285             :          * Failing because the needed element is in a different freelist is
    1286             :          * not acceptable.
    1287             :          */
    1288      420290 :         if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
    1289             :         {
    1290             :             int         borrow_from_idx;
    1291             : 
    1292           0 :             if (!IS_PARTITIONED(hctl))
    1293           0 :                 return NULL;    /* out of memory */
    1294             : 
    1295             :             /* try to borrow element from another freelist */
    1296           0 :             borrow_from_idx = freelist_idx;
    1297             :             for (;;)
    1298             :             {
    1299           0 :                 borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
    1300           0 :                 if (borrow_from_idx == freelist_idx)
    1301           0 :                     break;      /* examined all freelists, fail */
    1302             : 
    1303           0 :                 SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
    1304           0 :                 newElement = hctl->freeList[borrow_from_idx].freeList;
    1305             : 
    1306           0 :                 if (newElement != NULL)
    1307             :                 {
    1308           0 :                     hctl->freeList[borrow_from_idx].freeList = newElement->link;
    1309           0 :                     SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
    1310             : 
    1311             :                     /* careful: count the new element in its proper freelist */
    1312           0 :                     SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
    1313           0 :                     hctl->freeList[freelist_idx].nentries++;
    1314           0 :                     SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
    1315             : 
    1316           0 :                     return newElement;
    1317             :                 }
    1318             : 
    1319           0 :                 SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
    1320             :             }
    1321             : 
    1322             :             /* no elements available to borrow either, so out of memory */
    1323           0 :             return NULL;
    1324             :         }
    1325             :     }
    1326             : 
    1327             :     /* remove entry from freelist, bump nentries */
    1328    53324462 :     hctl->freeList[freelist_idx].freeList = newElement->link;
    1329    53324462 :     hctl->freeList[freelist_idx].nentries++;
    1330             : 
    1331    53324462 :     if (IS_PARTITIONED(hctl))
    1332    10761708 :         SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
    1333             : 
    1334    53324462 :     return newElement;
    1335             : }
    1336             : 
    1337             : /*
    1338             :  * hash_get_num_entries -- get the number of entries in a hashtable
    1339             :  */
    1340             : long
    1341      116548 : hash_get_num_entries(HTAB *hashp)
    1342             : {
    1343             :     int         i;
    1344      116548 :     long        sum = hashp->hctl->freeList[0].nentries;
    1345             : 
    1346             :     /*
    1347             :      * We currently don't bother with acquiring the mutexes; it's only
    1348             :      * sensible to call this function if you've got lock on all partitions of
    1349             :      * the table.
    1350             :      */
    1351      116548 :     if (IS_PARTITIONED(hashp->hctl))
    1352             :     {
    1353       90304 :         for (i = 1; i < NUM_FREELISTS; i++)
    1354       87482 :             sum += hashp->hctl->freeList[i].nentries;
    1355             :     }
    1356             : 
    1357      116548 :     return sum;
    1358             : }
    1359             : 
    1360             : /*
    1361             :  * hash_seq_init/_search/_term
    1362             :  *          Sequentially search through hash table and return
    1363             :  *          all the elements one by one, return NULL when no more.
    1364             :  *
    1365             :  * hash_seq_term should be called if and only if the scan is abandoned before
    1366             :  * completion; if hash_seq_search returns NULL then it has already done the
    1367             :  * end-of-scan cleanup.
    1368             :  *
    1369             :  * NOTE: caller may delete the returned element before continuing the scan.
    1370             :  * However, deleting any other element while the scan is in progress is
    1371             :  * UNDEFINED (it might be the one that curIndex is pointing at!).  Also,
    1372             :  * if elements are added to the table while the scan is in progress, it is
    1373             :  * unspecified whether they will be visited by the scan or not.
    1374             :  *
    1375             :  * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
    1376             :  * worry about hash_seq_term cleanup, if the hashtable is first locked against
    1377             :  * further insertions by calling hash_freeze.
    1378             :  *
    1379             :  * NOTE: to use this with a partitioned hashtable, caller had better hold
    1380             :  * at least shared lock on all partitions of the table throughout the scan!
    1381             :  * We can cope with insertions or deletions by our own backend, but *not*
    1382             :  * with concurrent insertions or deletions by another.
    1383             :  */
    1384             : void
    1385     4128704 : hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
    1386             : {
    1387     4128704 :     status->hashp = hashp;
    1388     4128704 :     status->curBucket = 0;
    1389     4128704 :     status->curEntry = NULL;
    1390     4128704 :     status->hasHashvalue = false;
    1391     4128704 :     if (!hashp->frozen)
    1392     4128704 :         register_seq_scan(hashp);
    1393     4128704 : }
    1394             : 
    1395             : /*
    1396             :  * Same as above but scan by the given hash value.
    1397             :  * See also hash_seq_search().
    1398             :  *
    1399             :  * NOTE: the default hash function doesn't match syscache hash function.
    1400             :  * Thus, if you're going to use this function in syscache callback, make sure
    1401             :  * you're using custom hash function.  See relatt_cache_syshash()
    1402             :  * for example.
    1403             :  */
    1404             : void
    1405     1375968 : hash_seq_init_with_hash_value(HASH_SEQ_STATUS *status, HTAB *hashp,
    1406             :                               uint32 hashvalue)
    1407             : {
    1408             :     HASHBUCKET *bucketPtr;
    1409             : 
    1410     1375968 :     hash_seq_init(status, hashp);
    1411             : 
    1412     1375968 :     status->hasHashvalue = true;
    1413     1375968 :     status->hashvalue = hashvalue;
    1414             : 
    1415     1375968 :     status->curBucket = hash_initial_lookup(hashp, hashvalue, &bucketPtr);
    1416     1375968 :     status->curEntry = *bucketPtr;
    1417     1375968 : }
    1418             : 
    1419             : void *
    1420    48805198 : hash_seq_search(HASH_SEQ_STATUS *status)
    1421             : {
    1422             :     HTAB       *hashp;
    1423             :     HASHHDR    *hctl;
    1424             :     uint32      max_bucket;
    1425             :     long        ssize;
    1426             :     long        segment_num;
    1427             :     long        segment_ndx;
    1428             :     HASHSEGMENT segp;
    1429             :     uint32      curBucket;
    1430             :     HASHELEMENT *curElem;
    1431             : 
    1432    48805198 :     if (status->hasHashvalue)
    1433             :     {
    1434             :         /*
    1435             :          * Scan entries only in the current bucket because only this bucket
    1436             :          * can contain entries with the given hash value.
    1437             :          */
    1438     1567250 :         while ((curElem = status->curEntry) != NULL)
    1439             :         {
    1440      191282 :             status->curEntry = curElem->link;
    1441      191282 :             if (status->hashvalue != curElem->hashvalue)
    1442      182616 :                 continue;
    1443        8666 :             return (void *) ELEMENTKEY(curElem);
    1444             :         }
    1445             : 
    1446     1375968 :         hash_seq_term(status);
    1447     1375968 :         return NULL;
    1448             :     }
    1449             : 
    1450    47420564 :     if ((curElem = status->curEntry) != NULL)
    1451             :     {
    1452             :         /* Continuing scan of curBucket... */
    1453    13847650 :         status->curEntry = curElem->link;
    1454    13847650 :         if (status->curEntry == NULL)    /* end of this bucket */
    1455     9709228 :             ++status->curBucket;
    1456    13847650 :         return ELEMENTKEY(curElem);
    1457             :     }
    1458             : 
    1459             :     /*
    1460             :      * Search for next nonempty bucket starting at curBucket.
    1461             :      */
    1462    33572914 :     curBucket = status->curBucket;
    1463    33572914 :     hashp = status->hashp;
    1464    33572914 :     hctl = hashp->hctl;
    1465    33572914 :     ssize = hashp->ssize;
    1466    33572914 :     max_bucket = hctl->max_bucket;
    1467             : 
    1468    33572914 :     if (curBucket > max_bucket)
    1469             :     {
    1470      100580 :         hash_seq_term(status);
    1471      100580 :         return NULL;            /* search is done */
    1472             :     }
    1473             : 
    1474             :     /*
    1475             :      * first find the right segment in the table directory.
    1476             :      */
    1477    33472334 :     segment_num = curBucket >> hashp->sshift;
    1478    33472334 :     segment_ndx = MOD(curBucket, ssize);
    1479             : 
    1480    33472334 :     segp = hashp->dir[segment_num];
    1481             : 
    1482             :     /*
    1483             :      * Pick up the first item in this bucket's chain.  If chain is not empty
    1484             :      * we can begin searching it.  Otherwise we have to advance to find the
    1485             :      * next nonempty bucket.  We try to optimize that case since searching a
    1486             :      * near-empty hashtable has to iterate this loop a lot.
    1487             :      */
    1488   165780114 :     while ((curElem = segp[segment_ndx]) == NULL)
    1489             :     {
    1490             :         /* empty bucket, advance to next */
    1491   134927960 :         if (++curBucket > max_bucket)
    1492             :         {
    1493     2620180 :             status->curBucket = curBucket;
    1494     2620180 :             hash_seq_term(status);
    1495     2620180 :             return NULL;        /* search is done */
    1496             :         }
    1497   132307780 :         if (++segment_ndx >= ssize)
    1498             :         {
    1499      240400 :             segment_num++;
    1500      240400 :             segment_ndx = 0;
    1501      240400 :             segp = hashp->dir[segment_num];
    1502             :         }
    1503             :     }
    1504             : 
    1505             :     /* Begin scan of curBucket... */
    1506    30852154 :     status->curEntry = curElem->link;
    1507    30852154 :     if (status->curEntry == NULL)    /* end of this bucket */
    1508    21142594 :         ++curBucket;
    1509    30852154 :     status->curBucket = curBucket;
    1510    30852154 :     return ELEMENTKEY(curElem);
    1511             : }
    1512             : 
    1513             : void
    1514     4128684 : hash_seq_term(HASH_SEQ_STATUS *status)
    1515             : {
    1516     4128684 :     if (!status->hashp->frozen)
    1517     4128684 :         deregister_seq_scan(status->hashp);
    1518     4128684 : }
    1519             : 
    1520             : /*
    1521             :  * hash_freeze
    1522             :  *          Freeze a hashtable against future insertions (deletions are
    1523             :  *          still allowed)
    1524             :  *
    1525             :  * The reason for doing this is that by preventing any more bucket splits,
    1526             :  * we no longer need to worry about registering hash_seq_search scans,
    1527             :  * and thus caller need not be careful about ensuring hash_seq_term gets
    1528             :  * called at the right times.
    1529             :  *
    1530             :  * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
    1531             :  * with active scans (since hash_seq_term would then do the wrong thing).
    1532             :  */
    1533             : void
    1534           0 : hash_freeze(HTAB *hashp)
    1535             : {
    1536           0 :     if (hashp->isshared)
    1537           0 :         elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
    1538           0 :     if (!hashp->frozen && has_seq_scans(hashp))
    1539           0 :         elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
    1540             :              hashp->tabname);
    1541           0 :     hashp->frozen = true;
    1542           0 : }
    1543             : 
    1544             : 
    1545             : /********************************* UTILITIES ************************/
    1546             : 
    1547             : /*
    1548             :  * Expand the table by adding one more hash bucket.
    1549             :  */
    1550             : static bool
    1551      682184 : expand_table(HTAB *hashp)
    1552             : {
    1553      682184 :     HASHHDR    *hctl = hashp->hctl;
    1554             :     HASHSEGMENT old_seg,
    1555             :                 new_seg;
    1556             :     long        old_bucket,
    1557             :                 new_bucket;
    1558             :     long        new_segnum,
    1559             :                 new_segndx;
    1560             :     long        old_segnum,
    1561             :                 old_segndx;
    1562             :     HASHBUCKET *oldlink,
    1563             :                *newlink;
    1564             :     HASHBUCKET  currElement,
    1565             :                 nextElement;
    1566             : 
    1567             :     Assert(!IS_PARTITIONED(hctl));
    1568             : 
    1569             : #ifdef HASH_STATISTICS
    1570             :     hash_expansions++;
    1571             : #endif
    1572             : 
    1573      682184 :     new_bucket = hctl->max_bucket + 1;
    1574      682184 :     new_segnum = new_bucket >> hashp->sshift;
    1575      682184 :     new_segndx = MOD(new_bucket, hashp->ssize);
    1576             : 
    1577      682184 :     if (new_segnum >= hctl->nsegs)
    1578             :     {
    1579             :         /* Allocate new segment if necessary -- could fail if dir full */
    1580        2304 :         if (new_segnum >= hctl->dsize)
    1581           0 :             if (!dir_realloc(hashp))
    1582           0 :                 return false;
    1583        2304 :         if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
    1584           0 :             return false;
    1585        2304 :         hctl->nsegs++;
    1586             :     }
    1587             : 
    1588             :     /* OK, we created a new bucket */
    1589      682184 :     hctl->max_bucket++;
    1590             : 
    1591             :     /*
    1592             :      * *Before* changing masks, find old bucket corresponding to same hash
    1593             :      * values; values in that bucket may need to be relocated to new bucket.
    1594             :      * Note that new_bucket is certainly larger than low_mask at this point,
    1595             :      * so we can skip the first step of the regular hash mask calc.
    1596             :      */
    1597      682184 :     old_bucket = (new_bucket & hctl->low_mask);
    1598             : 
    1599             :     /*
    1600             :      * If we crossed a power of 2, readjust masks.
    1601             :      */
    1602      682184 :     if ((uint32) new_bucket > hctl->high_mask)
    1603             :     {
    1604        4106 :         hctl->low_mask = hctl->high_mask;
    1605        4106 :         hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
    1606             :     }
    1607             : 
    1608             :     /*
    1609             :      * Relocate records to the new bucket.  NOTE: because of the way the hash
    1610             :      * masking is done in calc_bucket, only one old bucket can need to be
    1611             :      * split at this point.  With a different way of reducing the hash value,
    1612             :      * that might not be true!
    1613             :      */
    1614      682184 :     old_segnum = old_bucket >> hashp->sshift;
    1615      682184 :     old_segndx = MOD(old_bucket, hashp->ssize);
    1616             : 
    1617      682184 :     old_seg = hashp->dir[old_segnum];
    1618      682184 :     new_seg = hashp->dir[new_segnum];
    1619             : 
    1620      682184 :     oldlink = &old_seg[old_segndx];
    1621      682184 :     newlink = &new_seg[new_segndx];
    1622             : 
    1623     1649496 :     for (currElement = *oldlink;
    1624             :          currElement != NULL;
    1625      967312 :          currElement = nextElement)
    1626             :     {
    1627      967312 :         nextElement = currElement->link;
    1628      967312 :         if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
    1629             :         {
    1630      481792 :             *oldlink = currElement;
    1631      481792 :             oldlink = &currElement->link;
    1632             :         }
    1633             :         else
    1634             :         {
    1635      485520 :             *newlink = currElement;
    1636      485520 :             newlink = &currElement->link;
    1637             :         }
    1638             :     }
    1639             :     /* don't forget to terminate the rebuilt hash chains... */
    1640      682184 :     *oldlink = NULL;
    1641      682184 :     *newlink = NULL;
    1642             : 
    1643      682184 :     return true;
    1644             : }
    1645             : 
    1646             : 
    1647             : static bool
    1648           0 : dir_realloc(HTAB *hashp)
    1649             : {
    1650             :     HASHSEGMENT *p;
    1651             :     HASHSEGMENT *old_p;
    1652             :     long        new_dsize;
    1653             :     long        old_dirsize;
    1654             :     long        new_dirsize;
    1655             : 
    1656           0 :     if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
    1657           0 :         return false;
    1658             : 
    1659             :     /* Reallocate directory */
    1660           0 :     new_dsize = hashp->hctl->dsize << 1;
    1661           0 :     old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
    1662           0 :     new_dirsize = new_dsize * sizeof(HASHSEGMENT);
    1663             : 
    1664           0 :     old_p = hashp->dir;
    1665           0 :     CurrentDynaHashCxt = hashp->hcxt;
    1666           0 :     p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);
    1667             : 
    1668           0 :     if (p != NULL)
    1669             :     {
    1670           0 :         memcpy(p, old_p, old_dirsize);
    1671           0 :         MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
    1672           0 :         hashp->dir = p;
    1673           0 :         hashp->hctl->dsize = new_dsize;
    1674             : 
    1675             :         /* XXX assume the allocator is palloc, so we know how to free */
    1676             :         Assert(hashp->alloc == DynaHashAlloc);
    1677           0 :         pfree(old_p);
    1678             : 
    1679           0 :         return true;
    1680             :     }
    1681             : 
    1682           0 :     return false;
    1683             : }
    1684             : 
    1685             : 
    1686             : static HASHSEGMENT
    1687     1013136 : seg_alloc(HTAB *hashp)
    1688             : {
    1689             :     HASHSEGMENT segp;
    1690             : 
    1691     1013136 :     CurrentDynaHashCxt = hashp->hcxt;
    1692     1013136 :     segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
    1693             : 
    1694     1013136 :     if (!segp)
    1695           0 :         return NULL;
    1696             : 
    1697     1013136 :     MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
    1698             : 
    1699     1013136 :     return segp;
    1700             : }
    1701             : 
    1702             : /*
    1703             :  * allocate some new elements and link them into the indicated free list
    1704             :  */
    1705             : static bool
    1706      939038 : element_alloc(HTAB *hashp, int nelem, int freelist_idx)
    1707             : {
    1708      939038 :     HASHHDR    *hctl = hashp->hctl;
    1709             :     Size        elementSize;
    1710             :     HASHELEMENT *firstElement;
    1711             :     HASHELEMENT *tmpElement;
    1712             :     HASHELEMENT *prevElement;
    1713             :     int         i;
    1714             : 
    1715      939038 :     if (hashp->isfixed)
    1716           0 :         return false;
    1717             : 
    1718             :     /* Each element has a HASHELEMENT header plus user data. */
    1719      939038 :     elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
    1720             : 
    1721      939038 :     CurrentDynaHashCxt = hashp->hcxt;
    1722      939038 :     firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
    1723             : 
    1724      939038 :     if (!firstElement)
    1725           0 :         return false;
    1726             : 
    1727             :     /* prepare to link all the new entries into the freelist */
    1728      939038 :     prevElement = NULL;
    1729      939038 :     tmpElement = firstElement;
    1730    91248156 :     for (i = 0; i < nelem; i++)
    1731             :     {
    1732    90309118 :         tmpElement->link = prevElement;
    1733    90309118 :         prevElement = tmpElement;
    1734    90309118 :         tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
    1735             :     }
    1736             : 
    1737             :     /* if partitioned, must lock to touch freeList */
    1738      939038 :     if (IS_PARTITIONED(hctl))
    1739      309688 :         SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
    1740             : 
    1741             :     /* freelist could be nonempty if two backends did this concurrently */
    1742      939038 :     firstElement->link = hctl->freeList[freelist_idx].freeList;
    1743      939038 :     hctl->freeList[freelist_idx].freeList = prevElement;
    1744             : 
    1745      939038 :     if (IS_PARTITIONED(hctl))
    1746      309688 :         SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
    1747             : 
    1748      939038 :     return true;
    1749             : }
    1750             : 
    1751             : /*
    1752             :  * Do initial lookup of a bucket for the given hash value, retrieving its
    1753             :  * bucket number and its hash bucket.
    1754             :  */
    1755             : static inline uint32
    1756   330330842 : hash_initial_lookup(HTAB *hashp, uint32 hashvalue, HASHBUCKET **bucketptr)
    1757             : {
    1758   330330842 :     HASHHDR    *hctl = hashp->hctl;
    1759             :     HASHSEGMENT segp;
    1760             :     long        segment_num;
    1761             :     long        segment_ndx;
    1762             :     uint32      bucket;
    1763             : 
    1764   330330842 :     bucket = calc_bucket(hctl, hashvalue);
    1765             : 
    1766   330330842 :     segment_num = bucket >> hashp->sshift;
    1767   330330842 :     segment_ndx = MOD(bucket, hashp->ssize);
    1768             : 
    1769   330330842 :     segp = hashp->dir[segment_num];
    1770             : 
    1771   330330842 :     if (segp == NULL)
    1772           0 :         hash_corrupted(hashp);
    1773             : 
    1774   330330842 :     *bucketptr = &segp[segment_ndx];
    1775   330330842 :     return bucket;
    1776             : }
    1777             : 
    1778             : /* complain when we have detected a corrupted hashtable */
    1779             : static void
    1780           0 : hash_corrupted(HTAB *hashp)
    1781             : {
    1782             :     /*
    1783             :      * If the corruption is in a shared hashtable, we'd better force a
    1784             :      * systemwide restart.  Otherwise, just shut down this one backend.
    1785             :      */
    1786           0 :     if (hashp->isshared)
    1787           0 :         elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
    1788             :     else
    1789           0 :         elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
    1790             : }
    1791             : 
    1792             : /* calculate ceil(log base 2) of num */
    1793             : int
    1794     1096512 : my_log2(long num)
    1795             : {
    1796             :     /*
    1797             :      * guard against too-large input, which would be invalid for
    1798             :      * pg_ceil_log2_*()
    1799             :      */
    1800     1096512 :     if (num > LONG_MAX / 2)
    1801           0 :         num = LONG_MAX / 2;
    1802             : 
    1803             : #if SIZEOF_LONG < 8
    1804             :     return pg_ceil_log2_32(num);
    1805             : #else
    1806     1096512 :     return pg_ceil_log2_64(num);
    1807             : #endif
    1808             : }
    1809             : 
    1810             : /* calculate first power of 2 >= num, bounded to what will fit in a long */
    1811             : static long
    1812       98768 : next_pow2_long(long num)
    1813             : {
    1814             :     /* my_log2's internal range check is sufficient */
    1815       98768 :     return 1L << my_log2(num);
    1816             : }
    1817             : 
    1818             : /* calculate first power of 2 >= num, bounded to what will fit in an int */
    1819             : static int
    1820      961300 : next_pow2_int(long num)
    1821             : {
    1822      961300 :     if (num > INT_MAX / 2)
    1823           0 :         num = INT_MAX / 2;
    1824      961300 :     return 1 << my_log2(num);
    1825             : }
    1826             : 
    1827             : 
    1828             : /************************* SEQ SCAN TRACKING ************************/
    1829             : 
    1830             : /*
    1831             :  * We track active hash_seq_search scans here.  The need for this mechanism
    1832             :  * comes from the fact that a scan will get confused if a bucket split occurs
    1833             :  * while it's in progress: it might visit entries twice, or even miss some
    1834             :  * entirely (if it's partway through the same bucket that splits).  Hence
    1835             :  * we want to inhibit bucket splits if there are any active scans on the
    1836             :  * table being inserted into.  This is a fairly rare case in current usage,
    1837             :  * so just postponing the split until the next insertion seems sufficient.
    1838             :  *
    1839             :  * Given present usages of the function, only a few scans are likely to be
    1840             :  * open concurrently; so a finite-size stack of open scans seems sufficient,
    1841             :  * and we don't worry that linear search is too slow.  Note that we do
    1842             :  * allow multiple scans of the same hashtable to be open concurrently.
    1843             :  *
    1844             :  * This mechanism can support concurrent scan and insertion in a shared
    1845             :  * hashtable if it's the same backend doing both.  It would fail otherwise,
    1846             :  * but locking reasons seem to preclude any such scenario anyway, so we don't
    1847             :  * worry.
    1848             :  *
    1849             :  * This arrangement is reasonably robust if a transient hashtable is deleted
    1850             :  * without notifying us.  The absolute worst case is we might inhibit splits
    1851             :  * in another table created later at exactly the same address.  We will give
    1852             :  * a warning at transaction end for reference leaks, so any bugs leading to
    1853             :  * lack of notification should be easy to catch.
    1854             :  */
    1855             : 
    1856             : #define MAX_SEQ_SCANS 100
    1857             : 
    1858             : static HTAB *seq_scan_tables[MAX_SEQ_SCANS];    /* tables being scanned */
    1859             : static int  seq_scan_level[MAX_SEQ_SCANS];  /* subtransaction nest level */
    1860             : static int  num_seq_scans = 0;
    1861             : 
    1862             : 
    1863             : /* Register a table as having an active hash_seq_search scan */
    1864             : static void
    1865     4128704 : register_seq_scan(HTAB *hashp)
    1866             : {
    1867     4128704 :     if (num_seq_scans >= MAX_SEQ_SCANS)
    1868           0 :         elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
    1869             :              hashp->tabname);
    1870     4128704 :     seq_scan_tables[num_seq_scans] = hashp;
    1871     4128704 :     seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
    1872     4128704 :     num_seq_scans++;
    1873     4128704 : }
    1874             : 
    1875             : /* Deregister an active scan */
    1876             : static void
    1877     4128684 : deregister_seq_scan(HTAB *hashp)
    1878             : {
    1879             :     int         i;
    1880             : 
    1881             :     /* Search backward since it's most likely at the stack top */
    1882     4128684 :     for (i = num_seq_scans - 1; i >= 0; i--)
    1883             :     {
    1884     4128684 :         if (seq_scan_tables[i] == hashp)
    1885             :         {
    1886     4128684 :             seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
    1887     4128684 :             seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
    1888     4128684 :             num_seq_scans--;
    1889     4128684 :             return;
    1890             :         }
    1891             :     }
    1892           0 :     elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
    1893             :          hashp->tabname);
    1894             : }
    1895             : 
    1896             : /* Check if a table has any active scan */
    1897             : static bool
    1898      682184 : has_seq_scans(HTAB *hashp)
    1899             : {
    1900             :     int         i;
    1901             : 
    1902      682186 :     for (i = 0; i < num_seq_scans; i++)
    1903             :     {
    1904           2 :         if (seq_scan_tables[i] == hashp)
    1905           0 :             return true;
    1906             :     }
    1907      682184 :     return false;
    1908             : }
    1909             : 
    1910             : /* Clean up any open scans at end of transaction */
    1911             : void
    1912      787404 : AtEOXact_HashTables(bool isCommit)
    1913             : {
    1914             :     /*
    1915             :      * During abort cleanup, open scans are expected; just silently clean 'em
    1916             :      * out.  An open scan at commit means someone forgot a hash_seq_term()
    1917             :      * call, so complain.
    1918             :      *
    1919             :      * Note: it's tempting to try to print the tabname here, but refrain for
    1920             :      * fear of touching deallocated memory.  This isn't a user-facing message
    1921             :      * anyway, so it needn't be pretty.
    1922             :      */
    1923      787404 :     if (isCommit)
    1924             :     {
    1925             :         int         i;
    1926             : 
    1927      740264 :         for (i = 0; i < num_seq_scans; i++)
    1928             :         {
    1929           0 :             elog(WARNING, "leaked hash_seq_search scan for hash table %p",
    1930             :                  seq_scan_tables[i]);
    1931             :         }
    1932             :     }
    1933      787404 :     num_seq_scans = 0;
    1934      787404 : }
    1935             : 
    1936             : /* Clean up any open scans at end of subtransaction */
    1937             : void
    1938       20096 : AtEOSubXact_HashTables(bool isCommit, int nestDepth)
    1939             : {
    1940             :     int         i;
    1941             : 
    1942             :     /*
    1943             :      * Search backward to make cleanup easy.  Note we must check all entries,
    1944             :      * not only those at the end of the array, because deletion technique
    1945             :      * doesn't keep them in order.
    1946             :      */
    1947       20096 :     for (i = num_seq_scans - 1; i >= 0; i--)
    1948             :     {
    1949           0 :         if (seq_scan_level[i] >= nestDepth)
    1950             :         {
    1951           0 :             if (isCommit)
    1952           0 :                 elog(WARNING, "leaked hash_seq_search scan for hash table %p",
    1953             :                      seq_scan_tables[i]);
    1954           0 :             seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
    1955           0 :             seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
    1956           0 :             num_seq_scans--;
    1957             :         }
    1958             :     }
    1959       20096 : }

Generated by: LCOV version 1.14