LCOV - code coverage report
Current view: top level - src/backend/utils/hash - dynahash.c (source / functions)
Test:         PostgreSQL 19devel
Test Date:    2026-02-17 17:20:33
Coverage:     Lines: 83.9 % (432 of 515)    Functions: 91.9 % (34 of 37)

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * dynahash.c
       4              :  *    dynamic chained hash tables
       5              :  *
       6              :  * dynahash.c supports both local-to-a-backend hash tables and hash tables in
       7              :  * shared memory.  For shared hash tables, it is the caller's responsibility
       8              :  * to provide appropriate access interlocking.  The simplest convention is
       9              :  * that a single LWLock protects the whole hash table.  Searches (HASH_FIND or
      10              :  * hash_seq_search) need only shared lock, but any update requires exclusive
      11              :  * lock.  For heavily-used shared tables, the single-lock approach creates a
      12              :  * concurrency bottleneck, so we also support "partitioned" locking wherein
      13              :  * there are multiple LWLocks guarding distinct subsets of the table.  To use
      14              :  * a hash table in partitioned mode, the HASH_PARTITION flag must be given
      15              :  * to hash_create.  This prevents any attempt to split buckets on-the-fly.
      16              :  * Therefore, each hash bucket chain operates independently, and no fields
      17              :  * of the hash header change after init except nentries and freeList.
      18              :  * (A partitioned table uses multiple copies of those fields, guarded by
      19              :  * spinlocks, for additional concurrency.)
      20              :  * This lets any subset of the hash buckets be treated as a separately
      21              :  * lockable partition.  We expect callers to use the low-order bits of a
      22              :  * lookup key's hash value as a partition number --- this will work because
      23              :  * of the way calc_bucket() maps hash values to bucket numbers.
      24              :  *
      25              :  * The memory allocator function should match malloc's semantics of returning
      26              :  * NULL on failure.  (This is essential for hash tables in shared memory.
      27              :  * For hash tables in local memory, we used to use palloc() which will throw
      28              :  * error on failure; but we no longer do, so it's untested whether this
      29              :  * module will still cope with that behavior.)
      30              :  *
      31              :  * dynahash.c provides support for these types of lookup keys:
      32              :  *
      33              :  * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
      34              :  * compared as though by strcmp().  This is selected by specifying the
      35              :  * HASH_STRINGS flag to hash_create.
      36              :  *
      37              :  * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
      38              :  * (Caller must ensure there are no undefined padding bits in the keys!)
      39              :  * This is selected by specifying the HASH_BLOBS flag to hash_create.
      40              :  *
      41              :  * 3. More complex key behavior can be selected by specifying user-supplied
      42              :  * hashing, comparison, and/or key-copying functions.  At least a hashing
      43              :  * function must be supplied; comparison defaults to memcmp() and key copying
      44              :  * to memcpy() when a user-defined hashing function is selected.
      45              :  *
      46              :  * Compared to simplehash, dynahash has the following benefits:
      47              :  *
      48              :  * - It supports partitioning, which is useful for shared memory access using
      49              :  *   locks.
      50              :  * - Shared memory hashes are allocated in a fixed size area at startup and
      51              :  *   are discoverable by name from other processes.
      52              :  * - Because entries don't need to be moved in the case of hash conflicts,
      53              :  *   dynahash has better performance for large entries.
      54              :  * - Guarantees stable pointers to entries.
      55              :  *
      56              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      57              :  * Portions Copyright (c) 1994, Regents of the University of California
      58              :  *
      59              :  *
      60              :  * IDENTIFICATION
      61              :  *    src/backend/utils/hash/dynahash.c
      62              :  *
      63              :  *-------------------------------------------------------------------------
      64              :  */
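/*
 * Illustrative sketch (not part of dynahash.c) of the partitioned-locking
 * convention described above.  Every name here (MyTag, my_hash_table,
 * my_partition_locks, MY_NUM_PARTITIONS) is hypothetical; the backend's
 * lock and buffer managers follow the same pattern with their own names.
 * Assumes storage/lwlock.h for the LWLock calls.
 */
typedef struct MyTag { uint32 id; } MyTag;

#define MY_NUM_PARTITIONS 16            /* must be a power of 2 */

static HTAB *my_hash_table;
static LWLock *my_partition_locks[MY_NUM_PARTITIONS];

static bool
my_exists(const MyTag *tag)
{
    /* low-order bits of the hash value select the partition */
    uint32      hashcode = get_hash_value(my_hash_table, tag);
    LWLock     *lock = my_partition_locks[hashcode % MY_NUM_PARTITIONS];
    bool        found;

    LWLockAcquire(lock, LW_SHARED);     /* shared mode suffices for HASH_FIND */
    (void) hash_search_with_hash_value(my_hash_table, tag, hashcode,
                                       HASH_FIND, &found);
    LWLockRelease(lock);
    return found;
}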
      65              : 
      66              : /*
      67              :  * Original comments:
      68              :  *
      69              :  * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
      70              :  * Coded into C, with minor code improvements, and with hsearch(3) interface,
      71              :  * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
      72              :  * also, hcreate/hdestroy routines added to simulate hsearch(3).
      73              :  *
      74              :  * These routines simulate hsearch(3) and family, with the important
      75              :  * difference that the hash table is dynamic - can grow indefinitely
      76              :  * beyond its original size (as supplied to hcreate()).
      77              :  *
      78              :  * Performance appears to be comparable to that of hsearch(3).
      79              :  * The 'source-code' options referred to in hsearch(3)'s 'man' page
      80              :  * are not implemented; otherwise functionality is identical.
      81              :  *
      82              :  * Compilation controls:
      83              :  * HASH_STATISTICS causes some usage statistics to be maintained, which can be
      84              :  * logged by calling hash_stats().
      85              :  *
      86              :  * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
      87              :  * concatenation property, in probably unnecessary code 'optimization'.
      88              :  *
      89              :  * Modified margo@postgres.berkeley.edu February 1990
      90              :  *      added multiple table interface
      91              :  * Modified by sullivan@postgres.berkeley.edu April 1990
      92              :  *      changed ctl structure for shared memory
      93              :  */
      94              : 
      95              : #include "postgres.h"
      96              : 
      97              : #include <limits.h>
      98              : 
      99              : #include "access/xact.h"
     100              : #include "common/hashfn.h"
     101              : #include "lib/ilist.h"
     102              : #include "port/pg_bitutils.h"
     103              : #include "storage/shmem.h"
     104              : #include "storage/spin.h"
     105              : #include "utils/memutils.h"
     106              : 
     107              : 
     108              : /*
     109              :  * Constants
     110              :  *
     111              :  * A hash table has a top-level "directory", each of whose entries points
     112              :  * to a "segment" of ssize bucket headers.  The maximum number of hash
     113              :  * buckets is thus dsize * ssize (but dsize may be expansible).  Of course,
     114              :  * the number of records in the table can be larger, but we don't want a
     115              :  * whole lot of records per bucket or performance goes down.
     116              :  *
     117              :  * In a hash table allocated in shared memory, the directory cannot be
     118              :  * expanded because it must stay at a fixed address.  The directory size
     119              :  * should be selected using hash_select_dirsize (and you'd better have
     120              :  * a good idea of the maximum number of entries!).  For non-shared hash
     121              :  * tables, the initial directory size can be left at the default.
     122              :  */
     123              : #define DEF_SEGSIZE            256
     124              : #define DEF_SEGSIZE_SHIFT      8    /* must be log2(DEF_SEGSIZE) */
     125              : #define DEF_DIRSIZE            256
     126              : 
     127              : /* Number of freelists to be used for a partitioned hash table. */
     128              : #define NUM_FREELISTS           32
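/*
 * Illustrative helper (not part of dynahash.c; HTAB and HASHBUCKET are
 * defined just below): the two-level addressing the comment above
 * describes.  The high bits of a bucket number pick the directory slot
 * (segment), the low bits pick the bucket header within that segment;
 * this mirrors what hash_initial_lookup() does later in this file.  With
 * the defaults, DEF_DIRSIZE * DEF_SEGSIZE = 256 * 256 = 65536 buckets are
 * addressable before the directory itself has to grow.
 */
static HASHBUCKET *
example_bucket_ptr(HTAB *hashp, uint32 bucket)
{
    int64       segment_num = bucket >> hashp->sshift;
    int64       segment_ndx = bucket & (hashp->ssize - 1);

    return &hashp->dir[segment_num][segment_ndx];
}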
     129              : 
     130              : /* A hash bucket is a linked list of HASHELEMENTs */
     131              : typedef HASHELEMENT *HASHBUCKET;
     132              : 
     133              : /* A hash segment is an array of bucket headers */
     134              : typedef HASHBUCKET *HASHSEGMENT;
     135              : 
     136              : /*
     137              :  * Per-freelist data.
     138              :  *
     139              :  * In a partitioned hash table, each freelist is associated with a specific
     140              :  * set of hashcodes, as determined by the FREELIST_IDX() macro below.
     141              :  * nentries tracks the number of live hashtable entries having those hashcodes
     142              :  * (NOT the number of entries in the freelist, as you might expect).
     143              :  *
     144              :  * The coverage of a freelist might be more or less than one partition, so it
     145              :  * needs its own lock rather than relying on caller locking.  Relying on that
     146              :  * wouldn't work even if the coverage was the same, because of the occasional
     147              :  * need to "borrow" entries from another freelist; see get_hash_entry().
     148              :  *
     149              :  * Using an array of FreeListData instead of separate arrays of mutexes,
     150              :  * nentries and freeLists helps to reduce sharing of cache lines between
     151              :  * different mutexes.
     152              :  */
     153              : typedef struct
     154              : {
     155              :     slock_t     mutex;          /* spinlock for this freelist */
     156              :     int64       nentries;       /* number of entries in associated buckets */
     157              :     HASHELEMENT *freeList;      /* chain of free elements */
     158              : } FreeListData;
     159              : 
     160              : /*
     161              :  * Header structure for a hash table --- contains all changeable info
     162              :  *
     163              :  * In a shared-memory hash table, the HASHHDR is in shared memory, while
     164              :  * each backend has a local HTAB struct.  For a non-shared table, there isn't
     165              :  * any functional difference between HASHHDR and HTAB, but we separate them
     166              :  * anyway to share code between shared and non-shared tables.
     167              :  */
     168              : struct HASHHDR
     169              : {
     170              :     /*
     171              :      * The freelist can become a point of contention in high-concurrency hash
     172              :      * tables, so we use an array of freelists, each with its own mutex and
     173              :      * nentries count, instead of just a single one.  Although the freelists
     174              :      * normally operate independently, we will scavenge entries from freelists
     175              :      * other than a hashcode's default freelist when necessary.
     176              :      *
     177              :      * If the hash table is not partitioned, only freeList[0] is used and its
     178              :      * spinlock is not used at all; callers' locking is assumed sufficient.
     179              :      */
     180              :     FreeListData freeList[NUM_FREELISTS];
     181              : 
     182              :     /* These fields can change, but not in a partitioned table */
     183              :     /* Also, dsize can't change in a shared table, even if unpartitioned */
     184              :     int64       dsize;          /* directory size */
     185              :     int64       nsegs;          /* number of allocated segments (<= dsize) */
     186              :     uint32      max_bucket;     /* ID of maximum bucket in use */
     187              :     uint32      high_mask;      /* mask to modulo into entire table */
     188              :     uint32      low_mask;       /* mask to modulo into lower half of table */
     189              : 
     190              :     /* These fields are fixed at hashtable creation */
     191              :     Size        keysize;        /* hash key length in bytes */
     192              :     Size        entrysize;      /* total user element size in bytes */
     193              :     int64       num_partitions; /* # partitions (must be power of 2), or 0 */
     194              :     int64       max_dsize;      /* 'dsize' limit if directory is fixed size */
     195              :     int64       ssize;          /* segment size --- must be power of 2 */
     196              :     int         sshift;         /* segment shift = log2(ssize) */
     197              :     int         nelem_alloc;    /* number of entries to allocate at once */
     198              :     bool        isfixed;        /* if true, don't enlarge */
     199              : 
     200              : #ifdef HASH_STATISTICS
     201              : 
     202              :     /*
     203              :      * Count statistics here.  NB: stats code doesn't bother with mutex, so
     204              :      * counts could be corrupted a bit in a partitioned table.
     205              :      */
     206              :     uint64      accesses;
     207              :     uint64      collisions;
     208              :     uint64      expansions;
     209              : #endif
     210              : };
     211              : 
     212              : #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
     213              : 
     214              : #define FREELIST_IDX(hctl, hashcode) \
     215              :     (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
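/*
 * Example with hypothetical values: in a partitioned table, hashcode
 * 0xDEADBEEF selects freelist 0xDEADBEEF % 32 = 15, so entries with that
 * hashcode are allocated from, freed to, and counted in freeList[15];
 * in an unpartitioned table the same hashcode maps to freeList[0].  A
 * total entry count therefore has to sum the per-freelist counters, as
 * in this sketch (the real accessor is hash_get_num_entries()):
 */
static int64
example_total_entries(const HASHHDR *hctl)
{
    int64       total = 0;

    for (int i = 0; i < NUM_FREELISTS; i++)
        total += hctl->freeList[i].nentries;
    return total;
}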
     216              : 
     217              : /*
     218              :  * Top control structure for a hashtable --- in a shared table, each backend
     219              :  * has its own copy (OK since no fields change at runtime)
     220              :  */
     221              : struct HTAB
     222              : {
     223              :     HASHHDR    *hctl;           /* => shared control information */
     224              :     HASHSEGMENT *dir;           /* directory of segment starts */
     225              :     HashValueFunc hash;         /* hash function */
     226              :     HashCompareFunc match;      /* key comparison function */
     227              :     HashCopyFunc keycopy;       /* key copying function */
     228              :     HashAllocFunc alloc;        /* memory allocator */
     229              :     MemoryContext hcxt;         /* memory context if default allocator used */
     230              :     char       *tabname;        /* table name (for error messages) */
     231              :     bool        isshared;       /* true if table is in shared memory */
     232              : 
     233              :     /* freezing a shared table isn't allowed, so we can keep state here */
     234              :     bool        frozen;         /* true = no more inserts allowed */
     235              : 
     236              :     /* We keep local copies of these fixed values to reduce contention */
     237              :     Size        keysize;        /* hash key length in bytes */
     238              :     int64       ssize;          /* segment size --- must be power of 2 */
     239              :     int         sshift;         /* segment shift = log2(ssize) */
     240              : 
     241              :     /*
     242              :      * In a USE_VALGRIND build, non-shared hashtables keep an slist chain of
     243              :      * all the element blocks they have allocated.  This pacifies Valgrind,
     244              :      * which would otherwise often claim that the element blocks are "possibly
     245              :      * lost" for lack of any non-interior pointers to their starts.
     246              :      */
     247              : #ifdef USE_VALGRIND
     248              :     slist_head  element_blocks;
     249              : #endif
     250              : };
     251              : 
     252              : /*
     253              :  * Key (also entry) part of a HASHELEMENT
     254              :  */
     255              : #define ELEMENTKEY(helem)  (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
     256              : 
     257              : /*
     258              :  * Obtain element pointer given pointer to key
     259              :  */
     260              : #define ELEMENT_FROM_KEY(key)  \
     261              :     ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))
     262              : 
     263              : /*
     264              :  * Fast MOD arithmetic, assuming that y is a power of 2 !
     265              :  */
     266              : #define MOD(x,y)               ((x) & ((y)-1))
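/*
 * Memory layout behind ELEMENTKEY/ELEMENT_FROM_KEY (illustrative): each
 * element is a MAXALIGN'd HASHELEMENT header immediately followed by the
 * user's entry, whose first keysize bytes are the key.
 *
 *   +---------------------+------------------------------------+
 *   | HASHELEMENT header  | user entry (key, then payload)     |
 *   +---------------------+------------------------------------+
 *   ^ helem               ^ ELEMENTKEY(helem)
 *
 * ELEMENT_FROM_KEY() just inverts ELEMENTKEY(), so
 * ELEMENT_FROM_KEY(ELEMENTKEY(helem)) == helem.  Because elements are
 * never moved once allocated, the user-entry pointers handed out by
 * hash_search() remain stable for the life of the entry.
 */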
     267              : 
     268              : /*
     269              :  * Private function prototypes
     270              :  */
     271              : static void *DynaHashAlloc(Size size);
     272              : static HASHSEGMENT seg_alloc(HTAB *hashp);
     273              : static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
     274              : static bool dir_realloc(HTAB *hashp);
     275              : static bool expand_table(HTAB *hashp);
     276              : static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
     277              : static void hdefault(HTAB *hashp);
     278              : static int  choose_nelem_alloc(Size entrysize);
     279              : static bool init_htab(HTAB *hashp, int64 nelem);
     280              : pg_noreturn static void hash_corrupted(HTAB *hashp);
     281              : static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue,
     282              :                                   HASHBUCKET **bucketptr);
     283              : static int  my_log2(int64 num);
     284              : static int64 next_pow2_int64(int64 num);
     285              : static int  next_pow2_int(int64 num);
     286              : static void register_seq_scan(HTAB *hashp);
     287              : static void deregister_seq_scan(HTAB *hashp);
     288              : static bool has_seq_scans(HTAB *hashp);
     289              : 
     290              : 
     291              : /*
     292              :  * memory allocation support
     293              :  */
     294              : static MemoryContext CurrentDynaHashCxt = NULL;
     295              : 
     296              : static void *
     297      1608218 : DynaHashAlloc(Size size)
     298              : {
     299              :     Assert(MemoryContextIsValid(CurrentDynaHashCxt));
     300      1608218 :     return MemoryContextAllocExtended(CurrentDynaHashCxt, size,
     301              :                                       MCXT_ALLOC_NO_OOM);
     302              : }
     303              : 
     304              : 
     305              : /*
     306              :  * HashCompareFunc for string keys
     307              :  *
     308              :  * Because we copy keys with strlcpy(), they will be truncated at keysize-1
     309              :  * bytes, so we can only compare that many ... hence strncmp is almost but
     310              :  * not quite the right thing.
     311              :  */
     312              : static int
     313       518274 : string_compare(const char *key1, const char *key2, Size keysize)
     314              : {
     315       518274 :     return strncmp(key1, key2, keysize - 1);
     316              : }
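/*
 * Truncation example (hypothetical keysize = 8): the keys "absolute" and
 * "absolutely" are both stored as "absolut" (7 characters plus NUL),
 * since strlcpy() keeps only keysize - 1 bytes.  A probe key supplied by
 * the caller is NOT truncated, so comparing a full keysize bytes would
 * wrongly distinguish the probe "absolutely" from its stored form;
 * strncmp() over keysize - 1 = 7 bytes matches the truncation exactly.
 */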
     317              : 
     318              : 
     319              : /************************** CREATE ROUTINES **********************/
     320              : 
     321              : /*
     322              :  * hash_create -- create a new dynamic hash table
     323              :  *
     324              :  *  tabname: a name for the table (for debugging purposes)
     325              :  *  nelem: maximum number of elements expected
     326              :  *  *info: additional table parameters, as indicated by flags
     327              :  *  flags: bitmask indicating which parameters to take from *info
     328              :  *
     329              :  * The flags value *must* include HASH_ELEM.  (Formerly, this was nominally
     330              :  * optional, but the default keysize and entrysize values were useless.)
     331              :  * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS,
     332              :  * or HASH_FUNCTION, to define the key hashing semantics (C strings,
     333              :  * binary blobs, or custom, respectively).  Callers specifying a custom
     334              :  * hash function will likely also want to use HASH_COMPARE, and perhaps
     335              :  * also HASH_KEYCOPY, to control key comparison and copying.
     336              :  * Another often-used flag is HASH_CONTEXT, to allocate the hash table
     337              :  * under info->hcxt rather than under TopMemoryContext; the default
     338              :  * behavior is only suitable for session-lifespan hash tables.
     339              :  * Other flags bits are special-purpose and seldom used, except for those
     340              :  * associated with shared-memory hash tables, for which see ShmemInitHash().
     341              :  *
     342              :  * Fields in *info are read only when the associated flags bit is set.
     343              :  * It is not necessary to initialize other fields of *info.
     344              :  * Neither tabname nor *info need persist after the hash_create() call.
     345              :  *
     346              :  * Note: It is deprecated for callers of hash_create() to explicitly specify
     347              :  * string_hash, tag_hash, uint32_hash, or oid_hash.  Just set HASH_STRINGS or
     348              :  * HASH_BLOBS.  Use HASH_FUNCTION only when you want something other than
     349              :  * one of these.
     350              :  *
     351              :  * Note: for a shared-memory hashtable, nelem needs to be a pretty good
     352              :  * estimate, since we can't expand the table on the fly.  But an unshared
     353              :  * hashtable can be expanded on-the-fly, so it's better for nelem to be
     354              :  * on the small side and let the table grow if it's exceeded.  An overly
     355              :  * large nelem will penalize hash_seq_search speed without buying much.
     356              :  */
     357              : HTAB *
     358       373779 : hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
     359              : {
     360              :     HTAB       *hashp;
     361              :     HASHHDR    *hctl;
     362              : 
     363              :     /*
     364              :      * Hash tables now allocate space for key and data, but you have to say
     365              :      * how much space to allocate.
     366              :      */
     367              :     Assert(flags & HASH_ELEM);
     368              :     Assert(info->keysize > 0);
     369              :     Assert(info->entrysize >= info->keysize);
     370              : 
     371              :     /*
     372              :      * For shared hash tables, we have a local hash header (HTAB struct) that
     373              :      * we allocate in TopMemoryContext; all else is in shared memory.
     374              :      *
     375              :      * For non-shared hash tables, everything including the hash header is in
     376              :      * a memory context created specially for the hash table --- this makes
     377              :      * hash_destroy very simple.  The memory context is made a child of either
     378              :      * a context specified by the caller, or TopMemoryContext if nothing is
     379              :      * specified.
     380              :      */
     381       373779 :     if (flags & HASH_SHARED_MEM)
     382              :     {
     383              :         /* Set up to allocate the hash header */
     384        10267 :         CurrentDynaHashCxt = TopMemoryContext;
     385              :     }
     386              :     else
     387              :     {
     388              :         /* Create the hash table's private memory context */
     389       363512 :         if (flags & HASH_CONTEXT)
     390       202315 :             CurrentDynaHashCxt = info->hcxt;
     391              :         else
     392       161197 :             CurrentDynaHashCxt = TopMemoryContext;
     393       363512 :         CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
     394              :                                                    "dynahash",
     395              :                                                    ALLOCSET_DEFAULT_SIZES);
     396              :     }
     397              : 
     398              :     /* Initialize the hash header, plus a copy of the table name */
     399       373779 :     hashp = (HTAB *) MemoryContextAlloc(CurrentDynaHashCxt,
     400       373779 :                                         sizeof(HTAB) + strlen(tabname) + 1);
     401      4859127 :     MemSet(hashp, 0, sizeof(HTAB));
     402              : 
     403       373779 :     hashp->tabname = (char *) (hashp + 1);
     404       373779 :     strcpy(hashp->tabname, tabname);
     405              : 
     406              :     /* If we have a private context, label it with hashtable's name */
     407       373779 :     if (!(flags & HASH_SHARED_MEM))
     408       363512 :         MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname);
     409              : 
     410              :     /*
     411              :      * Select the appropriate hash function (see comments at head of file).
     412              :      */
     413       373779 :     if (flags & HASH_FUNCTION)
     414              :     {
     415              :         Assert(!(flags & (HASH_BLOBS | HASH_STRINGS)));
     416        14520 :         hashp->hash = info->hash;
     417              :     }
     418       359259 :     else if (flags & HASH_BLOBS)
     419              :     {
     420              :         Assert(!(flags & HASH_STRINGS));
     421              :         /* We can optimize hashing for common key sizes */
     422       305200 :         if (info->keysize == sizeof(uint32))
     423       217522 :             hashp->hash = uint32_hash;
     424              :         else
     425        87678 :             hashp->hash = tag_hash;
     426              :     }
     427              :     else
     428              :     {
     429              :         /*
     430              :          * string_hash used to be considered the default hash method, and in a
     431              :          * non-assert build it effectively still is.  But we now consider it
     432              :          * an assertion error to not say HASH_STRINGS explicitly.  To help
     433              :          * catch mistaken usage of HASH_STRINGS, we also insist on a
     434              :          * reasonably long string length: if the keysize is only 4 or 8 bytes,
     435              :          * it's almost certainly an integer or pointer not a string.
     436              :          */
     437              :         Assert(flags & HASH_STRINGS);
     438              :         Assert(info->keysize > 8);
     439              : 
     440        54059 :         hashp->hash = string_hash;
     441              :     }
     442              : 
     443              :     /*
     444              :      * If you don't specify a match function, it defaults to string_compare if
     445              :      * you used string_hash, and to memcmp otherwise.
     446              :      *
     447              :      * Note: explicitly specifying string_hash is deprecated, because this
     448              :      * might not work for callers in loadable modules on some platforms due to
     449              :      * referencing a trampoline instead of the string_hash function proper.
     450              :      * Specify HASH_STRINGS instead.
     451              :      */
     452       373779 :     if (flags & HASH_COMPARE)
     453         7643 :         hashp->match = info->match;
     454       366136 :     else if (hashp->hash == string_hash)
     455        54059 :         hashp->match = (HashCompareFunc) string_compare;
     456              :     else
     457       312077 :         hashp->match = memcmp;
     458              : 
     459              :     /*
     460              :      * Similarly, the key-copying function defaults to strlcpy or memcpy.
     461              :      */
     462       373779 :     if (flags & HASH_KEYCOPY)
     463            0 :         hashp->keycopy = info->keycopy;
     464       373779 :     else if (hashp->hash == string_hash)
     465              :     {
     466              :         /*
     467              :          * The signature of keycopy is meant for memcpy(), which returns
     468              :          * void*, but strlcpy() returns size_t.  Since we never use the return
     469              :          * value of keycopy, and size_t is pretty much always the same size as
     470              :          * void *, this should be safe.  The extra cast in the middle is to
     471              :          * avoid warnings from -Wcast-function-type.
     472              :          */
     473        54059 :         hashp->keycopy = (HashCopyFunc) (pg_funcptr_t) strlcpy;
     474              :     }
     475              :     else
     476       319720 :         hashp->keycopy = memcpy;
     477              : 
     478              :     /* And select the entry allocation function, too. */
     479       373779 :     if (flags & HASH_ALLOC)
     480        10267 :         hashp->alloc = info->alloc;
     481              :     else
     482       363512 :         hashp->alloc = DynaHashAlloc;
     483              : 
     484       373779 :     if (flags & HASH_SHARED_MEM)
     485              :     {
     486              :         /*
     487              :          * ctl structure and directory are preallocated for shared memory
     488              :          * tables.  Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
     489              :          * well.
     490              :          */
     491        10267 :         hashp->hctl = info->hctl;
     492        10267 :         hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
     493        10267 :         hashp->hcxt = NULL;
     494        10267 :         hashp->isshared = true;
     495              : 
     496              :         /* hash table already exists, we're just attaching to it */
     497        10267 :         if (flags & HASH_ATTACH)
     498              :         {
     499              :             /* make local copies of some heavily-used values */
     500            0 :             hctl = hashp->hctl;
     501            0 :             hashp->keysize = hctl->keysize;
     502            0 :             hashp->ssize = hctl->ssize;
     503            0 :             hashp->sshift = hctl->sshift;
     504              : 
     505            0 :             return hashp;
     506              :         }
     507              :     }
     508              :     else
     509              :     {
     510              :         /* setup hash table defaults */
     511       363512 :         hashp->hctl = NULL;
     512       363512 :         hashp->dir = NULL;
     513       363512 :         hashp->hcxt = CurrentDynaHashCxt;
     514       363512 :         hashp->isshared = false;
     515              :     }
     516              : 
     517       373779 :     if (!hashp->hctl)
     518              :     {
     519       363512 :         hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
     520       363512 :         if (!hashp->hctl)
     521            0 :             ereport(ERROR,
     522              :                     (errcode(ERRCODE_OUT_OF_MEMORY),
     523              :                      errmsg("out of memory")));
     524              :     }
     525              : 
     526       373779 :     hashp->frozen = false;
     527              : 
     528       373779 :     hdefault(hashp);
     529              : 
     530       373779 :     hctl = hashp->hctl;
     531              : 
     532       373779 :     if (flags & HASH_PARTITION)
     533              :     {
     534              :         /* Doesn't make sense to partition a local hash table */
     535              :         Assert(flags & HASH_SHARED_MEM);
     536              : 
     537              :         /*
     538              :          * The number of partitions had better be a power of 2. Also, it must
     539              :          * be less than INT_MAX (see init_htab()), so call the int version of
     540              :          * next_pow2.
     541              :          */
     542              :         Assert(info->num_partitions == next_pow2_int(info->num_partitions));
     543              : 
     544         5700 :         hctl->num_partitions = info->num_partitions;
     545              :     }
     546              : 
     547       373779 :     if (flags & HASH_SEGMENT)
     548              :     {
     549            0 :         hctl->ssize = info->ssize;
     550            0 :         hctl->sshift = my_log2(info->ssize);
     551              :         /* ssize had better be a power of 2 */
     552              :         Assert(hctl->ssize == (1L << hctl->sshift));
     553              :     }
     554              : 
     555              :     /*
     556              :      * SHM hash tables have fixed directory size passed by the caller.
     557              :      */
     558       373779 :     if (flags & HASH_DIRSIZE)
     559              :     {
     560        10267 :         hctl->max_dsize = info->max_dsize;
     561        10267 :         hctl->dsize = info->dsize;
     562              :     }
     563              : 
     564              :     /* remember the entry sizes, too */
     565       373779 :     hctl->keysize = info->keysize;
     566       373779 :     hctl->entrysize = info->entrysize;
     567              : 
     568              :     /* make local copies of heavily-used constant fields */
     569       373779 :     hashp->keysize = hctl->keysize;
     570       373779 :     hashp->ssize = hctl->ssize;
     571       373779 :     hashp->sshift = hctl->sshift;
     572              : 
     573              :     /* Build the hash directory structure */
     574       373779 :     if (!init_htab(hashp, nelem))
     575            0 :         elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
     576              : 
     577              :     /*
     578              :      * For a shared hash table, preallocate the requested number of elements.
     579              :      * This reduces problems with run-time out-of-shared-memory conditions.
     580              :      *
     581              :      * For a non-shared hash table, preallocate the requested number of
     582              :      * elements if it's less than our chosen nelem_alloc.  This avoids wasting
     583              :      * space if the caller correctly estimates a small table size.
     584              :      */
     585       373779 :     if ((flags & HASH_SHARED_MEM) ||
     586       363512 :         nelem < hctl->nelem_alloc)
     587              :     {
     588              :         int         i,
     589              :                     freelist_partitions,
     590              :                     nelem_alloc,
     591              :                     nelem_alloc_first;
     592              : 
     593              :         /*
     594              :          * If hash table is partitioned, give each freelist an equal share of
     595              :          * the initial allocation.  Otherwise only freeList[0] is used.
     596              :          */
     597       127968 :         if (IS_PARTITIONED(hashp->hctl))
     598         5700 :             freelist_partitions = NUM_FREELISTS;
     599              :         else
     600       122268 :             freelist_partitions = 1;
     601              : 
     602       127968 :         nelem_alloc = nelem / freelist_partitions;
     603       127968 :         if (nelem_alloc <= 0)
     604            0 :             nelem_alloc = 1;
     605              : 
     606              :         /*
     607              :          * Make sure we'll allocate all the requested elements; freeList[0]
     608              :          * gets the excess if the request isn't divisible by NUM_FREELISTS.
     609              :          */
     610       127968 :         if (nelem_alloc * freelist_partitions < nelem)
     611           53 :             nelem_alloc_first =
     612           53 :                 nelem - nelem_alloc * (freelist_partitions - 1);
     613              :         else
     614       127915 :             nelem_alloc_first = nelem_alloc;
     615              : 
     616       432636 :         for (i = 0; i < freelist_partitions; i++)
     617              :         {
     618       304668 :             int         temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
     619              : 
     620       304668 :             if (!element_alloc(hashp, temp, i))
     621            0 :                 ereport(ERROR,
     622              :                         (errcode(ERRCODE_OUT_OF_MEMORY),
     623              :                          errmsg("out of memory")));
     624              :         }
     625              :     }
     626              : 
     627              :     /* Set isfixed if requested, but not till after we build initial entries */
     628       373779 :     if (flags & HASH_FIXED_SIZE)
     629         4560 :         hctl->isfixed = true;
     630              : 
     631       373779 :     return hashp;
     632              : }
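/*
 * Typical usage sketch (not from this file): a backend-local table keyed
 * by Oid, allocated under a caller-supplied memory context.  MyEntry is a
 * hypothetical struct; dynahash requires that the key be the first field
 * of the entry.
 */
typedef struct MyEntry
{
    Oid         key;            /* hash key -- must be first */
    int         count;
} MyEntry;

static HTAB *
make_my_table(MemoryContext cxt)
{
    HASHCTL     ctl;

    ctl.keysize = sizeof(Oid);
    ctl.entrysize = sizeof(MyEntry);
    ctl.hcxt = cxt;
    /* 4-byte key + HASH_BLOBS means uint32_hash is chosen automatically */
    return hash_create("my entry table",    /* name, for debugging */
                       128,                 /* expected number of entries */
                       &ctl,
                       HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
}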
     633              : 
     634              : /*
     635              :  * Set default HASHHDR parameters.
     636              :  */
     637              : static void
     638       373779 : hdefault(HTAB *hashp)
     639              : {
     640       373779 :     HASHHDR    *hctl = hashp->hctl;
     641              : 
     642     40368132 :     MemSet(hctl, 0, sizeof(HASHHDR));
     643              : 
     644       373779 :     hctl->dsize = DEF_DIRSIZE;
     645       373779 :     hctl->nsegs = 0;
     646              : 
     647       373779 :     hctl->num_partitions = 0;    /* not partitioned */
     648              : 
     649              :     /* table has no fixed maximum size */
     650       373779 :     hctl->max_dsize = NO_MAX_DSIZE;
     651              : 
     652       373779 :     hctl->ssize = DEF_SEGSIZE;
     653       373779 :     hctl->sshift = DEF_SEGSIZE_SHIFT;
     654              : 
     655       373779 :     hctl->isfixed = false;       /* can be enlarged */
     656              : 
     657              : #ifdef HASH_STATISTICS
     658              :     hctl->accesses = hctl->collisions = hctl->expansions = 0;
     659              : #endif
     660       373779 : }
     661              : 
     662              : /*
     663              :  * Given the user-specified entry size, choose nelem_alloc, ie, how many
     664              :  * elements to add to the hash table when we need more.
     665              :  */
     666              : static int
     667       392929 : choose_nelem_alloc(Size entrysize)
     668              : {
     669              :     int         nelem_alloc;
     670              :     Size        elementSize;
     671              :     Size        allocSize;
     672              : 
     673              :     /* Each element has a HASHELEMENT header plus user data. */
     674              :     /* NB: this had better match element_alloc() */
     675       392929 :     elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
     676              : 
     677              :     /*
     678              :      * The idea here is to choose nelem_alloc at least 32, but round up so
     679              :      * that the allocation request will be a power of 2 or just less. This
     680              :      * makes little difference for hash tables in shared memory, but for hash
     681              :      * tables managed by palloc, the allocation request will be rounded up to
     682              :      * a power of 2 anyway.  If we fail to take this into account, we'll waste
     683              :      * as much as half the allocated space.
     684              :      */
     685       392929 :     allocSize = 32 * 4;         /* assume elementSize at least 8 */
     686              :     do
     687              :     {
     688      1514533 :         allocSize <<= 1;
     689      1514533 :         nelem_alloc = allocSize / elementSize;
     690      1514533 :     } while (nelem_alloc < 32);
     691              : 
     692       392929 :     return nelem_alloc;
     693              : }
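/*
 * Worked example (assuming 64-bit pointers, MAXALIGN = 8): for
 * entrysize = 64, elementSize = MAXALIGN(sizeof(HASHELEMENT)) + 64 =
 * 16 + 64 = 80 bytes.  allocSize doubles 128 -> 256 -> ... -> 4096,
 * where 4096 / 80 = 51 >= 32, so nelem_alloc = 51.  Each element_alloc()
 * request is then 51 * 80 = 4080 bytes, just under the 4096-byte power
 * of 2 that palloc would round the request up to anyway.
 */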
     694              : 
     695              : /*
     696              :  * Compute derived fields of hctl and build the initial directory/segment
     697              :  * arrays
     698              :  */
     699              : static bool
     700       373779 : init_htab(HTAB *hashp, int64 nelem)
     701              : {
     702       373779 :     HASHHDR    *hctl = hashp->hctl;
     703              :     HASHSEGMENT *segp;
     704              :     int         nbuckets;
     705              :     int         nsegs;
     706              :     int         i;
     707              : 
     708              :     /*
     709              :      * initialize mutexes if it's a partitioned table
     710              :      */
     711       373779 :     if (IS_PARTITIONED(hctl))
     712       188100 :         for (i = 0; i < NUM_FREELISTS; i++)
     713       182400 :             SpinLockInit(&(hctl->freeList[i].mutex));
     714              : 
     715              :     /*
     716              :      * Allocate space for the next greater power of two number of buckets,
     717              :      * assuming a desired maximum load factor of 1.
     718              :      */
     719       373779 :     nbuckets = next_pow2_int(nelem);
     720              : 
     721              :     /*
     722              :      * In a partitioned table, nbuckets must be at least equal to
     723              :      * num_partitions; were it less, keys with apparently different partition
     724              :      * numbers would map to the same bucket, breaking partition independence.
     725              :      * (Normally nbuckets will be much bigger; this is just a safety check.)
     726              :      */
     727       373779 :     while (nbuckets < hctl->num_partitions)
     728            0 :         nbuckets <<= 1;
     729              : 
     730       373779 :     hctl->max_bucket = hctl->low_mask = nbuckets - 1;
     731       373779 :     hctl->high_mask = (nbuckets << 1) - 1;
     732              : 
     733              :     /*
     734              :      * Figure number of directory segments needed, round up to a power of 2
     735              :      */
     736       373779 :     nsegs = (nbuckets - 1) / hctl->ssize + 1;
     737       373779 :     nsegs = next_pow2_int(nsegs);
     738              : 
     739              :     /*
     740              :      * Make sure directory is big enough. If pre-allocated directory is too
     741              :      * small, choke (caller screwed up).
     742              :      */
     743       373779 :     if (nsegs > hctl->dsize)
     744              :     {
     745            0 :         if (!(hashp->dir))
     746            0 :             hctl->dsize = nsegs;
     747              :         else
     748            0 :             return false;
     749              :     }
     750              : 
     751              :     /* Allocate a directory */
     752       373779 :     if (!(hashp->dir))
     753              :     {
     754       363512 :         CurrentDynaHashCxt = hashp->hcxt;
     755       363512 :         hashp->dir = (HASHSEGMENT *)
     756       363512 :             hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
     757       363512 :         if (!hashp->dir)
     758            0 :             return false;
     759              :     }
     760              : 
     761              :     /* Allocate initial segments */
     762      1064179 :     for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
     763              :     {
     764       690400 :         *segp = seg_alloc(hashp);
     765       690400 :         if (*segp == NULL)
     766            0 :             return false;
     767              :     }
     768              : 
     769              :     /* Choose number of entries to allocate at a time */
     770       373779 :     hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
     771              : 
     772       373779 :     return true;
     773              : }
     774              : 
     775              : /*
     776              :  * Estimate the space needed for a hashtable containing the given number
     777              :  * of entries of given size.
     778              :  * NOTE: this is used to estimate the footprint of hashtables in shared
     779              :  * memory; therefore it does not count HTAB which is in local memory.
     780              :  * NB: assumes that all hash structure parameters have default values!
     781              :  */
     782              : Size
     783        19150 : hash_estimate_size(int64 num_entries, Size entrysize)
     784              : {
     785              :     Size        size;
     786              :     int64       nBuckets,
     787              :                 nSegments,
     788              :                 nDirEntries,
     789              :                 nElementAllocs,
     790              :                 elementSize,
     791              :                 elementAllocCnt;
     792              : 
     793              :     /* estimate number of buckets wanted */
     794        19150 :     nBuckets = next_pow2_int64(num_entries);
     795              :     /* # of segments needed for nBuckets */
     796        19150 :     nSegments = next_pow2_int64((nBuckets - 1) / DEF_SEGSIZE + 1);
     797              :     /* directory entries */
     798        19150 :     nDirEntries = DEF_DIRSIZE;
     799        19150 :     while (nDirEntries < nSegments)
     800            0 :         nDirEntries <<= 1;        /* dir_alloc doubles dsize at each call */
     801              : 
     802              :     /* fixed control info */
     803        19150 :     size = MAXALIGN(sizeof(HASHHDR));   /* but not HTAB, per above */
     804              :     /* directory */
     805        19150 :     size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
     806              :     /* segments */
     807        19150 :     size = add_size(size, mul_size(nSegments,
     808              :                                    MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
     809              :     /* elements --- allocated in groups of choose_nelem_alloc() entries */
     810        19150 :     elementAllocCnt = choose_nelem_alloc(entrysize);
     811        19150 :     nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
     812        19150 :     elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
     813        19150 :     size = add_size(size,
     814              :                     mul_size(nElementAllocs,
     815              :                              mul_size(elementAllocCnt, elementSize)));
     816              : 
     817        19150 :     return size;
     818              : }
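/*
 * Worked example (64-bit, MAXALIGN = 8): num_entries = 1000 and
 * entrysize = 64 give nBuckets = 1024, nSegments = next_pow2(1023/256
 * + 1) = 4, and nDirEntries = DEF_DIRSIZE = 256.  With elementSize = 80
 * and elementAllocCnt = 51 (see choose_nelem_alloc above),
 * nElementAllocs = (1000 - 1) / 51 + 1 = 20, so the element space is
 * 20 * 51 * 80 = 81600 bytes, on top of the HASHHDR, the 256-pointer
 * directory, and 4 segments of 256 bucket pointers each.
 */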
     819              : 
     820              : /*
     821              :  * Select an appropriate directory size for a hashtable with the given
     822              :  * maximum number of entries.
     823              :  * This is only needed for hashtables in shared memory, whose directories
     824              :  * cannot be expanded dynamically.
     825              :  * NB: assumes that all hash structure parameters have default values!
     826              :  *
     827              :  * XXX this had better agree with the behavior of init_htab()...
     828              :  */
     829              : int64
     830        10267 : hash_select_dirsize(int64 num_entries)
     831              : {
     832              :     int64       nBuckets,
     833              :                 nSegments,
     834              :                 nDirEntries;
     835              : 
     836              :     /* estimate number of buckets wanted */
     837        10267 :     nBuckets = next_pow2_int64(num_entries);
     838              :     /* # of segments needed for nBuckets */
     839        10267 :     nSegments = next_pow2_int64((nBuckets - 1) / DEF_SEGSIZE + 1);
     840              :     /* directory entries */
     841        10267 :     nDirEntries = DEF_DIRSIZE;
     842        10267 :     while (nDirEntries < nSegments)
     843            0 :         nDirEntries <<= 1;        /* dir_alloc doubles dsize at each call */
     844              : 
     845        10267 :     return nDirEntries;
     846              : }
     847              : 
     848              : /*
     849              :  * Compute the required initial memory allocation for a shared-memory
     850              :  * hashtable with the given parameters.  We need space for the HASHHDR
      851              :  * and for the (non-expansible) directory.
     852              :  */
     853              : Size
     854        10267 : hash_get_shared_size(HASHCTL *info, int flags)
     855              : {
     856              :     Assert(flags & HASH_DIRSIZE);
     857              :     Assert(info->dsize == info->max_dsize);
     858        10267 :     return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
     859              : }
     860              : 
     861              : 
     862              : /********************** DESTROY ROUTINES ************************/
     863              : 
     864              : void
     865        64794 : hash_destroy(HTAB *hashp)
     866              : {
     867        64794 :     if (hashp != NULL)
     868              :     {
     869              :         /* allocation method must be one we know how to free, too */
     870              :         Assert(hashp->alloc == DynaHashAlloc);
     871              :         /* so this hashtable must have its own context */
     872              :         Assert(hashp->hcxt != NULL);
     873              : 
     874        64794 :         hash_stats(__func__, hashp);
     875              : 
     876              :         /*
     877              :          * Free everything by destroying the hash table's memory context.
     878              :          */
     879        64794 :         MemoryContextDelete(hashp->hcxt);
     880              :     }
     881        64794 : }
     882              : 
     883              : void
     884        64794 : hash_stats(const char *caller, HTAB *hashp)
     885              : {
     886              : #ifdef HASH_STATISTICS
     887              :     HASHHDR    *hctl = hashp->hctl;
     888              : 
     889              :     elog(DEBUG4,
     890              :          "hash_stats:  Caller: %s  Table Name: \"%s\"  Accesses: " UINT64_FORMAT "  Collisions: " UINT64_FORMAT "  Expansions: " UINT64_FORMAT "  Entries: " INT64_FORMAT "  Key Size: %zu  Max Bucket: %u  Segment Count: " INT64_FORMAT,
     891              :          caller != NULL ? caller : "(unknown)", hashp->tabname, hctl->accesses,
     892              :          hctl->collisions, hctl->expansions, hash_get_num_entries(hashp),
     893              :          hctl->keysize, hctl->max_bucket, hctl->nsegs);
     894              : #endif
     895        64794 : }
     896              : 
      897              : /****************************** SEARCH ROUTINES *****************************/
     898              : 
     899              : 
     900              : /*
     901              :  * get_hash_value -- exported routine to calculate a key's hash value
     902              :  *
     903              :  * We export this because for partitioned tables, callers need to compute
     904              :  * the partition number (from the low-order bits of the hash value) before
     905              :  * searching.
     906              :  */
     907              : uint32
     908     84645550 : get_hash_value(HTAB *hashp, const void *keyPtr)
     909              : {
     910     84645550 :     return hashp->hash(keyPtr, hashp->keysize);
     911              : }
     912              : 
     913              : /* Convert a hash value to a bucket number */
     914              : static inline uint32
     915    198045051 : calc_bucket(HASHHDR *hctl, uint32 hash_val)
     916              : {
     917              :     uint32      bucket;
     918              : 
     919    198045051 :     bucket = hash_val & hctl->high_mask;
     920    198045051 :     if (bucket > hctl->max_bucket)
     921     95360106 :         bucket = bucket & hctl->low_mask;
     922              : 
     923    198045051 :     return bucket;
     924              : }
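/*
 * Worked example of the masking above (hypothetical state): a table that
 * started with 8 buckets and has been expanded three times has
 * max_bucket = 10, low_mask = 7 (0b0111), high_mask = 15 (0b1111).  For
 * hash value 13: 13 & 15 = 13 > max_bucket, so the key is redirected to
 * bucket 13 & 7 = 5, whose split has not happened yet.  For hash value
 * 9: 9 & 15 = 9 <= max_bucket, so bucket 9 is used directly.  This is
 * classic linear hashing: buckets split one at a time, and only keys
 * whose high-mask bucket does not exist yet fall back to the lower half.
 */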
     925              : 
     926              : /*
     927              :  * hash_search -- look up key in table and perform action
     928              :  * hash_search_with_hash_value -- same, with key's hash value already computed
     929              :  *
     930              :  * action is one of:
     931              :  *      HASH_FIND: look up key in table
     932              :  *      HASH_ENTER: look up key in table, creating entry if not present
     933              :  *      HASH_ENTER_NULL: same, but return NULL if out of memory
     934              :  *      HASH_REMOVE: look up key in table, remove entry if present
     935              :  *
     936              :  * Return value is a pointer to the element found/entered/removed if any,
     937              :  * or NULL if no match was found.  (NB: in the case of the REMOVE action,
     938              :  * the result is a dangling pointer that shouldn't be dereferenced!)
     939              :  *
     940              :  * HASH_ENTER will normally ereport a generic "out of memory" error if
     941              :  * it is unable to create a new entry.  The HASH_ENTER_NULL operation is
     942              :  * the same except it will return NULL if out of memory.
     943              :  *
     944              :  * If foundPtr isn't NULL, then *foundPtr is set true if we found an
     945              :  * existing entry in the table, false otherwise.  This is needed in the
     946              :  * HASH_ENTER case, but is redundant with the return value otherwise.
     947              :  *
     948              :  * For hash_search_with_hash_value, the hashvalue parameter must have been
     949              :  * calculated with get_hash_value().
     950              :  */
     951              : void *
     952    121159531 : hash_search(HTAB *hashp,
     953              :             const void *keyPtr,
     954              :             HASHACTION action,
     955              :             bool *foundPtr)
     956              : {
     957    121159531 :     return hash_search_with_hash_value(hashp,
     958              :                                        keyPtr,
     959    121159531 :                                        hashp->hash(keyPtr, hashp->keysize),
     960              :                                        action,
     961              :                                        foundPtr);
     962              : }
     963              : 
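/*
 * [Editor's sketch -- not part of dynahash.c.]  The canonical HASH_ENTER
 * idiom.  ExampleEntry is a hypothetical caller-defined struct for a table
 * created with HASH_BLOBS and keysize = sizeof(Oid); as with all dynahash
 * entries, the key must be the first field, and only the key is initialized
 * on insertion, so the caller fills in the payload when found is false.
 */
typedef struct ExampleEntry
{
    Oid         key;            /* hash key; MUST be first */
    int64       count;          /* payload */
} ExampleEntry;

static void
example_count_oid(HTAB *htab, Oid oid)
{
    bool        found;
    ExampleEntry *entry;

    entry = (ExampleEntry *) hash_search(htab, &oid, HASH_ENTER, &found);
    if (!found)
        entry->count = 0;       /* fresh entry: initialize payload */
    entry->count++;
}
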
     964              : void *
     965    196615878 : hash_search_with_hash_value(HTAB *hashp,
     966              :                             const void *keyPtr,
     967              :                             uint32 hashvalue,
     968              :                             HASHACTION action,
     969              :                             bool *foundPtr)
     970              : {
     971    196615878 :     HASHHDR    *hctl = hashp->hctl;
     972    196615878 :     int         freelist_idx = FREELIST_IDX(hctl, hashvalue);
     973              :     Size        keysize;
     974              :     HASHBUCKET  currBucket;
     975              :     HASHBUCKET *prevBucketPtr;
     976              :     HashCompareFunc match;
     977              : 
     978              : #ifdef HASH_STATISTICS
     979              :     hctl->accesses++;
     980              : #endif
     981              : 
     982              :     /*
     983              :      * If inserting, check if it is time to split a bucket.
     984              :      *
      985              :      * NOTE: failure to expand the table is not a fatal error; it just means
      986              :      * we have to run at a higher fill factor than we wanted.  However, if
      987              :      * we're using the palloc allocator then it will throw an error anyway on
      988              :      * out-of-memory, so we must do this before modifying the table.
     989              :      */
     990    196615878 :     if (action == HASH_ENTER || action == HASH_ENTER_NULL)
     991              :     {
     992              :         /*
      993              :          * Can't split if running in partitioned mode, nor if frozen, nor if
      994              :          * the table is the subject of any active hash_seq_search scans.
     995              :          */
     996     52733278 :         if (hctl->freeList[0].nentries > (int64) hctl->max_bucket &&
     997       375013 :             !IS_PARTITIONED(hctl) && !hashp->frozen &&
     998       375013 :             !has_seq_scans(hashp))
     999       375013 :             (void) expand_table(hashp);
    1000              :     }
    1001              : 
    1002              :     /*
    1003              :      * Do the initial lookup
    1004              :      */
    1005    196615878 :     (void) hash_initial_lookup(hashp, hashvalue, &prevBucketPtr);
    1006    196615878 :     currBucket = *prevBucketPtr;
    1007              : 
    1008              :     /*
    1009              :      * Follow collision chain looking for matching key
    1010              :      */
    1011    196615878 :     match = hashp->match;        /* save one fetch in inner loop */
    1012    196615878 :     keysize = hashp->keysize;    /* ditto */
    1013              : 
    1014    232450007 :     while (currBucket != NULL)
    1015              :     {
    1016    349185807 :         if (currBucket->hashvalue == hashvalue &&
    1017    156678451 :             match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
    1018    156673227 :             break;
    1019     35834129 :         prevBucketPtr = &(currBucket->link);
    1020     35834129 :         currBucket = *prevBucketPtr;
    1021              : #ifdef HASH_STATISTICS
    1022              :         hctl->collisions++;
    1023              : #endif
    1024              :     }
    1025              : 
    1026    196615878 :     if (foundPtr)
    1027     54363224 :         *foundPtr = (bool) (currBucket != NULL);
    1028              : 
    1029              :     /*
    1030              :      * OK, now what?
    1031              :      */
    1032    196615878 :     switch (action)
    1033              :     {
    1034    116351824 :         case HASH_FIND:
    1035    116351824 :             if (currBucket != NULL)
    1036    109636467 :                 return ELEMENTKEY(currBucket);
    1037      6715357 :             return NULL;
    1038              : 
    1039     27530776 :         case HASH_REMOVE:
    1040     27530776 :             if (currBucket != NULL)
    1041              :             {
    1042              :                 /* if partitioned, must lock to touch nentries and freeList */
    1043     27528767 :                 if (IS_PARTITIONED(hctl))
    1044      5906775 :                     SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
    1045              : 
     1046              :                 /* decrement the appropriate nentries counter. */
    1047              :                 Assert(hctl->freeList[freelist_idx].nentries > 0);
    1048     27528767 :                 hctl->freeList[freelist_idx].nentries--;
    1049              : 
    1050              :                 /* remove record from hash bucket's chain. */
    1051     27528767 :                 *prevBucketPtr = currBucket->link;
    1052              : 
    1053              :                 /* add the record to the appropriate freelist. */
    1054     27528767 :                 currBucket->link = hctl->freeList[freelist_idx].freeList;
    1055     27528767 :                 hctl->freeList[freelist_idx].freeList = currBucket;
    1056              : 
    1057     27528767 :                 if (IS_PARTITIONED(hctl))
    1058      5906775 :                     SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
    1059              : 
    1060              :                 /*
     1061              :                  * The caller had better be synchronizing access to this
     1062              :                  * element, because someone else will reuse it the next time
     1063              :                  * something is added to the table.
    1064              :                  */
    1065     27528767 :                 return ELEMENTKEY(currBucket);
    1066              :             }
    1067         2009 :             return NULL;
    1068              : 
    1069     52733278 :         case HASH_ENTER:
    1070              :         case HASH_ENTER_NULL:
    1071              :             /* Return existing element if found, else create one */
    1072     52733278 :             if (currBucket != NULL)
    1073     19507993 :                 return ELEMENTKEY(currBucket);
    1074              : 
    1075              :             /* disallow inserts if frozen */
    1076     33225285 :             if (hashp->frozen)
    1077            0 :                 elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
    1078              :                      hashp->tabname);
    1079              : 
    1080     33225285 :             currBucket = get_hash_entry(hashp, freelist_idx);
    1081     33225285 :             if (currBucket == NULL)
    1082              :             {
    1083              :                 /* out of memory */
    1084            0 :                 if (action == HASH_ENTER_NULL)
    1085            0 :                     return NULL;
    1086              :                 /* report a generic message */
    1087            0 :                 if (hashp->isshared)
    1088            0 :                     ereport(ERROR,
    1089              :                             (errcode(ERRCODE_OUT_OF_MEMORY),
    1090              :                              errmsg("out of shared memory")));
    1091              :                 else
    1092            0 :                     ereport(ERROR,
    1093              :                             (errcode(ERRCODE_OUT_OF_MEMORY),
    1094              :                              errmsg("out of memory")));
    1095              :             }
    1096              : 
    1097              :             /* link into hashbucket chain */
    1098     33225285 :             *prevBucketPtr = currBucket;
    1099     33225285 :             currBucket->link = NULL;
    1100              : 
    1101              :             /* copy key into record */
    1102     33225285 :             currBucket->hashvalue = hashvalue;
    1103     33225285 :             hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);
    1104              : 
    1105              :             /*
    1106              :              * Caller is expected to fill the data field on return.  DO NOT
    1107              :              * insert any code that could possibly throw error here, as doing
    1108              :              * so would leave the table entry incomplete and hence corrupt the
    1109              :              * caller's data structure.
    1110              :              */
    1111              : 
    1112     33225285 :             return ELEMENTKEY(currBucket);
    1113              :     }
    1114              : 
    1115            0 :     elog(ERROR, "unrecognized hash action code: %d", (int) action);
    1116              : 
    1117              :     return NULL;                /* keep compiler quiet */
    1118              : }
    1119              : 
    1120              : /*
    1121              :  * hash_update_hash_key -- change the hash key of an existing table entry
    1122              :  *
    1123              :  * This is equivalent to removing the entry, making a new entry, and copying
    1124              :  * over its data, except that the entry never goes to the table's freelist.
    1125              :  * Therefore this cannot suffer an out-of-memory failure, even if there are
    1126              :  * other processes operating in other partitions of the hashtable.
    1127              :  *
    1128              :  * Returns true if successful, false if the requested new hash key is already
    1129              :  * present.  Throws error if the specified entry pointer isn't actually a
    1130              :  * table member.
    1131              :  *
     1132              :  * NB: currently, there is no special case for the old and new hash keys
     1133              :  * being identical: we'll report false for that situation, which is the
     1134              :  * behavior existing callers prefer.
    1135              :  *
    1136              :  * NB: for a partitioned hashtable, caller must hold lock on both relevant
    1137              :  * partitions, if the new hash key would belong to a different partition.
    1138              :  */
    1139              : bool
    1140          751 : hash_update_hash_key(HTAB *hashp,
    1141              :                      void *existingEntry,
    1142              :                      const void *newKeyPtr)
    1143              : {
    1144          751 :     HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
    1145              :     uint32      newhashvalue;
    1146              :     Size        keysize;
    1147              :     uint32      bucket;
    1148              :     uint32      newbucket;
    1149              :     HASHBUCKET  currBucket;
    1150              :     HASHBUCKET *prevBucketPtr;
    1151              :     HASHBUCKET *oldPrevPtr;
    1152              :     HashCompareFunc match;
    1153              : 
    1154              : #ifdef HASH_STATISTICS
    1155              :     HASHHDR    *hctl = hashp->hctl;
    1156              : 
    1157              :     hctl->accesses++;
    1158              : #endif
    1159              : 
    1160              :     /* disallow updates if frozen */
    1161          751 :     if (hashp->frozen)
    1162            0 :         elog(ERROR, "cannot update in frozen hashtable \"%s\"",
    1163              :              hashp->tabname);
    1164              : 
    1165              :     /*
    1166              :      * Lookup the existing element using its saved hash value.  We need to do
    1167              :      * this to be able to unlink it from its hash chain, but as a side benefit
    1168              :      * we can verify the validity of the passed existingEntry pointer.
    1169              :      */
    1170          751 :     bucket = hash_initial_lookup(hashp, existingElement->hashvalue,
    1171              :                                  &prevBucketPtr);
    1172          751 :     currBucket = *prevBucketPtr;
    1173              : 
    1174          751 :     while (currBucket != NULL)
    1175              :     {
    1176          751 :         if (currBucket == existingElement)
    1177          751 :             break;
    1178            0 :         prevBucketPtr = &(currBucket->link);
    1179            0 :         currBucket = *prevBucketPtr;
    1180              :     }
    1181              : 
    1182          751 :     if (currBucket == NULL)
    1183            0 :         elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
    1184              :              hashp->tabname);
    1185              : 
    1186          751 :     oldPrevPtr = prevBucketPtr;
    1187              : 
    1188              :     /*
    1189              :      * Now perform the equivalent of a HASH_ENTER operation to locate the hash
    1190              :      * chain we want to put the entry into.
    1191              :      */
    1192          751 :     newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);
    1193          751 :     newbucket = hash_initial_lookup(hashp, newhashvalue, &prevBucketPtr);
    1194          751 :     currBucket = *prevBucketPtr;
    1195              : 
    1196              :     /*
    1197              :      * Follow collision chain looking for matching key
    1198              :      */
    1199          751 :     match = hashp->match;        /* save one fetch in inner loop */
    1200          751 :     keysize = hashp->keysize;    /* ditto */
    1201              : 
    1202          754 :     while (currBucket != NULL)
    1203              :     {
    1204            3 :         if (currBucket->hashvalue == newhashvalue &&
    1205            0 :             match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
    1206            0 :             break;
    1207            3 :         prevBucketPtr = &(currBucket->link);
    1208            3 :         currBucket = *prevBucketPtr;
    1209              : #ifdef HASH_STATISTICS
    1210              :         hctl->collisions++;
    1211              : #endif
    1212              :     }
    1213              : 
    1214          751 :     if (currBucket != NULL)
    1215            0 :         return false;           /* collision with an existing entry */
    1216              : 
    1217          751 :     currBucket = existingElement;
    1218              : 
    1219              :     /*
    1220              :      * If old and new hash values belong to the same bucket, we need not
    1221              :      * change any chain links, and indeed should not since this simplistic
    1222              :      * update will corrupt the list if currBucket is the last element.  (We
    1223              :      * cannot fall out earlier, however, since we need to scan the bucket to
    1224              :      * check for duplicate keys.)
    1225              :      */
    1226          751 :     if (bucket != newbucket)
    1227              :     {
    1228              :         /* OK to remove record from old hash bucket's chain. */
    1229          748 :         *oldPrevPtr = currBucket->link;
    1230              : 
    1231              :         /* link into new hashbucket chain */
    1232          748 :         *prevBucketPtr = currBucket;
    1233          748 :         currBucket->link = NULL;
    1234              :     }
    1235              : 
    1236              :     /* copy new key into record */
    1237          751 :     currBucket->hashvalue = newhashvalue;
    1238          751 :     hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);
    1239              : 
    1240              :     /* rest of record is untouched */
    1241              : 
    1242          751 :     return true;
    1243              : }
    1244              : 
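/*
 * [Editor's sketch -- not part of dynahash.c.]  Rekeying in place keeps the
 * entry's payload and generates no freelist traffic; example_rekey and
 * ExampleEntry are hypothetical.  For a partitioned table the caller would
 * also need locks on both the old and new partitions, per the comment above.
 */
static void
example_rekey(HTAB *htab, ExampleEntry *entry, Oid newkey)
{
    if (!hash_update_hash_key(htab, entry, &newkey))
        elog(ERROR, "key %u already present in hashtable", newkey);
}
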
    1245              : /*
    1246              :  * Allocate a new hashtable entry if possible; return NULL if out of memory.
    1247              :  * (Or, if the underlying space allocator throws error for out-of-memory,
    1248              :  * we won't return at all.)
    1249              :  */
    1250              : static HASHBUCKET
    1251     33225285 : get_hash_entry(HTAB *hashp, int freelist_idx)
    1252              : {
    1253     33225285 :     HASHHDR    *hctl = hashp->hctl;
    1254              :     HASHBUCKET  newElement;
    1255              : 
    1256              :     for (;;)
    1257              :     {
    1258              :         /* if partitioned, must lock to touch nentries and freeList */
    1259     33570365 :         if (IS_PARTITIONED(hctl))
    1260      6546875 :             SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
    1261              : 
    1262              :         /* try to get an entry from the freelist */
    1263     33570365 :         newElement = hctl->freeList[freelist_idx].freeList;
    1264              : 
    1265     33570365 :         if (newElement != NULL)
    1266     33193168 :             break;
    1267              : 
    1268       377197 :         if (IS_PARTITIONED(hctl))
    1269        32117 :             SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
    1270              : 
    1271              :         /*
    1272              :          * No free elements in this freelist.  In a partitioned table, there
    1273              :          * might be entries in other freelists, but to reduce contention we
     1274              :          * prefer to first try to get another chunk of elements from the main
    1275              :          * shmem allocator.  If that fails, though, we *MUST* root through all
    1276              :          * the other freelists before giving up.  There are multiple callers
    1277              :          * that assume that they can allocate every element in the initially
    1278              :          * requested table size, or that deleting an element guarantees they
    1279              :          * can insert a new element, even if shared memory is entirely full.
    1280              :          * Failing because the needed element is in a different freelist is
    1281              :          * not acceptable.
    1282              :          */
    1283       377197 :         if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
    1284              :         {
    1285              :             int         borrow_from_idx;
    1286              : 
    1287        32117 :             if (!IS_PARTITIONED(hctl))
    1288            0 :                 return NULL;    /* out of memory */
    1289              : 
    1290              :             /* try to borrow element from another freelist */
    1291        32117 :             borrow_from_idx = freelist_idx;
    1292              :             for (;;)
    1293              :             {
    1294        35651 :                 borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
    1295        35651 :                 if (borrow_from_idx == freelist_idx)
    1296            0 :                     break;      /* examined all freelists, fail */
    1297              : 
    1298        35651 :                 SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
    1299        35651 :                 newElement = hctl->freeList[borrow_from_idx].freeList;
    1300              : 
    1301        35651 :                 if (newElement != NULL)
    1302              :                 {
    1303        32117 :                     hctl->freeList[borrow_from_idx].freeList = newElement->link;
    1304        32117 :                     SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
    1305              : 
    1306              :                     /* careful: count the new element in its proper freelist */
    1307        32117 :                     SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
    1308        32117 :                     hctl->freeList[freelist_idx].nentries++;
    1309        32117 :                     SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
    1310              : 
    1311        32117 :                     return newElement;
    1312              :                 }
    1313              : 
    1314         3534 :                 SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
    1315              :             }
    1316              : 
    1317              :             /* no elements available to borrow either, so out of memory */
    1318            0 :             return NULL;
    1319              :         }
    1320              :     }
    1321              : 
    1322              :     /* remove entry from freelist, bump nentries */
    1323     33193168 :     hctl->freeList[freelist_idx].freeList = newElement->link;
    1324     33193168 :     hctl->freeList[freelist_idx].nentries++;
    1325              : 
    1326     33193168 :     if (IS_PARTITIONED(hctl))
    1327      6514758 :         SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
    1328              : 
    1329     33193168 :     return newElement;
    1330              : }
    1331              : 
    1332              : /*
    1333              :  * hash_get_num_entries -- get the number of entries in a hashtable
    1334              :  */
    1335              : int64
    1336        64075 : hash_get_num_entries(HTAB *hashp)
    1337              : {
    1338              :     int         i;
    1339        64075 :     int64       sum = hashp->hctl->freeList[0].nentries;
    1340              : 
    1341              :     /*
     1342              :      * We currently don't bother with acquiring the mutexes; it's only
     1343              :      * sensible to call this function if you hold locks on all partitions
     1344              :      * of the table.
    1345              :      */
    1346        64075 :     if (IS_PARTITIONED(hashp->hctl))
    1347              :     {
    1348        60608 :         for (i = 1; i < NUM_FREELISTS; i++)
    1349        58714 :             sum += hashp->hctl->freeList[i].nentries;
    1350              :     }
    1351              : 
    1352        64075 :     return sum;
    1353              : }
    1354              : 
    1355              : /*
    1356              :  * hash_seq_init/_search/_term
     1357              :  *          Sequentially search through the hash table and return all the
     1358              :  *          elements one by one; return NULL when there are no more.
    1359              :  *
    1360              :  * hash_seq_term should be called if and only if the scan is abandoned before
    1361              :  * completion; if hash_seq_search returns NULL then it has already done the
    1362              :  * end-of-scan cleanup.
    1363              :  *
    1364              :  * NOTE: caller may delete the returned element before continuing the scan.
    1365              :  * However, deleting any other element while the scan is in progress is
    1366              :  * UNDEFINED (it might be the one that curIndex is pointing at!).  Also,
    1367              :  * if elements are added to the table while the scan is in progress, it is
    1368              :  * unspecified whether they will be visited by the scan or not.
    1369              :  *
    1370              :  * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
    1371              :  * worry about hash_seq_term cleanup, if the hashtable is first locked against
    1372              :  * further insertions by calling hash_freeze.
    1373              :  *
    1374              :  * NOTE: to use this with a partitioned hashtable, caller had better hold
    1375              :  * at least shared lock on all partitions of the table throughout the scan!
    1376              :  * We can cope with insertions or deletions by our own backend, but *not*
    1377              :  * with concurrent insertions or deletions by another.
    1378              :  */
    1379              : void
    1380      2789610 : hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
    1381              : {
    1382      2789610 :     status->hashp = hashp;
    1383      2789610 :     status->curBucket = 0;
    1384      2789610 :     status->curEntry = NULL;
    1385      2789610 :     status->hasHashvalue = false;
    1386      2789610 :     if (!hashp->frozen)
    1387      2789610 :         register_seq_scan(hashp);
    1388      2789610 : }
    1389              : 
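/*
 * [Editor's sketch -- not part of dynahash.c.]  The standard scan protocol
 * described above: a scan driven to completion cleans up after itself, so
 * hash_seq_term is needed only on the early-exit path.  ExampleEntry and
 * the stop condition are hypothetical.
 */
static void
example_scan(HTAB *htab, Oid stop_key)
{
    HASH_SEQ_STATUS status;
    ExampleEntry *entry;

    hash_seq_init(&status, htab);
    while ((entry = (ExampleEntry *) hash_seq_search(&status)) != NULL)
    {
        if (entry->key == stop_key)
        {
            hash_seq_term(&status); /* abandoning the scan early */
            break;
        }
        /* deleting the *returned* entry here would also be legal */
    }
}
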
    1390              : /*
     1391              :  * Same as above, but restrict the scan to entries with the given hash value.
    1392              :  * See also hash_seq_search().
    1393              :  *
     1394              :  * NOTE: the default hash function doesn't match the syscache hash function.
     1395              :  * Thus, if you're going to use this function in a syscache callback, make
     1396              :  * sure you're using a custom hash function.  See relatt_cache_syshash() for
     1397              :  * an example.
    1398              :  */
    1399              : void
    1400       903897 : hash_seq_init_with_hash_value(HASH_SEQ_STATUS *status, HTAB *hashp,
    1401              :                               uint32 hashvalue)
    1402              : {
    1403              :     HASHBUCKET *bucketPtr;
    1404              : 
    1405       903897 :     hash_seq_init(status, hashp);
    1406              : 
    1407       903897 :     status->hasHashvalue = true;
    1408       903897 :     status->hashvalue = hashvalue;
    1409              : 
    1410       903897 :     status->curBucket = hash_initial_lookup(hashp, hashvalue, &bucketPtr);
    1411       903897 :     status->curEntry = *bucketPtr;
    1412       903897 : }
    1413              : 
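/*
 * [Editor's sketch -- not part of dynahash.c.]  A scan restricted to one
 * hash value visits only the matching bucket, and hash_seq_search filters
 * by the exact hash value; the loop body must still compare full keys if
 * distinct keys can share a hash value.  ExampleEntry is hypothetical.
 */
static void
example_scan_one_hash(HTAB *htab, uint32 hashvalue)
{
    HASH_SEQ_STATUS status;
    ExampleEntry *entry;

    hash_seq_init_with_hash_value(&status, htab, hashvalue);
    while ((entry = (ExampleEntry *) hash_seq_search(&status)) != NULL)
    {
        /* process entry; its hashvalue is guaranteed to match */
    }
}
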
    1414              : void *
    1415     29508165 : hash_seq_search(HASH_SEQ_STATUS *status)
    1416              : {
    1417              :     HTAB       *hashp;
    1418              :     HASHHDR    *hctl;
    1419              :     uint32      max_bucket;
    1420              :     int64       ssize;
    1421              :     int64       segment_num;
    1422              :     int64       segment_ndx;
    1423              :     HASHSEGMENT segp;
    1424              :     uint32      curBucket;
    1425              :     HASHELEMENT *curElem;
    1426              : 
    1427     29508165 :     if (status->hasHashvalue)
    1428              :     {
    1429              :         /*
    1430              :          * Scan entries only in the current bucket because only this bucket
    1431              :          * can contain entries with the given hash value.
    1432              :          */
    1433      1027695 :         while ((curElem = status->curEntry) != NULL)
    1434              :         {
    1435       123798 :             status->curEntry = curElem->link;
    1436       123798 :             if (status->hashvalue != curElem->hashvalue)
    1437       118957 :                 continue;
    1438         4841 :             return (void *) ELEMENTKEY(curElem);
    1439              :         }
    1440              : 
    1441       903897 :         hash_seq_term(status);
    1442       903897 :         return NULL;
    1443              :     }
    1444              : 
    1445     28599427 :     if ((curElem = status->curEntry) != NULL)
    1446              :     {
    1447              :         /* Continuing scan of curBucket... */
    1448      7116210 :         status->curEntry = curElem->link;
    1449      7116210 :         if (status->curEntry == NULL)    /* end of this bucket */
    1450      5160851 :             ++status->curBucket;
    1451      7116210 :         return ELEMENTKEY(curElem);
    1452              :     }
    1453              : 
    1454              :     /*
    1455              :      * Search for next nonempty bucket starting at curBucket.
    1456              :      */
    1457     21483217 :     curBucket = status->curBucket;
    1458     21483217 :     hashp = status->hashp;
    1459     21483217 :     hctl = hashp->hctl;
    1460     21483217 :     ssize = hashp->ssize;
    1461     21483217 :     max_bucket = hctl->max_bucket;
    1462              : 
    1463     21483217 :     if (curBucket > max_bucket)
    1464              :     {
    1465        61144 :         hash_seq_term(status);
    1466        61144 :         return NULL;            /* search is done */
    1467              :     }
    1468              : 
    1469              :     /*
    1470              :      * first find the right segment in the table directory.
    1471              :      */
    1472     21422073 :     segment_num = curBucket >> hashp->sshift;
    1473     21422073 :     segment_ndx = MOD(curBucket, ssize);
    1474              : 
    1475     21422073 :     segp = hashp->dir[segment_num];
    1476              : 
    1477              :     /*
     1478              :      * Pick up the first item in this bucket's chain.  If the chain is not
     1479              :      * empty we can begin searching it.  Otherwise we have to advance to
     1480              :      * find the next nonempty bucket.  We try to optimize that case since
     1481              :      * searching a near-empty hashtable has to iterate this loop a lot.
    1482              :      */
    1483    115071619 :     while ((curElem = segp[segment_ndx]) == NULL)
    1484              :     {
    1485              :         /* empty bucket, advance to next */
    1486     95453827 :         if (++curBucket > max_bucket)
    1487              :         {
    1488      1804281 :             status->curBucket = curBucket;
    1489      1804281 :             hash_seq_term(status);
    1490      1804281 :             return NULL;        /* search is done */
    1491              :         }
    1492     93649546 :         if (++segment_ndx >= ssize)
    1493              :         {
    1494       187652 :             segment_num++;
    1495       187652 :             segment_ndx = 0;
    1496       187652 :             segp = hashp->dir[segment_num];
    1497              :         }
    1498              :     }
    1499              : 
    1500              :     /* Begin scan of curBucket... */
    1501     19617792 :     status->curEntry = curElem->link;
    1502     19617792 :     if (status->curEntry == NULL)    /* end of this bucket */
    1503     14456740 :         ++curBucket;
    1504     19617792 :     status->curBucket = curBucket;
    1505     19617792 :     return ELEMENTKEY(curElem);
    1506              : }
    1507              : 
    1508              : void
    1509      2789600 : hash_seq_term(HASH_SEQ_STATUS *status)
    1510              : {
    1511      2789600 :     if (!status->hashp->frozen)
    1512      2789600 :         deregister_seq_scan(status->hashp);
    1513      2789600 : }
    1514              : 
    1515              : /*
    1516              :  * hash_freeze
    1517              :  *          Freeze a hashtable against future insertions (deletions are
    1518              :  *          still allowed)
    1519              :  *
    1520              :  * The reason for doing this is that by preventing any more bucket splits,
    1521              :  * we no longer need to worry about registering hash_seq_search scans,
    1522              :  * and thus caller need not be careful about ensuring hash_seq_term gets
    1523              :  * called at the right times.
    1524              :  *
    1525              :  * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
    1526              :  * with active scans (since hash_seq_term would then do the wrong thing).
    1527              :  */
    1528              : void
    1529            0 : hash_freeze(HTAB *hashp)
    1530              : {
    1531            0 :     if (hashp->isshared)
    1532            0 :         elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
    1533            0 :     if (!hashp->frozen && has_seq_scans(hashp))
    1534            0 :         elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
    1535              :              hashp->tabname);
    1536            0 :     hashp->frozen = true;
    1537            0 : }
    1538              : 
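/*
 * [Editor's sketch -- not part of dynahash.c.]  Once frozen, scans need no
 * hash_seq_term cleanup, so returning straight out of the loop is safe.
 * ExampleEntry is hypothetical.
 */
static ExampleEntry *
example_find_after_freeze(HTAB *htab, Oid target)
{
    HASH_SEQ_STATUS status;
    ExampleEntry *entry;

    hash_freeze(htab);          /* no more insertions, hence no splits */
    hash_seq_init(&status, htab);
    while ((entry = (ExampleEntry *) hash_seq_search(&status)) != NULL)
    {
        if (entry->key == target)
            return entry;       /* no hash_seq_term needed */
    }
    return NULL;
}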
    1539              : 
    1540              : /********************************* UTILITIES ************************/
    1541              : 
    1542              : /*
    1543              :  * Expand the table by adding one more hash bucket.
    1544              :  */
    1545              : static bool
    1546       375013 : expand_table(HTAB *hashp)
    1547              : {
    1548       375013 :     HASHHDR    *hctl = hashp->hctl;
    1549              :     HASHSEGMENT old_seg,
    1550              :                 new_seg;
    1551              :     int64       old_bucket,
    1552              :                 new_bucket;
    1553              :     int64       new_segnum,
    1554              :                 new_segndx;
    1555              :     int64       old_segnum,
    1556              :                 old_segndx;
    1557              :     HASHBUCKET *oldlink,
    1558              :                *newlink;
    1559              :     HASHBUCKET  currElement,
    1560              :                 nextElement;
    1561              : 
    1562              :     Assert(!IS_PARTITIONED(hctl));
    1563              : 
    1564              : #ifdef HASH_STATISTICS
    1565              :     hctl->expansions++;
    1566              : #endif
    1567              : 
    1568       375013 :     new_bucket = hctl->max_bucket + 1;
    1569       375013 :     new_segnum = new_bucket >> hashp->sshift;
    1570       375013 :     new_segndx = MOD(new_bucket, hashp->ssize);
    1571              : 
    1572       375013 :     if (new_segnum >= hctl->nsegs)
    1573              :     {
    1574              :         /* Allocate new segment if necessary -- could fail if dir full */
    1575         1209 :         if (new_segnum >= hctl->dsize)
    1576            0 :             if (!dir_realloc(hashp))
    1577            0 :                 return false;
    1578         1209 :         if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
    1579            0 :             return false;
    1580         1209 :         hctl->nsegs++;
    1581              :     }
    1582              : 
    1583              :     /* OK, we created a new bucket */
    1584       375013 :     hctl->max_bucket++;
    1585              : 
    1586              :     /*
     1587              :      * *Before* changing masks, find the old bucket corresponding to the same
     1588              :      * hash values; values in that bucket may need to be relocated to the new
     1589              :      * bucket.  Note that new_bucket is certainly larger than low_mask at this
     1590              :      * point, so we can skip the first step of the regular hash mask calc.
    1591              :      */
    1592       375013 :     old_bucket = (new_bucket & hctl->low_mask);
    1593              : 
    1594              :     /*
    1595              :      * If we crossed a power of 2, readjust masks.
    1596              :      */
    1597       375013 :     if ((uint32) new_bucket > hctl->high_mask)
    1598              :     {
    1599         2286 :         hctl->low_mask = hctl->high_mask;
    1600         2286 :         hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
    1601              :     }
    1602              : 
    1603              :     /*
    1604              :      * Relocate records to the new bucket.  NOTE: because of the way the hash
    1605              :      * masking is done in calc_bucket, only one old bucket can need to be
    1606              :      * split at this point.  With a different way of reducing the hash value,
    1607              :      * that might not be true!
    1608              :      */
    1609       375013 :     old_segnum = old_bucket >> hashp->sshift;
    1610       375013 :     old_segndx = MOD(old_bucket, hashp->ssize);
    1611              : 
    1612       375013 :     old_seg = hashp->dir[old_segnum];
    1613       375013 :     new_seg = hashp->dir[new_segnum];
    1614              : 
    1615       375013 :     oldlink = &old_seg[old_segndx];
    1616       375013 :     newlink = &new_seg[new_segndx];
    1617              : 
    1618       375013 :     for (currElement = *oldlink;
    1619       898787 :          currElement != NULL;
    1620       523774 :          currElement = nextElement)
    1621              :     {
    1622       523774 :         nextElement = currElement->link;
    1623       523774 :         if ((int64) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
    1624              :         {
    1625       257010 :             *oldlink = currElement;
    1626       257010 :             oldlink = &currElement->link;
    1627              :         }
    1628              :         else
    1629              :         {
    1630       266764 :             *newlink = currElement;
    1631       266764 :             newlink = &currElement->link;
    1632              :         }
    1633              :     }
    1634              :     /* don't forget to terminate the rebuilt hash chains... */
    1635       375013 :     *oldlink = NULL;
    1636       375013 :     *newlink = NULL;
    1637              : 
    1638       375013 :     return true;
    1639              : }
    1640              : 
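/*
 * [Editor's worked example -- not part of dynahash.c.]  Splitting to create
 * bucket 32 in a table whose buckets 0..31 exist (low_mask = 15,
 * high_mask = 31): old_bucket is computed with the pre-split low_mask, and
 * the masks advance because a power of 2 is crossed.
 */
static void
example_split_arithmetic(void)
{
    uint32      low_mask = 15;
    uint32      high_mask = 31;
    uint32      new_bucket = 32;
    uint32      old_bucket = new_bucket & low_mask;     /* 32 & 15 = 0 */

    if (new_bucket > high_mask) /* 32 > 31: crossed a power of 2 */
    {
        low_mask = high_mask;                   /* 15 -> 31 */
        high_mask = new_bucket | low_mask;      /* 32 | 31 = 63 */
    }

    /*
     * calc_bucket now maps each entry of old bucket 0 to hash & 63, which
     * here is either 0 or 32, so only that one chain needs rescanning.
     */
    (void) old_bucket;
}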
    1641              : 
    1642              : static bool
    1643            0 : dir_realloc(HTAB *hashp)
    1644              : {
    1645              :     HASHSEGMENT *p;
    1646              :     HASHSEGMENT *old_p;
    1647              :     int64       new_dsize;
    1648              :     int64       old_dirsize;
    1649              :     int64       new_dirsize;
    1650              : 
    1651            0 :     if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
    1652            0 :         return false;
    1653              : 
    1654              :     /* Reallocate directory */
    1655            0 :     new_dsize = hashp->hctl->dsize << 1;
    1656            0 :     old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
    1657            0 :     new_dirsize = new_dsize * sizeof(HASHSEGMENT);
    1658              : 
    1659            0 :     old_p = hashp->dir;
    1660            0 :     CurrentDynaHashCxt = hashp->hcxt;
    1661            0 :     p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);
    1662              : 
    1663            0 :     if (p != NULL)
    1664              :     {
    1665            0 :         memcpy(p, old_p, old_dirsize);
    1666            0 :         MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
    1667            0 :         hashp->dir = p;
    1668            0 :         hashp->hctl->dsize = new_dsize;
    1669              : 
    1670              :         /* XXX assume the allocator is palloc, so we know how to free */
    1671              :         Assert(hashp->alloc == DynaHashAlloc);
    1672            0 :         pfree(old_p);
    1673              : 
    1674            0 :         return true;
    1675              :     }
    1676              : 
    1677            0 :     return false;
    1678              : }
    1679              : 
    1680              : 
    1681              : static HASHSEGMENT
    1682       691609 : seg_alloc(HTAB *hashp)
    1683              : {
    1684              :     HASHSEGMENT segp;
    1685              : 
    1686       691609 :     CurrentDynaHashCxt = hashp->hcxt;
    1687       691609 :     segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
    1688              : 
    1689       691609 :     if (!segp)
    1690            0 :         return NULL;
    1691              : 
    1692       691609 :     MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
    1693              : 
    1694       691609 :     return segp;
    1695              : }
    1696              : 
    1697              : /*
    1698              :  * allocate some new elements and link them into the indicated free list
    1699              :  */
    1700              : static bool
    1701       681865 : element_alloc(HTAB *hashp, int nelem, int freelist_idx)
    1702              : {
    1703       681865 :     HASHHDR    *hctl = hashp->hctl;
    1704              :     Size        elementSize;
    1705              :     Size        requestSize;
    1706              :     char       *allocedBlock;
    1707              :     HASHELEMENT *firstElement;
    1708              :     HASHELEMENT *tmpElement;
    1709              :     HASHELEMENT *prevElement;
    1710              :     int         i;
    1711              : 
    1712       681865 :     if (hctl->isfixed)
    1713        32117 :         return false;
    1714              : 
    1715              :     /* Each element has a HASHELEMENT header plus user data. */
    1716       649748 :     elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
    1717              : 
    1718       649748 :     requestSize = nelem * elementSize;
    1719              : 
    1720              :     /* Add space for slist_node list link if we need one. */
    1721              : #ifdef USE_VALGRIND
    1722              :     if (!hashp->isshared)
    1723              :         requestSize += MAXALIGN(sizeof(slist_node));
    1724              : #endif
    1725              : 
    1726              :     /* Allocate the memory. */
    1727       649748 :     CurrentDynaHashCxt = hashp->hcxt;
    1728       649748 :     allocedBlock = hashp->alloc(requestSize);
    1729              : 
    1730       649748 :     if (!allocedBlock)
    1731            0 :         return false;
    1732              : 
    1733              :     /*
    1734              :      * If USE_VALGRIND, each allocated block of elements of a non-shared
    1735              :      * hashtable is chained into a list, so that Valgrind won't think it's
    1736              :      * been leaked.
    1737              :      */
    1738              : #ifdef USE_VALGRIND
    1739              :     if (hashp->isshared)
    1740              :         firstElement = (HASHELEMENT *) allocedBlock;
    1741              :     else
    1742              :     {
    1743              :         slist_push_head(&hashp->element_blocks, (slist_node *) allocedBlock);
    1744              :         firstElement = (HASHELEMENT *) (allocedBlock + MAXALIGN(sizeof(slist_node)));
    1745              :     }
    1746              : #else
    1747       649748 :     firstElement = (HASHELEMENT *) allocedBlock;
    1748              : #endif
    1749              : 
    1750              :     /* prepare to link all the new entries into the freelist */
    1751       649748 :     prevElement = NULL;
    1752       649748 :     tmpElement = firstElement;
    1753     57086499 :     for (i = 0; i < nelem; i++)
    1754              :     {
    1755     56436751 :         tmpElement->link = prevElement;
    1756     56436751 :         prevElement = tmpElement;
    1757     56436751 :         tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
    1758              :     }
    1759              : 
    1760              :     /* if partitioned, must lock to touch freeList */
    1761       649748 :     if (IS_PARTITIONED(hctl))
    1762       182400 :         SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
    1763              : 
    1764              :     /* freelist could be nonempty if two backends did this concurrently */
    1765       649748 :     firstElement->link = hctl->freeList[freelist_idx].freeList;
    1766       649748 :     hctl->freeList[freelist_idx].freeList = prevElement;
    1767              : 
    1768       649748 :     if (IS_PARTITIONED(hctl))
    1769       182400 :         SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
    1770              : 
    1771       649748 :     return true;
    1772              : }
    1773              : 
    1774              : /*
     1775              :  * Do the initial lookup of the bucket for the given hash value, returning
     1776              :  * the bucket number and storing a pointer to its chain in *bucketptr.
    1777              :  */
    1778              : static inline uint32
    1779    197521277 : hash_initial_lookup(HTAB *hashp, uint32 hashvalue, HASHBUCKET **bucketptr)
    1780              : {
    1781    197521277 :     HASHHDR    *hctl = hashp->hctl;
    1782              :     HASHSEGMENT segp;
    1783              :     int64       segment_num;
    1784              :     int64       segment_ndx;
    1785              :     uint32      bucket;
    1786              : 
    1787    197521277 :     bucket = calc_bucket(hctl, hashvalue);
    1788              : 
    1789    197521277 :     segment_num = bucket >> hashp->sshift;
    1790    197521277 :     segment_ndx = MOD(bucket, hashp->ssize);
    1791              : 
    1792    197521277 :     segp = hashp->dir[segment_num];
    1793              : 
    1794    197521277 :     if (segp == NULL)
    1795            0 :         hash_corrupted(hashp);
    1796              : 
    1797    197521277 :     *bucketptr = &segp[segment_ndx];
    1798    197521277 :     return bucket;
    1799              : }
    1800              : 
    1801              : /* complain when we have detected a corrupted hashtable */
    1802              : static void
    1803            0 : hash_corrupted(HTAB *hashp)
    1804              : {
    1805              :     /*
    1806              :      * If the corruption is in a shared hashtable, we'd better force a
    1807              :      * systemwide restart.  Otherwise, just shut down this one backend.
    1808              :      */
    1809            0 :     if (hashp->isshared)
    1810            0 :         elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
    1811              :     else
    1812            0 :         elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
    1813              : }
    1814              : 
    1815              : /* calculate ceil(log base 2) of num */
    1816              : static int
    1817       806392 : my_log2(int64 num)
    1818              : {
    1819              :     /*
    1820              :      * guard against too-large input, which would be invalid for
    1821              :      * pg_ceil_log2_*()
    1822              :      */
    1823       806392 :     if (num > PG_INT64_MAX / 2)
    1824            0 :         num = PG_INT64_MAX / 2;
    1825              : 
    1826       806392 :     return pg_ceil_log2_64(num);
    1827              : }
    1828              : 
     1829              : /* calculate first power of 2 >= num, bounded to what will fit in an int64 */
    1830              : static int64
    1831        58834 : next_pow2_int64(int64 num)
    1832              : {
    1833              :     /* my_log2's internal range check is sufficient */
     1834        58834 :     return ((int64) 1) << my_log2(num);
    1835              : }
    1836              : 
    1837              : /* calculate first power of 2 >= num, bounded to what will fit in an int */
    1838              : static int
    1839       747558 : next_pow2_int(int64 num)
    1840              : {
    1841       747558 :     if (num > INT_MAX / 2)
    1842            0 :         num = INT_MAX / 2;
    1843       747558 :     return 1 << my_log2(num);
    1844              : }
    1845              : 
    1846              : 
    1847              : /************************* SEQ SCAN TRACKING ************************/
    1848              : 
    1849              : /*
    1850              :  * We track active hash_seq_search scans here.  The need for this mechanism
    1851              :  * comes from the fact that a scan will get confused if a bucket split occurs
    1852              :  * while it's in progress: it might visit entries twice, or even miss some
    1853              :  * entirely (if it's partway through the same bucket that splits).  Hence
    1854              :  * we want to inhibit bucket splits if there are any active scans on the
    1855              :  * table being inserted into.  This is a fairly rare case in current usage,
    1856              :  * so just postponing the split until the next insertion seems sufficient.
    1857              :  *
    1858              :  * Given present usages of the function, only a few scans are likely to be
    1859              :  * open concurrently; so a finite-size stack of open scans seems sufficient,
    1860              :  * and we don't worry that linear search is too slow.  Note that we do
    1861              :  * allow multiple scans of the same hashtable to be open concurrently.
    1862              :  *
    1863              :  * This mechanism can support concurrent scan and insertion in a shared
    1864              :  * hashtable if it's the same backend doing both.  It would fail otherwise,
    1865              :  * but locking reasons seem to preclude any such scenario anyway, so we don't
    1866              :  * worry.
    1867              :  *
    1868              :  * This arrangement is reasonably robust if a transient hashtable is deleted
    1869              :  * without notifying us.  The absolute worst case is we might inhibit splits
    1870              :  * in another table created later at exactly the same address.  We will give
    1871              :  * a warning at transaction end for reference leaks, so any bugs leading to
    1872              :  * lack of notification should be easy to catch.
    1873              :  */
    1874              : 
    1875              : #define MAX_SEQ_SCANS 100
    1876              : 
    1877              : static HTAB *seq_scan_tables[MAX_SEQ_SCANS];    /* tables being scanned */
    1878              : static int  seq_scan_level[MAX_SEQ_SCANS];  /* subtransaction nest level */
    1879              : static int  num_seq_scans = 0;
    1880              : 
    1881              : 
    1882              : /* Register a table as having an active hash_seq_search scan */
    1883              : static void
    1884      2789610 : register_seq_scan(HTAB *hashp)
    1885              : {
    1886      2789610 :     if (num_seq_scans >= MAX_SEQ_SCANS)
    1887            0 :         elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
    1888              :              hashp->tabname);
    1889      2789610 :     seq_scan_tables[num_seq_scans] = hashp;
    1890      2789610 :     seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
    1891      2789610 :     num_seq_scans++;
    1892      2789610 : }
    1893              : 
    1894              : /* Deregister an active scan */
    1895              : static void
    1896      2789600 : deregister_seq_scan(HTAB *hashp)
    1897              : {
    1898              :     int         i;
    1899              : 
    1900              :     /* Search backward since it's most likely at the stack top */
    1901      2789600 :     for (i = num_seq_scans - 1; i >= 0; i--)
    1902              :     {
    1903      2789600 :         if (seq_scan_tables[i] == hashp)
    1904              :         {
    1905      2789600 :             seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
    1906      2789600 :             seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
    1907      2789600 :             num_seq_scans--;
    1908      2789600 :             return;
    1909              :         }
    1910              :     }
    1911            0 :     elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
    1912              :          hashp->tabname);
    1913              : }
    1914              : 
    1915              : /* Check if a table has any active scan */
    1916              : static bool
    1917       375013 : has_seq_scans(HTAB *hashp)
    1918              : {
    1919              :     int         i;
    1920              : 
    1921       375028 :     for (i = 0; i < num_seq_scans; i++)
    1922              :     {
    1923           15 :         if (seq_scan_tables[i] == hashp)
    1924            0 :             return true;
    1925              :     }
    1926       375013 :     return false;
    1927              : }
    1928              : 
    1929              : /* Clean up any open scans at end of transaction */
    1930              : void
    1931       553369 : AtEOXact_HashTables(bool isCommit)
    1932              : {
    1933              :     /*
    1934              :      * During abort cleanup, open scans are expected; just silently clean 'em
    1935              :      * out.  An open scan at commit means someone forgot a hash_seq_term()
    1936              :      * call, so complain.
    1937              :      *
    1938              :      * Note: it's tempting to try to print the tabname here, but refrain for
    1939              :      * fear of touching deallocated memory.  This isn't a user-facing message
    1940              :      * anyway, so it needn't be pretty.
    1941              :      */
    1942       553369 :     if (isCommit)
    1943              :     {
    1944              :         int         i;
    1945              : 
    1946       526873 :         for (i = 0; i < num_seq_scans; i++)
    1947              :         {
    1948            0 :             elog(WARNING, "leaked hash_seq_search scan for hash table %p",
    1949              :                  seq_scan_tables[i]);
    1950              :         }
    1951              :     }
    1952       553369 :     num_seq_scans = 0;
    1953       553369 : }
    1954              : 
    1955              : /* Clean up any open scans at end of subtransaction */
    1956              : void
    1957        10104 : AtEOSubXact_HashTables(bool isCommit, int nestDepth)
    1958              : {
    1959              :     int         i;
    1960              : 
    1961              :     /*
    1962              :      * Search backward to make cleanup easy.  Note we must check all entries,
     1963              :      * not only those at the end of the array, because the deletion technique
    1964              :      * doesn't keep them in order.
    1965              :      */
    1966        10104 :     for (i = num_seq_scans - 1; i >= 0; i--)
    1967              :     {
    1968            0 :         if (seq_scan_level[i] >= nestDepth)
    1969              :         {
    1970            0 :             if (isCommit)
    1971            0 :                 elog(WARNING, "leaked hash_seq_search scan for hash table %p",
    1972              :                      seq_scan_tables[i]);
    1973            0 :             seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
    1974            0 :             seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
    1975            0 :             num_seq_scans--;
    1976              :         }
    1977              :     }
    1978        10104 : }
        

Generated by: LCOV version 2.0-1