LCOV - code coverage report
Current view: top level - src/backend/utils/hash - dynahash.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 86.0 % 486 418
Test Date: 2026-04-07 14:16:30 Functions: 91.4 % 35 32
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * dynahash.c
       4              :  *    dynamic chained hash tables
       5              :  *
       6              :  * dynahash.c supports both local-to-a-backend hash tables and hash tables in
       7              :  * shared memory.  For shared hash tables, it is the caller's responsibility
       8              :  * to provide appropriate access interlocking.  The simplest convention is
       9              :  * that a single LWLock protects the whole hash table.  Searches (HASH_FIND or
      10              :  * hash_seq_search) need only shared lock, but any update requires exclusive
      11              :  * lock.  For heavily-used shared tables, the single-lock approach creates a
      12              :  * concurrency bottleneck, so we also support "partitioned" locking wherein
      13              :  * there are multiple LWLocks guarding distinct subsets of the table.  To use
      14              :  * a hash table in partitioned mode, the HASH_PARTITION flag must be given
      15              :  * to hash_create.  This prevents any attempt to split buckets on-the-fly.
      16              :  * Therefore, each hash bucket chain operates independently, and no fields
      17              :  * of the hash header change after init except nentries and freeList.
      18              :  * (A partitioned table uses multiple copies of those fields, guarded by
      19              :  * spinlocks, for additional concurrency.)
      20              :  * This lets any subset of the hash buckets be treated as a separately
      21              :  * lockable partition.  We expect callers to use the low-order bits of a
      22              :  * lookup key's hash value as a partition number --- this will work because
      23              :  * of the way calc_bucket() maps hash values to bucket numbers.
      24              :  *
      25              :  * The memory allocator function should match malloc's semantics of returning
      26              :  * NULL on failure.  (This is essential for hash tables in shared memory.
      27              :  * For hash tables in local memory, we used to use palloc() which will throw
      28              :  * error on failure; but we no longer do, so it's untested whether this
      29              :  * module will still cope with that behavior.)
      30              :  *
      31              :  * dynahash.c provides support for these types of lookup keys:
      32              :  *
      33              :  * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
      34              :  * compared as though by strcmp().  This is selected by specifying the
      35              :  * HASH_STRINGS flag to hash_create.
      36              :  *
      37              :  * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
      38              :  * (Caller must ensure there are no undefined padding bits in the keys!)
      39              :  * This is selected by specifying the HASH_BLOBS flag to hash_create.
      40              :  *
      41              :  * 3. More complex key behavior can be selected by specifying user-supplied
      42              :  * hashing, comparison, and/or key-copying functions.  At least a hashing
      43              :  * function must be supplied; comparison defaults to memcmp() and key copying
      44              :  * to memcpy() when a user-defined hashing function is selected.
      45              :  *
      46              :  * Compared to simplehash, dynahash has the following benefits:
      47              :  *
      48              :  * - It supports partitioning, which is useful for shared memory access using
      49              :  *   locks.
      50              :  * - Shared memory hashes are allocated in a fixed size area at startup and
      51              :  *   are discoverable by name from other processes.
      52              :  * - Because entries don't need to be moved in the case of hash conflicts,
      53              :  *   dynahash has better performance for large entries.
      54              :  * - Guarantees stable pointers to entries.
      55              :  *
      56              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      57              :  * Portions Copyright (c) 1994, Regents of the University of California
      58              :  *
      59              :  *
      60              :  * IDENTIFICATION
      61              :  *    src/backend/utils/hash/dynahash.c
      62              :  *
      63              :  *-------------------------------------------------------------------------
      64              :  */
      65              : 
      66              : /*
      67              :  * Original comments:
      68              :  *
      69              :  * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
      70              :  * Coded into C, with minor code improvements, and with hsearch(3) interface,
      71              :  * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
      72              :  * also, hcreate/hdestroy routines added to simulate hsearch(3).
      73              :  *
      74              :  * These routines simulate hsearch(3) and family, with the important
      75              :  * difference that the hash table is dynamic - can grow indefinitely
      76              :  * beyond its original size (as supplied to hcreate()).
      77              :  *
      78              :  * Performance appears to be comparable to that of hsearch(3).
      79              :  * The 'source-code' options referred to in hsearch(3)'s 'man' page
      80              :  * are not implemented; otherwise functionality is identical.
      81              :  *
      82              :  * Compilation controls:
      83              :  * HASH_STATISTICS causes some usage statistics to be maintained, which can be
      84              :  * logged by calling hash_stats().
      85              :  *
      86              :  * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
      87              :  * concatenation property, in probably unnecessary code 'optimization'.
      88              :  *
      89              :  * Modified margo@postgres.berkeley.edu February 1990
      90              :  *      added multiple table interface
      91              :  * Modified by sullivan@postgres.berkeley.edu April 1990
      92              :  *      changed ctl structure for shared memory
      93              :  */
      94              : 
      95              : #include "postgres.h"
      96              : 
      97              : #include <limits.h>
      98              : 
      99              : #include "access/xact.h"
     100              : #include "common/hashfn.h"
     101              : #include "lib/ilist.h"
     102              : #include "port/pg_bitutils.h"
     103              : #include "storage/shmem.h"
     104              : #include "storage/spin.h"
     105              : #include "utils/memutils.h"
     106              : 
     107              : 
     108              : /*
     109              :  * Constants
     110              :  *
     111              :  * A hash table has a top-level "directory", each of whose entries points to a
     112              :  * "segment" of HASH_SEGSIZE bucket headers.  The maximum number of hash
     113              :  * buckets is thus dsize * HASH_SEGSIZE (but dsize may be expansible).  Of
     114              :  * course, the number of records in the table can be larger, but we don't want
     115              :  * a whole lot of records per bucket or performance goes down.
     116              :  *
     117              :  * In a hash table allocated in shared memory, the directory cannot be
     118              :  * expanded because it must stay at a fixed address.  The directory size is
     119              :  * chosen at creation based on the initial number of elements, so even though
     120              :  * we support allocating more elements later, performance will suffer if the
     121              :  * table grows much beyond the initial size.  (Currently, shared memory hash
     122              :  * tables are only created by ShmemRequestHash()/ShmemInitHash() though, which
     123              :  * doesn't support growing at all.)
     124              :  */
#define HASH_SEGSIZE               256  /* bucket headers per directory segment */
#define HASH_SEGSIZE_SHIFT     8    /* must be log2(HASH_SEGSIZE) */
#define DEF_DIRSIZE            256  /* default number of directory entries */

/* Number of freelists to be used for a partitioned hash table. */
#define NUM_FREELISTS           32

/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;

/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;
     137              : 
     138              : /*
     139              :  * Per-freelist data.
     140              :  *
     141              :  * In a partitioned hash table, each freelist is associated with a specific
     142              :  * set of hashcodes, as determined by the FREELIST_IDX() macro below.
     143              :  * nentries tracks the number of live hashtable entries having those hashcodes
     144              :  * (NOT the number of entries in the freelist, as you might expect).
     145              :  *
     146              :  * The coverage of a freelist might be more or less than one partition, so it
     147              :  * needs its own lock rather than relying on caller locking.  Relying on that
     148              :  * wouldn't work even if the coverage was the same, because of the occasional
     149              :  * need to "borrow" entries from another freelist; see get_hash_entry().
     150              :  *
     151              :  * Using an array of FreeListData instead of separate arrays of mutexes,
     152              :  * nentries and freeLists helps to reduce sharing of cache lines between
     153              :  * different mutexes.
     154              :  */
     155              : typedef struct
     156              : {
     157              :     slock_t     mutex;          /* spinlock for this freelist */
     158              :     int64       nentries;       /* number of entries in associated buckets */
     159              :     HASHELEMENT *freeList;      /* chain of free elements */
     160              : } FreeListData;
     161              : 
     162              : /*
     163              :  * Header structure for a hash table --- contains all changeable info
     164              :  *
     165              :  * In a shared-memory hash table, the HASHHDR is in shared memory, while
     166              :  * each backend has a local HTAB struct.  For a non-shared table, there isn't
     167              :  * any functional difference between HASHHDR and HTAB, but we separate them
     168              :  * anyway to share code between shared and non-shared tables.
     169              :  */
     170              : struct HASHHDR
     171              : {
     172              :     /*
     173              :      * The freelist can become a point of contention in high-concurrency hash
     174              :      * tables, so we use an array of freelists, each with its own mutex and
     175              :      * nentries count, instead of just a single one.  Although the freelists
     176              :      * normally operate independently, we will scavenge entries from freelists
     177              :      * other than a hashcode's default freelist when necessary.
     178              :      *
     179              :      * If the hash table is not partitioned, only freeList[0] is used and its
     180              :      * spinlock is not used at all; callers' locking is assumed sufficient.
     181              :      */
     182              :     FreeListData freeList[NUM_FREELISTS];
     183              : 
     184              :     /* These fields can change, but not in a partitioned table */
     185              :     /* Also, dsize can't change in a shared table, even if unpartitioned */
     186              :     int64       dsize;          /* directory size */
     187              :     int64       nsegs;          /* number of allocated segments (<= dsize) */
     188              :     uint32      max_bucket;     /* ID of maximum bucket in use */
     189              :     uint32      high_mask;      /* mask to modulo into entire table */
     190              :     uint32      low_mask;       /* mask to modulo into lower half of table */
     191              : 
     192              :     /* These fields are fixed at hashtable creation */
     193              :     Size        keysize;        /* hash key length in bytes */
     194              :     Size        entrysize;      /* total user element size in bytes */
     195              :     int64       num_partitions; /* # partitions (must be power of 2), or 0 */
     196              :     int64       max_dsize;      /* 'dsize' limit if directory is fixed size */
     197              :     int         nelem_alloc;    /* number of entries to allocate at once */
     198              :     bool        isfixed;        /* if true, don't enlarge */
     199              : 
     200              :     /* Current directory.  In shared tables, this doesn't change */
     201              :     HASHSEGMENT *dir;
     202              : 
     203              : #ifdef HASH_STATISTICS
     204              : 
     205              :     /*
     206              :      * Count statistics here.  NB: stats code doesn't bother with mutex, so
     207              :      * counts could be corrupted a bit in a partitioned table.
     208              :      */
     209              :     uint64      accesses;
     210              :     uint64      collisions;
     211              :     uint64      expansions;
     212              : #endif
     213              : };
     214              : 
/* True if the table was created with HASH_PARTITION (num_partitions > 0) */
#define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)

/* Map a hashcode to its freelist; unpartitioned tables use freeList[0] only */
#define FREELIST_IDX(hctl, hashcode) \
    (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
     219              : 
     220              : /*
     221              :  * Top control structure for a hashtable --- in a shared table, each backend
     222              :  * has its own copy (OK since no fields change at runtime)
     223              :  */
     224              : struct HTAB
     225              : {
     226              :     HASHHDR    *hctl;           /* => shared control information */
     227              :     HASHSEGMENT *dir;           /* directory of segment starts */
     228              :     HashValueFunc hash;         /* hash function */
     229              :     HashCompareFunc match;      /* key comparison function */
     230              :     HashCopyFunc keycopy;       /* key copying function */
     231              :     HashAllocFunc alloc;        /* memory allocator */
     232              :     void       *alloc_arg;      /* opaque argument passed to allocator */
     233              :     MemoryContext hcxt;         /* memory context if default allocator used */
     234              :     char       *tabname;        /* table name (for error messages) */
     235              :     bool        isshared;       /* true if table is in shared memory */
     236              : 
     237              :     /* freezing a shared table isn't allowed, so we can keep state here */
     238              :     bool        frozen;         /* true = no more inserts allowed */
     239              : 
     240              :     /* We keep local copies of these fixed values to reduce contention */
     241              :     Size        keysize;        /* hash key length in bytes */
     242              : 
     243              :     /*
     244              :      * In a USE_VALGRIND build, non-shared hashtables keep an slist chain of
     245              :      * all the element blocks they have allocated.  This pacifies Valgrind,
     246              :      * which would otherwise often claim that the element blocks are "possibly
     247              :      * lost" for lack of any non-interior pointers to their starts.
     248              :      */
     249              : #ifdef USE_VALGRIND
     250              :     slist_head  element_blocks;
     251              : #endif
     252              : };
     253              : 
     254              : /*
     255              :  * Key (also entry) part of a HASHELEMENT
     256              :  */
     257              : #define ELEMENTKEY(helem)  (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
     258              : 
     259              : /*
     260              :  * Obtain element pointer given pointer to key
     261              :  */
     262              : #define ELEMENT_FROM_KEY(key)  \
     263              :     ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))
     264              : 
     265              : /*
     266              :  * Fast MOD arithmetic, assuming that y is a power of 2 !
     267              :  */
     268              : #define MOD(x,y)               ((x) & ((y)-1))
     269              : 
     270              : /*
     271              :  * Private function prototypes
     272              :  */
     273              : static void *DynaHashAlloc(Size size, void *alloc_arg);
     274              : static HASHSEGMENT seg_alloc(HTAB *hashp);
     275              : static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
     276              : static bool dir_realloc(HTAB *hashp);
     277              : static bool expand_table(HTAB *hashp);
     278              : static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
     279              : static void hdefault(HTAB *hashp);
     280              : static int  choose_nelem_alloc(Size entrysize);
     281              : static bool init_htab(HTAB *hashp, int64 nelem);
     282              : pg_noreturn static void hash_corrupted(HTAB *hashp);
     283              : static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue,
     284              :                                   HASHBUCKET **bucketptr);
     285              : static int  my_log2(int64 num);
     286              : static int64 next_pow2_int64(int64 num);
     287              : static int  next_pow2_int(int64 num);
     288              : static void register_seq_scan(HTAB *hashp);
     289              : static void deregister_seq_scan(HTAB *hashp);
     290              : static bool has_seq_scans(HTAB *hashp);
     291              : 
     292              : 
     293              : /*
     294              :  * memory allocation support
     295              :  */
     296              : static void *
     297      1865966 : DynaHashAlloc(Size size, void *alloc_arg)
     298              : {
     299      1865966 :     MemoryContext hcxt = (MemoryContext) alloc_arg;
     300              : 
     301              :     Assert(MemoryContextIsValid(hcxt));
     302      1865966 :     return MemoryContextAllocExtended(hcxt, size, MCXT_ALLOC_NO_OOM);
     303              : }
     304              : 
     305              : 
     306              : /*
     307              :  * HashCompareFunc for string keys
     308              :  *
     309              :  * Because we copy keys with strlcpy(), they will be truncated at keysize-1
     310              :  * bytes, so we can only compare that many ... hence strncmp is almost but
     311              :  * not quite the right thing.
     312              :  */
     313              : static int
     314       611013 : string_compare(const char *key1, const char *key2, Size keysize)
     315              : {
     316       611013 :     return strncmp(key1, key2, keysize - 1);
     317              : }
     318              : 
     319              : 
     320              : /************************** CREATE ROUTINES **********************/
     321              : 
     322              : /*
     323              :  * hash_create -- create a new dynamic hash table
     324              :  *
     325              :  *  tabname: a name for the table (for debugging purposes)
     326              :  *  nelem: maximum number of elements expected
     327              :  *  *info: additional table parameters, as indicated by flags
     328              :  *  flags: bitmask indicating which parameters to take from *info
     329              :  *
     330              :  * The flags value *must* include HASH_ELEM.  (Formerly, this was nominally
     331              :  * optional, but the default keysize and entrysize values were useless.)
     332              :  * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS,
     333              :  * or HASH_FUNCTION, to define the key hashing semantics (C strings,
     334              :  * binary blobs, or custom, respectively).  Callers specifying a custom
     335              :  * hash function will likely also want to use HASH_COMPARE, and perhaps
     336              :  * also HASH_KEYCOPY, to control key comparison and copying.
     337              :  * Another often-used flag is HASH_CONTEXT, to allocate the hash table
     338              :  * under info->hcxt rather than under TopMemoryContext; the default
     339              :  * behavior is only suitable for session-lifespan hash tables.
     340              :  * Other flags bits are special-purpose and seldom used, except for those
     341              :  * associated with shared-memory hash tables, for which see
     342              :  * ShmemRequestHash().
     343              :  *
     344              :  * Fields in *info are read only when the associated flags bit is set.
     345              :  * It is not necessary to initialize other fields of *info.
     346              :  * Neither tabname nor *info need persist after the hash_create() call.
     347              :  *
     348              :  * Note: It is deprecated for callers of hash_create() to explicitly specify
     349              :  * string_hash, tag_hash, uint32_hash, or oid_hash.  Just set HASH_STRINGS or
     350              :  * HASH_BLOBS.  Use HASH_FUNCTION only when you want something other than
     351              :  * one of these.
     352              :  *
     353              :  * Note: for a shared-memory hashtable, nelem needs to be a pretty good
     354              :  * estimate, since we can't expand the table on the fly.  But an unshared
     355              :  * hashtable can be expanded on-the-fly, so it's better for nelem to be
     356              :  * on the small side and let the table grow if it's exceeded.  An overly
     357              :  * large nelem will penalize hash_seq_search speed without buying much.
     358              :  */
     359              : HTAB *
     360       429960 : hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
     361              : {
     362              :     HTAB       *hashp;
     363              :     HASHHDR    *hctl;
     364              :     MemoryContext hcxt;
     365              : 
     366              :     /*
     367              :      * Hash tables now allocate space for key and data, but you have to say
     368              :      * how much space to allocate.
     369              :      */
     370              :     Assert(flags & HASH_ELEM);
     371              :     Assert(info->keysize > 0);
     372              :     Assert(info->entrysize >= info->keysize);
     373              : 
     374              :     /*
     375              :      * For shared hash tables, we have a local hash header (HTAB struct) that
     376              :      * we allocate in TopMemoryContext; all else is in shared memory.
     377              :      *
     378              :      * For non-shared hash tables, everything including the hash header is in
     379              :      * a memory context created specially for the hash table --- this makes
     380              :      * hash_destroy very simple.  The memory context is made a child of either
     381              :      * a context specified by the caller, or TopMemoryContext if nothing is
     382              :      * specified.
     383              :      *
     384              :      * Note that HASH_ALLOC had better be set as well.
     385              :      */
     386       429960 :     if (flags & HASH_SHARED_MEM)
     387              :     {
     388              :         /* Set up to allocate the hash header */
     389        11090 :         hcxt = TopMemoryContext;
     390              :     }
     391              :     else
     392              :     {
     393              :         /* Create the hash table's private memory context */
     394       418870 :         if (flags & HASH_CONTEXT)
     395       271077 :             hcxt = info->hcxt;
     396              :         else
     397       147793 :             hcxt = TopMemoryContext;
     398       418870 :         hcxt = AllocSetContextCreate(hcxt, "dynahash",
     399              :                                      ALLOCSET_DEFAULT_SIZES);
     400              :     }
     401              : 
     402              :     /* Initialize the hash header, plus a copy of the table name */
     403       429960 :     hashp = (HTAB *) MemoryContextAlloc(hcxt,
     404       429960 :                                         sizeof(HTAB) + strlen(tabname) + 1);
     405      5159520 :     MemSet(hashp, 0, sizeof(HTAB));
     406              : 
     407       429960 :     hashp->tabname = (char *) (hashp + 1);
     408       429960 :     strcpy(hashp->tabname, tabname);
     409              : 
     410              :     /* If we have a private context, label it with hashtable's name */
     411       429960 :     if (!(flags & HASH_SHARED_MEM))
     412       418870 :         MemoryContextSetIdentifier(hcxt, hashp->tabname);
     413              : 
     414              :     /*
     415              :      * Select the appropriate hash function (see comments at head of file).
     416              :      */
     417       429960 :     if (flags & HASH_FUNCTION)
     418              :     {
     419              :         Assert(!(flags & (HASH_BLOBS | HASH_STRINGS)));
     420        15589 :         hashp->hash = info->hash;
     421              :     }
     422       414371 :     else if (flags & HASH_BLOBS)
     423              :     {
     424              :         Assert(!(flags & HASH_STRINGS));
     425              :         /* We can optimize hashing for common key sizes */
     426       350273 :         if (info->keysize == sizeof(uint32))
     427       246948 :             hashp->hash = uint32_hash;
     428              :         else
     429       103325 :             hashp->hash = tag_hash;
     430              :     }
     431              :     else
     432              :     {
     433              :         /*
     434              :          * string_hash used to be considered the default hash method, and in a
     435              :          * non-assert build it effectively still is.  But we now consider it
     436              :          * an assertion error to not say HASH_STRINGS explicitly.  To help
     437              :          * catch mistaken usage of HASH_STRINGS, we also insist on a
     438              :          * reasonably long string length: if the keysize is only 4 or 8 bytes,
     439              :          * it's almost certainly an integer or pointer not a string.
     440              :          */
     441              :         Assert(flags & HASH_STRINGS);
     442              :         Assert(info->keysize > 8);
     443              : 
     444        64098 :         hashp->hash = string_hash;
     445              :     }
     446              : 
     447              :     /*
     448              :      * If you don't specify a match function, it defaults to string_compare if
     449              :      * you used string_hash, and to memcmp otherwise.
     450              :      *
     451              :      * Note: explicitly specifying string_hash is deprecated, because this
     452              :      * might not work for callers in loadable modules on some platforms due to
     453              :      * referencing a trampoline instead of the string_hash function proper.
     454              :      * Specify HASH_STRINGS instead.
     455              :      */
     456       429960 :     if (flags & HASH_COMPARE)
     457         8370 :         hashp->match = info->match;
     458       421590 :     else if (hashp->hash == string_hash)
     459        64098 :         hashp->match = (HashCompareFunc) string_compare;
     460              :     else
     461       357492 :         hashp->match = memcmp;
     462              : 
     463              :     /*
     464              :      * Similarly, the key-copying function defaults to strlcpy or memcpy.
     465              :      */
     466       429960 :     if (flags & HASH_KEYCOPY)
     467            0 :         hashp->keycopy = info->keycopy;
     468       429960 :     else if (hashp->hash == string_hash)
     469              :     {
     470              :         /*
     471              :          * The signature of keycopy is meant for memcpy(), which returns
     472              :          * void*, but strlcpy() returns size_t.  Since we never use the return
     473              :          * value of keycopy, and size_t is pretty much always the same size as
     474              :          * void *, this should be safe.  The extra cast in the middle is to
     475              :          * avoid warnings from -Wcast-function-type.
     476              :          */
     477        64098 :         hashp->keycopy = (HashCopyFunc) (pg_funcptr_t) strlcpy;
     478              :     }
     479              :     else
     480       365862 :         hashp->keycopy = memcpy;
     481              : 
     482              :     /* And select the entry allocation function, too. */
     483       429960 :     if (flags & HASH_ALLOC)
     484              :     {
     485        11090 :         hashp->alloc = info->alloc;
     486        11090 :         hashp->alloc_arg = info->alloc_arg;
     487              :     }
     488              :     else
     489              :     {
     490       418870 :         hashp->alloc = DynaHashAlloc;
     491       418870 :         hashp->alloc_arg = hcxt;
     492              :     }
     493              : 
     494       429960 :     if (flags & HASH_SHARED_MEM)
     495              :     {
     496        11090 :         hashp->hcxt = NULL;
     497        11090 :         hashp->isshared = true;
     498              : 
     499              :         /* hash table already exists, we're just attaching to it */
     500        11090 :         if (flags & HASH_ATTACH)
     501              :         {
     502              :             /* Caller must pass the pointer to the shared header */
     503              :             Assert(info->hctl);
     504            0 :             hashp->hctl = info->hctl;
     505              : 
     506              :             /* make local copies of some heavily-used values */
     507            0 :             hashp->dir = info->hctl->dir;
     508            0 :             hashp->keysize = info->hctl->keysize;
     509              : 
     510            0 :             return hashp;
     511              :         }
     512              :     }
     513              :     else
     514              :     {
     515              :         /* setup hash table defaults */
     516       418870 :         hashp->hctl = NULL;
     517       418870 :         hashp->dir = NULL;
     518       418870 :         hashp->hcxt = hcxt;
     519       418870 :         hashp->isshared = false;
     520              :     }
     521              : 
     522              :     /*
     523              :      * Allocate the header structure.
     524              :      *
     525              :      * XXX: In case of a shared memory hash table, other processes need the
     526              :      * pointer to the header to re-find the hash table.  There is currently no
     527              :      * explicit way to pass it back from here, the caller relies on the fact
     528              :      * that this is the first allocation made with the alloc function.  That's
     529              :      * a little ugly, but works for now.
     530              :      */
     531       429960 :     hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR), hashp->alloc_arg);
     532       429960 :     if (!hashp->hctl)
     533            0 :         ereport(ERROR,
     534              :                 (errcode(ERRCODE_OUT_OF_MEMORY),
     535              :                  errmsg("out of memory")));
     536              : 
     537       429960 :     hashp->frozen = false;
     538              : 
     539       429960 :     hdefault(hashp);
     540              : 
     541       429960 :     hctl = hashp->hctl;
     542              : 
     543       429960 :     if (flags & HASH_PARTITION)
     544              :     {
     545              :         /* Doesn't make sense to partition a local hash table */
     546              :         Assert(flags & HASH_SHARED_MEM);
     547              : 
     548              :         /*
     549              :          * The number of partitions had better be a power of 2. Also, it must
     550              :          * be less than INT_MAX (see init_htab()), so call the int version of
     551              :          * next_pow2.
     552              :          */
     553              :         Assert(info->num_partitions == next_pow2_int(info->num_partitions));
     554              : 
     555         6155 :         hctl->num_partitions = info->num_partitions;
     556              :     }
     557              : 
     558              :     /* remember the entry sizes, too */
     559       429960 :     hctl->keysize = info->keysize;
     560       429960 :     hctl->entrysize = info->entrysize;
     561              : 
     562              :     /* make local copies of heavily-used constant fields */
     563       429960 :     hashp->keysize = hctl->keysize;
     564              : 
     565              :     /* Build the hash directory structure */
     566       429960 :     if (!init_htab(hashp, nelem))
     567            0 :         elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
     568              : 
     569              :     /*
     570              :      * For a shared hash table, preallocate the requested number of elements.
     571              :      * This reduces problems with run-time out-of-shared-memory conditions.
     572              :      *
     573              :      * For a non-shared hash table, preallocate the requested number of
     574              :      * elements if it's less than our chosen nelem_alloc.  This avoids wasting
     575              :      * space if the caller correctly estimates a small table size.
     576              :      */
     577       429960 :     if ((flags & HASH_SHARED_MEM) ||
     578       418870 :         nelem < hctl->nelem_alloc)
     579              :     {
     580              :         int         i,
     581              :                     freelist_partitions,
     582              :                     nelem_alloc,
     583              :                     nelem_alloc_first;
     584              : 
     585              :         /*
     586              :          * If hash table is partitioned, give each freelist an equal share of
     587              :          * the initial allocation.  Otherwise only freeList[0] is used.
     588              :          */
     589       154884 :         if (IS_PARTITIONED(hashp->hctl))
     590         6155 :             freelist_partitions = NUM_FREELISTS;
     591              :         else
     592       148729 :             freelist_partitions = 1;
     593              : 
     594       154884 :         nelem_alloc = nelem / freelist_partitions;
     595       154884 :         if (nelem_alloc <= 0)
     596            0 :             nelem_alloc = 1;
     597              : 
     598              :         /*
     599              :          * Make sure we'll allocate all the requested elements; freeList[0]
     600              :          * gets the excess if the request isn't divisible by NUM_FREELISTS.
     601              :          */
     602       154884 :         if (nelem_alloc * freelist_partitions < nelem)
     603           59 :             nelem_alloc_first =
     604           59 :                 nelem - nelem_alloc * (freelist_partitions - 1);
     605              :         else
     606       154825 :             nelem_alloc_first = nelem_alloc;
     607              : 
     608       500573 :         for (i = 0; i < freelist_partitions; i++)
     609              :         {
     610       345689 :             int         temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
     611              : 
     612       345689 :             if (!element_alloc(hashp, temp, i))
     613            0 :                 ereport(ERROR,
     614              :                         (errcode(ERRCODE_OUT_OF_MEMORY),
     615              :                          errmsg("out of memory")));
     616              :         }
     617              :     }
     618              : 
     619              :     /* Set isfixed if requested, but not till after we build initial entries */
     620       429960 :     if (flags & HASH_FIXED_SIZE)
     621        11090 :         hctl->isfixed = true;
     622              : 
     623       429960 :     return hashp;
     624              : }
     625              : 
     626              : /*
     627              :  * Set default HASHHDR parameters.
     628              :  */
     629              : static void
     630       429960 : hdefault(HTAB *hashp)
     631              : {
     632       429960 :     HASHHDR    *hctl = hashp->hctl;
     633              : 
     634     46005720 :     MemSet(hctl, 0, sizeof(HASHHDR));
     635              : 
     636       429960 :     hctl->num_partitions = 0;    /* not partitioned */
     637              : 
     638              :     /* table has no fixed maximum size */
     639       429960 :     hctl->max_dsize = NO_MAX_DSIZE;
     640              : 
     641       429960 :     hctl->isfixed = false;       /* can be enlarged */
     642              : 
     643              : #ifdef HASH_STATISTICS
     644              :     hctl->accesses = hctl->collisions = hctl->expansions = 0;
     645              : #endif
     646       429960 : }
     647              : 
     648              : /*
     649              :  * Given the user-specified entry size, choose nelem_alloc, ie, how many
     650              :  * elements to add to the hash table when we need more.
     651              :  */
     652              : static int
     653       443365 : choose_nelem_alloc(Size entrysize)
     654              : {
     655              :     int         nelem_alloc;
     656              :     Size        elementSize;
     657              :     Size        allocSize;
     658              : 
     659              :     /* Each element has a HASHELEMENT header plus user data. */
     660              :     /* NB: this had better match element_alloc() */
     661       443365 :     elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
     662              : 
     663              :     /*
     664              :      * The idea here is to choose nelem_alloc at least 32, but round up so
     665              :      * that the allocation request will be a power of 2 or just less. This
     666              :      * makes little difference for hash tables in shared memory, but for hash
     667              :      * tables managed by palloc, the allocation request will be rounded up to
     668              :      * a power of 2 anyway.  If we fail to take this into account, we'll waste
     669              :      * as much as half the allocated space.
     670              :      */
     671       443365 :     allocSize = 32 * 4;         /* assume elementSize at least 8 */
     672              :     do
     673              :     {
     674      1700029 :         allocSize <<= 1;
     675      1700029 :         nelem_alloc = allocSize / elementSize;
     676      1700029 :     } while (nelem_alloc < 32);
     677              : 
     678       443365 :     return nelem_alloc;
     679              : }
     680              : 
     681              : /*
     682              :  * Compute derived fields of hctl and build the initial directory/segment
     683              :  * arrays
     684              :  */
     685              : static bool
     686       429960 : init_htab(HTAB *hashp, int64 nelem)
     687              : {
     688       429960 :     HASHHDR    *hctl = hashp->hctl;
     689              :     HASHSEGMENT *segp;
     690              :     int         nbuckets;
     691              :     int         nsegs;
     692              :     int         i;
     693              : 
     694              :     /*
     695              :      * initialize mutexes if it's a partitioned table
     696              :      */
     697       429960 :     if (IS_PARTITIONED(hctl))
     698       203115 :         for (i = 0; i < NUM_FREELISTS; i++)
     699       196960 :             SpinLockInit(&(hctl->freeList[i].mutex));
     700              : 
     701              :     /*
     702              :      * Allocate space for the next greater power of two number of buckets,
     703              :      * assuming a desired maximum load factor of 1.
     704              :      */
     705       429960 :     nbuckets = next_pow2_int(nelem);
     706              : 
     707              :     /*
     708              :      * In a partitioned table, nbuckets must be at least equal to
     709              :      * num_partitions; were it less, keys with apparently different partition
     710              :      * numbers would map to the same bucket, breaking partition independence.
     711              :      * (Normally nbuckets will be much bigger; this is just a safety check.)
     712              :      */
     713       429960 :     while (nbuckets < hctl->num_partitions)
     714            0 :         nbuckets <<= 1;
     715              : 
     716       429960 :     hctl->max_bucket = hctl->low_mask = nbuckets - 1;
     717       429960 :     hctl->high_mask = (nbuckets << 1) - 1;
     718              : 
     719              :     /*
     720              :      * Figure number of directory segments needed, round up to a power of 2
     721              :      */
     722       429960 :     nsegs = (nbuckets - 1) / HASH_SEGSIZE + 1;
     723       429960 :     nsegs = next_pow2_int(nsegs);
     724              : 
     725              :     /*
     726              :      * Make sure directory is big enough.
     727              :      */
     728       429960 :     hctl->dsize = Max(DEF_DIRSIZE, nsegs);
     729              : 
     730              :     /* SHM hash tables have a fixed directory. */
     731       429960 :     if (hashp->isshared)
     732        11090 :         hctl->max_dsize = hctl->dsize;
     733              : 
     734              :     /* Allocate a directory */
     735       429960 :     hctl->dir = (HASHSEGMENT *)
     736       429960 :         hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT), hashp->alloc_arg);
     737       429960 :     if (!hctl->dir)
     738            0 :         return false;
     739       429960 :     hashp->dir = hctl->dir;
     740              : 
     741              :     /* Allocate initial segments */
     742      1402638 :     for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
     743              :     {
     744       972678 :         *segp = seg_alloc(hashp);
     745       972678 :         if (*segp == NULL)
     746            0 :             return false;
     747              :     }
     748              : 
     749              :     /* Choose number of entries to allocate at a time */
     750       429960 :     hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
     751              : 
     752       429960 :     return true;
     753              : }
     754              : 
     755              : /*
     756              :  * Estimate the space needed for a hashtable containing the given number
     757              :  * of entries of given size.
     758              :  * NOTE: this is used to estimate the footprint of hashtables in shared
     759              :  * memory; therefore it does not count HTAB which is in local memory.
     760              :  * NB: assumes that all hash structure parameters have default values!
     761              :  */
     762              : Size
     763        13405 : hash_estimate_size(int64 num_entries, Size entrysize)
     764              : {
     765              :     Size        size;
     766              :     int64       nBuckets,
     767              :                 nSegments,
     768              :                 nDirEntries,
     769              :                 nElementAllocs,
     770              :                 elementSize,
     771              :                 elementAllocCnt;
     772              : 
     773              :     /* estimate number of buckets wanted */
     774        13405 :     nBuckets = next_pow2_int64(num_entries);
     775              :     /* # of segments needed for nBuckets */
     776        13405 :     nSegments = next_pow2_int64((nBuckets - 1) / HASH_SEGSIZE + 1);
     777              :     /* directory entries */
     778        13405 :     nDirEntries = Max(DEF_DIRSIZE, nSegments);
     779              : 
     780              :     /* fixed control info */
     781        13405 :     size = MAXALIGN(sizeof(HASHHDR));   /* but not HTAB, per above */
     782              :     /* directory */
     783        13405 :     size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
     784              :     /* segments */
     785        13405 :     size = add_size(size, mul_size(nSegments,
     786              :                                    MAXALIGN(HASH_SEGSIZE * sizeof(HASHBUCKET))));
     787              :     /* elements --- allocated in groups of choose_nelem_alloc() entries */
     788        13405 :     elementAllocCnt = choose_nelem_alloc(entrysize);
     789        13405 :     nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
     790        13405 :     elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
     791        13405 :     size = add_size(size,
     792              :                     mul_size(nElementAllocs,
     793              :                              mul_size(elementAllocCnt, elementSize)));
     794              : 
     795        13405 :     return size;
     796              : }
     797              : 
     798              : 
     799              : /********************** DESTROY ROUTINES ************************/
     800              : 
     801              : void
     802        79886 : hash_destroy(HTAB *hashp)
     803              : {
     804        79886 :     if (hashp != NULL)
     805              :     {
     806              :         /* allocation method must be one we know how to free, too */
     807              :         Assert(hashp->alloc == DynaHashAlloc);
     808              :         /* so this hashtable must have its own context */
     809              :         Assert(hashp->hcxt != NULL);
     810              : 
     811        79886 :         hash_stats(__func__, hashp);
     812              : 
     813              :         /*
     814              :          * Free everything by destroying the hash table's memory context.
     815              :          */
     816        79886 :         MemoryContextDelete(hashp->hcxt);
     817              :     }
     818        79886 : }
     819              : 
/*
 * hash_stats -- report usage statistics for a hash table
 *
 * "caller" identifies the call site in the log message (may be NULL).
 * This is a no-op unless built with HASH_STATISTICS defined, in which
 * case the counters kept in the table's header are emitted at DEBUG4.
 */
void
hash_stats(const char *caller, HTAB *hashp)
{
#ifdef HASH_STATISTICS
    HASHHDR    *hctl = hashp->hctl;

    elog(DEBUG4,
         "hash_stats:  Caller: %s  Table Name: \"%s\"  Accesses: " UINT64_FORMAT "  Collisions: " UINT64_FORMAT "  Expansions: " UINT64_FORMAT "  Entries: " INT64_FORMAT "  Key Size: %zu  Max Bucket: %u  Segment Count: " INT64_FORMAT,
         caller != NULL ? caller : "(unknown)", hashp->tabname, hctl->accesses,
         hctl->collisions, hctl->expansions, hash_get_num_entries(hashp),
         hctl->keysize, hctl->max_bucket, hctl->nsegs);
#endif
}
     833              : 
     834              : /*******************************SEARCH ROUTINES *****************************/
     835              : 
     836              : 
     837              : /*
     838              :  * get_hash_value -- exported routine to calculate a key's hash value
     839              :  *
     840              :  * We export this because for partitioned tables, callers need to compute
     841              :  * the partition number (from the low-order bits of the hash value) before
     842              :  * searching.
     843              :  */
     844              : uint32
     845    110055978 : get_hash_value(HTAB *hashp, const void *keyPtr)
     846              : {
     847    110055978 :     return hashp->hash(keyPtr, hashp->keysize);
     848              : }
     849              : 
     850              : /* Convert a hash value to a bucket number */
     851              : static inline uint32
     852    245594187 : calc_bucket(HASHHDR *hctl, uint32 hash_val)
     853              : {
     854              :     uint32      bucket;
     855              : 
     856    245594187 :     bucket = hash_val & hctl->high_mask;
     857    245594187 :     if (bucket > hctl->max_bucket)
     858    119662860 :         bucket = bucket & hctl->low_mask;
     859              : 
     860    245594187 :     return bucket;
     861              : }
     862              : 
     863              : /*
     864              :  * hash_search -- look up key in table and perform action
     865              :  * hash_search_with_hash_value -- same, with key's hash value already computed
     866              :  *
     867              :  * action is one of:
     868              :  *      HASH_FIND: look up key in table
     869              :  *      HASH_ENTER: look up key in table, creating entry if not present
     870              :  *      HASH_ENTER_NULL: same, but return NULL if out of memory
     871              :  *      HASH_REMOVE: look up key in table, remove entry if present
     872              :  *
     873              :  * Return value is a pointer to the element found/entered/removed if any,
     874              :  * or NULL if no match was found.  (NB: in the case of the REMOVE action,
     875              :  * the result is a dangling pointer that shouldn't be dereferenced!)
     876              :  *
     877              :  * HASH_ENTER will normally ereport a generic "out of memory" error if
     878              :  * it is unable to create a new entry.  The HASH_ENTER_NULL operation is
     879              :  * the same except it will return NULL if out of memory.
     880              :  *
     881              :  * If foundPtr isn't NULL, then *foundPtr is set true if we found an
     882              :  * existing entry in the table, false otherwise.  This is needed in the
     883              :  * HASH_ENTER case, but is redundant with the return value otherwise.
     884              :  *
     885              :  * For hash_search_with_hash_value, the hashvalue parameter must have been
     886              :  * calculated with get_hash_value().
     887              :  */
     888              : void *
     889    145360302 : hash_search(HTAB *hashp,
     890              :             const void *keyPtr,
     891              :             HASHACTION action,
     892              :             bool *foundPtr)
     893              : {
     894    145360302 :     return hash_search_with_hash_value(hashp,
     895              :                                        keyPtr,
     896    145360302 :                                        hashp->hash(keyPtr, hashp->keysize),
     897              :                                        action,
     898              :                                        foundPtr);
     899              : }
     900              : 
void *
hash_search_with_hash_value(HTAB *hashp,
                            const void *keyPtr,
                            uint32 hashvalue,
                            HASHACTION action,
                            bool *foundPtr)
{
    HASHHDR    *hctl = hashp->hctl;
    /* freelist this hash value maps to (always 0 unless partitioned) */
    int         freelist_idx = FREELIST_IDX(hctl, hashvalue);
    Size        keysize;
    HASHBUCKET  currBucket;
    HASHBUCKET *prevBucketPtr;
    HashCompareFunc match;

#ifdef HASH_STATISTICS
    hctl->accesses++;
#endif

    /*
     * If inserting, check if it is time to split a bucket.
     *
     * NOTE: failure to expand table is not a fatal error, it just means we
     * have to run at higher fill factor than we wanted.  However, if we're
     * using the palloc allocator then it will throw error anyway on
     * out-of-memory, so we must do this before modifying the table.
     */
    if (action == HASH_ENTER || action == HASH_ENTER_NULL)
    {
        /*
         * Can't split if running in partitioned mode, nor if frozen, nor if
         * table is the subject of any active hash_seq_search scans.
         *
         * Checking freeList[0].nentries suffices because a non-partitioned
         * table keeps its whole entry count there (and a partitioned table
         * never splits anyway).
         */
        if (hctl->freeList[0].nentries > (int64) hctl->max_bucket &&
            !IS_PARTITIONED(hctl) && !hashp->frozen &&
            !has_seq_scans(hashp))
            (void) expand_table(hashp);
    }

    /*
     * Do the initial lookup: find the bucket chain for this hash value and
     * remember the address of the chain head so we can unlink/insert later.
     */
    (void) hash_initial_lookup(hashp, hashvalue, &prevBucketPtr);
    currBucket = *prevBucketPtr;

    /*
     * Follow collision chain looking for matching key.  Compare the stored
     * hash value first, as that's much cheaper than the full key compare.
     */
    match = hashp->match;        /* save one fetch in inner loop */
    keysize = hashp->keysize;    /* ditto */

    while (currBucket != NULL)
    {
        if (currBucket->hashvalue == hashvalue &&
            match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
            break;
        prevBucketPtr = &(currBucket->link);
        currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
        hctl->collisions++;
#endif
    }

    if (foundPtr)
        *foundPtr = (bool) (currBucket != NULL);

    /*
     * OK, now what?
     */
    switch (action)
    {
        case HASH_FIND:
            if (currBucket != NULL)
                return ELEMENTKEY(currBucket);
            return NULL;

        case HASH_REMOVE:
            if (currBucket != NULL)
            {
                /* if partitioned, must lock to touch nentries and freeList */
                if (IS_PARTITIONED(hctl))
                    SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));

                /* delete the record from the appropriate nentries counter. */
                Assert(hctl->freeList[freelist_idx].nentries > 0);
                hctl->freeList[freelist_idx].nentries--;

                /* remove record from hash bucket's chain. */
                *prevBucketPtr = currBucket->link;

                /* add the record to the appropriate freelist. */
                currBucket->link = hctl->freeList[freelist_idx].freeList;
                hctl->freeList[freelist_idx].freeList = currBucket;

                if (IS_PARTITIONED(hctl))
                    SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

                /*
                 * better hope the caller is synchronizing access to this
                 * element, because someone else is going to reuse it the next
                 * time something is added to the table
                 */
                return ELEMENTKEY(currBucket);
            }
            return NULL;

        case HASH_ENTER:
        case HASH_ENTER_NULL:
            /* Return existing element if found, else create one */
            if (currBucket != NULL)
                return ELEMENTKEY(currBucket);

            /* disallow inserts if frozen */
            if (hashp->frozen)
                elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
                     hashp->tabname);

            currBucket = get_hash_entry(hashp, freelist_idx);
            if (currBucket == NULL)
            {
                /* out of memory */
                if (action == HASH_ENTER_NULL)
                    return NULL;
                /* report a generic message */
                if (hashp->isshared)
                    ereport(ERROR,
                            (errcode(ERRCODE_OUT_OF_MEMORY),
                             errmsg("out of shared memory")));
                else
                    ereport(ERROR,
                            (errcode(ERRCODE_OUT_OF_MEMORY),
                             errmsg("out of memory")));
            }

            /* link into hashbucket chain (prevBucketPtr is the chain tail) */
            *prevBucketPtr = currBucket;
            currBucket->link = NULL;

            /* copy key into record */
            currBucket->hashvalue = hashvalue;
            hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);

            /*
             * Caller is expected to fill the data field on return.  DO NOT
             * insert any code that could possibly throw error here, as doing
             * so would leave the table entry incomplete and hence corrupt the
             * caller's data structure.
             */

            return ELEMENTKEY(currBucket);
    }

    elog(ERROR, "unrecognized hash action code: %d", (int) action);

    return NULL;                /* keep compiler quiet */
}
    1056              : 
    1057              : /*
    1058              :  * hash_update_hash_key -- change the hash key of an existing table entry
    1059              :  *
    1060              :  * This is equivalent to removing the entry, making a new entry, and copying
    1061              :  * over its data, except that the entry never goes to the table's freelist.
    1062              :  * Therefore this cannot suffer an out-of-memory failure, even if there are
    1063              :  * other processes operating in other partitions of the hashtable.
    1064              :  *
    1065              :  * Returns true if successful, false if the requested new hash key is already
    1066              :  * present.  Throws error if the specified entry pointer isn't actually a
    1067              :  * table member.
    1068              :  *
    1069              :  * NB: currently, there is no special case for old and new hash keys being
    1070              :  * identical, which means we'll report false for that situation.  This is
    1071              :  * preferable for existing uses.
    1072              :  *
    1073              :  * NB: for a partitioned hashtable, caller must hold lock on both relevant
    1074              :  * partitions, if the new hash key would belong to a different partition.
    1075              :  */
bool
hash_update_hash_key(HTAB *hashp,
					 void *existingEntry,
					 const void *newKeyPtr)
{
	HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
	uint32		newhashvalue;
	Size		keysize;
	uint32		bucket;
	uint32		newbucket;
	HASHBUCKET	currBucket;
	HASHBUCKET *prevBucketPtr;
	HASHBUCKET *oldPrevPtr;
	HashCompareFunc match;

#ifdef HASH_STATISTICS
	HASHHDR    *hctl = hashp->hctl;

	hctl->accesses++;
#endif

	/* disallow updates if frozen */
	if (hashp->frozen)
		elog(ERROR, "cannot update in frozen hashtable \"%s\"",
			 hashp->tabname);

	/*
	 * Lookup the existing element using its saved hash value.  We need to do
	 * this to be able to unlink it from its hash chain, but as a side benefit
	 * we can verify the validity of the passed existingEntry pointer.
	 */
	bucket = hash_initial_lookup(hashp, existingElement->hashvalue,
								 &prevBucketPtr);
	currBucket = *prevBucketPtr;

	/* walk the old bucket's chain looking for the element itself */
	while (currBucket != NULL)
	{
		if (currBucket == existingElement)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
	}

	if (currBucket == NULL)
		elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
			 hashp->tabname);

	/* remember the link slot pointing at the element, for unlinking below */
	oldPrevPtr = prevBucketPtr;

	/*
	 * Now perform the equivalent of a HASH_ENTER operation to locate the hash
	 * chain we want to put the entry into.
	 */
	newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);
	newbucket = hash_initial_lookup(hashp, newhashvalue, &prevBucketPtr);
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		if (currBucket->hashvalue == newhashvalue &&
			match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hctl->collisions++;
#endif
	}

	if (currBucket != NULL)
		return false;			/* collision with an existing entry */

	/* at loop exit, prevBucketPtr points at the new chain's terminal link */
	currBucket = existingElement;

	/*
	 * If old and new hash values belong to the same bucket, we need not
	 * change any chain links, and indeed should not since this simplistic
	 * update will corrupt the list if currBucket is the last element.  (We
	 * cannot fall out earlier, however, since we need to scan the bucket to
	 * check for duplicate keys.)
	 */
	if (bucket != newbucket)
	{
		/* OK to remove record from old hash bucket's chain. */
		*oldPrevPtr = currBucket->link;

		/* link into new hashbucket chain */
		*prevBucketPtr = currBucket;
		currBucket->link = NULL;
	}

	/* copy new key into record */
	currBucket->hashvalue = newhashvalue;
	hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);

	/* rest of record is untouched */

	return true;
}
    1181              : 
    1182              : /*
    1183              :  * Allocate a new hashtable entry if possible; return NULL if out of memory.
    1184              :  * (Or, if the underlying space allocator throws error for out-of-memory,
    1185              :  * we won't return at all.)
    1186              :  */
static HASHBUCKET
get_hash_entry(HTAB *hashp, int freelist_idx)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHBUCKET	newElement;

	for (;;)
	{
		/* if partitioned, must lock to touch nentries and freeList */
		if (IS_PARTITIONED(hctl))
			SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

		/* try to get an entry from the freelist */
		newElement = hctl->freeList[freelist_idx].freeList;

		if (newElement != NULL)
			break;				/* got one; mutex (if any) is still held */

		if (IS_PARTITIONED(hctl))
			SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

		/*
		 * No free elements in this freelist.  In a partitioned table, there
		 * might be entries in other freelists, but to reduce contention we
		 * prefer to first try to get another chunk of buckets from the main
		 * shmem allocator.  If that fails, though, we *MUST* root through all
		 * the other freelists before giving up.  There are multiple callers
		 * that assume that they can allocate every element in the initially
		 * requested table size, or that deleting an element guarantees they
		 * can insert a new element, even if shared memory is entirely full.
		 * Failing because the needed element is in a different freelist is
		 * not acceptable.
		 */
		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
		{
			int			borrow_from_idx;

			if (!IS_PARTITIONED(hctl))
				return NULL;	/* out of memory */

			/* try to borrow element from another freelist */
			borrow_from_idx = freelist_idx;
			for (;;)
			{
				/* cycle through all other freelists, wrapping around */
				borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
				if (borrow_from_idx == freelist_idx)
					break;		/* examined all freelists, fail */

				SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
				newElement = hctl->freeList[borrow_from_idx].freeList;

				if (newElement != NULL)
				{
					/* pop the element off the donor freelist */
					hctl->freeList[borrow_from_idx].freeList = newElement->link;
					SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));

					/* careful: count the new element in its proper freelist */
					SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
					hctl->freeList[freelist_idx].nentries++;
					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

					return newElement;
				}

				SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
			}

			/* no elements available to borrow either, so out of memory */
			return NULL;
		}

		/* element_alloc succeeded: loop back and retry our own freelist */
	}

	/* remove entry from freelist, bump nentries */
	hctl->freeList[freelist_idx].freeList = newElement->link;
	hctl->freeList[freelist_idx].nentries++;

	if (IS_PARTITIONED(hctl))
		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

	return newElement;
}
    1268              : 
    1269              : /*
    1270              :  * hash_get_num_entries -- get the number of entries in a hashtable
    1271              :  */
    1272              : int64
    1273        67499 : hash_get_num_entries(HTAB *hashp)
    1274              : {
    1275              :     int         i;
    1276        67499 :     int64       sum = hashp->hctl->freeList[0].nentries;
    1277              : 
    1278              :     /*
    1279              :      * We currently don't bother with acquiring the mutexes; it's only
    1280              :      * sensible to call this function if you've got lock on all partitions of
    1281              :      * the table.
    1282              :      */
    1283        67499 :     if (IS_PARTITIONED(hashp->hctl))
    1284              :     {
    1285        74848 :         for (i = 1; i < NUM_FREELISTS; i++)
    1286        72509 :             sum += hashp->hctl->freeList[i].nentries;
    1287              :     }
    1288              : 
    1289        67499 :     return sum;
    1290              : }
    1291              : 
    1292              : /*
    1293              :  * hash_seq_init/_search/_term
    1294              :  *          Sequentially search through hash table and return
    1295              :  *          all the elements one by one, return NULL when no more.
    1296              :  *
    1297              :  * hash_seq_term should be called if and only if the scan is abandoned before
    1298              :  * completion; if hash_seq_search returns NULL then it has already done the
    1299              :  * end-of-scan cleanup.
    1300              :  *
    1301              :  * NOTE: caller may delete the returned element before continuing the scan.
    1302              :  * However, deleting any other element while the scan is in progress is
    1303              :  * UNDEFINED (it might be the one that curIndex is pointing at!).  Also,
    1304              :  * if elements are added to the table while the scan is in progress, it is
    1305              :  * unspecified whether they will be visited by the scan or not.
    1306              :  *
    1307              :  * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
    1308              :  * worry about hash_seq_term cleanup, if the hashtable is first locked against
    1309              :  * further insertions by calling hash_freeze.
    1310              :  *
    1311              :  * NOTE: to use this with a partitioned hashtable, caller had better hold
    1312              :  * at least shared lock on all partitions of the table throughout the scan!
    1313              :  * We can cope with insertions or deletions by our own backend, but *not*
    1314              :  * with concurrent insertions or deletions by another.
    1315              :  */
    1316              : void
    1317      3432932 : hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
    1318              : {
    1319      3432932 :     status->hashp = hashp;
    1320      3432932 :     status->curBucket = 0;
    1321      3432932 :     status->curEntry = NULL;
    1322      3432932 :     status->hasHashvalue = false;
    1323      3432932 :     if (!hashp->frozen)
    1324      3432932 :         register_seq_scan(hashp);
    1325      3432932 : }
    1326              : 
    1327              : /*
    1328              :  * Same as above but scan by the given hash value.
    1329              :  * See also hash_seq_search().
    1330              :  *
    1331              :  * NOTE: the default hash function doesn't match syscache hash function.
    1332              :  * Thus, if you're going to use this function in syscache callback, make sure
    1333              :  * you're using custom hash function.  See relatt_cache_syshash()
    1334              :  * for example.
    1335              :  */
    1336              : void
    1337      1266616 : hash_seq_init_with_hash_value(HASH_SEQ_STATUS *status, HTAB *hashp,
    1338              :                               uint32 hashvalue)
    1339              : {
    1340              :     HASHBUCKET *bucketPtr;
    1341              : 
    1342      1266616 :     hash_seq_init(status, hashp);
    1343              : 
    1344      1266616 :     status->hasHashvalue = true;
    1345      1266616 :     status->hashvalue = hashvalue;
    1346              : 
    1347      1266616 :     status->curBucket = hash_initial_lookup(hashp, hashvalue, &bucketPtr);
    1348      1266616 :     status->curEntry = *bucketPtr;
    1349      1266616 : }
    1350              : 
void *
hash_seq_search(HASH_SEQ_STATUS *status)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;
	uint32		max_bucket;
	int64		segment_num;
	int64		segment_ndx;
	HASHSEGMENT segp;
	uint32		curBucket;
	HASHELEMENT *curElem;

	if (status->hasHashvalue)
	{
		/*
		 * Scan entries only in the current bucket because only this bucket
		 * can contain entries with the given hash value.
		 */
		while ((curElem = status->curEntry) != NULL)
		{
			status->curEntry = curElem->link;
			if (status->hashvalue != curElem->hashvalue)
				continue;		/* same bucket, different hash value */
			return (void *) ELEMENTKEY(curElem);
		}

		/* bucket exhausted: the hash-value-restricted scan is complete */
		hash_seq_term(status);
		return NULL;
	}

	if ((curElem = status->curEntry) != NULL)
	{
		/* Continuing scan of curBucket... */
		status->curEntry = curElem->link;
		if (status->curEntry == NULL)	/* end of this bucket */
			++status->curBucket;
		return ELEMENTKEY(curElem);
	}

	/*
	 * Search for next nonempty bucket starting at curBucket.
	 */
	curBucket = status->curBucket;
	hashp = status->hashp;
	hctl = hashp->hctl;
	max_bucket = hctl->max_bucket;

	if (curBucket > max_bucket)
	{
		hash_seq_term(status);
		return NULL;			/* search is done */
	}

	/*
	 * first find the right segment in the table directory.
	 */
	segment_num = curBucket >> HASH_SEGSIZE_SHIFT;
	segment_ndx = MOD(curBucket, HASH_SEGSIZE);

	segp = hashp->dir[segment_num];

	/*
	 * Pick up the first item in this bucket's chain.  If chain is not empty
	 * we can begin searching it.  Otherwise we have to advance to find the
	 * next nonempty bucket.  We try to optimize that case since searching a
	 * near-empty hashtable has to iterate this loop a lot.
	 */
	while ((curElem = segp[segment_ndx]) == NULL)
	{
		/* empty bucket, advance to next */
		if (++curBucket > max_bucket)
		{
			status->curBucket = curBucket;
			hash_seq_term(status);
			return NULL;		/* search is done */
		}
		if (++segment_ndx >= HASH_SEGSIZE)
		{
			/* stepped past end of this segment; move to the next one */
			segment_num++;
			segment_ndx = 0;
			segp = hashp->dir[segment_num];
		}
	}

	/* Begin scan of curBucket... */
	status->curEntry = curElem->link;
	if (status->curEntry == NULL)	/* end of this bucket */
		++curBucket;
	status->curBucket = curBucket;
	return ELEMENTKEY(curElem);
}
    1442              : 
    1443              : void
    1444      3432638 : hash_seq_term(HASH_SEQ_STATUS *status)
    1445              : {
    1446      3432638 :     if (!status->hashp->frozen)
    1447      3432638 :         deregister_seq_scan(status->hashp);
    1448      3432638 : }
    1449              : 
    1450              : /*
    1451              :  * hash_freeze
    1452              :  *          Freeze a hashtable against future insertions (deletions are
    1453              :  *          still allowed)
    1454              :  *
    1455              :  * The reason for doing this is that by preventing any more bucket splits,
    1456              :  * we no longer need to worry about registering hash_seq_search scans,
    1457              :  * and thus caller need not be careful about ensuring hash_seq_term gets
    1458              :  * called at the right times.
    1459              :  *
    1460              :  * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
    1461              :  * with active scans (since hash_seq_term would then do the wrong thing).
    1462              :  */
    1463              : void
    1464            0 : hash_freeze(HTAB *hashp)
    1465              : {
    1466            0 :     if (hashp->isshared)
    1467            0 :         elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
    1468            0 :     if (!hashp->frozen && has_seq_scans(hashp))
    1469            0 :         elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
    1470              :              hashp->tabname);
    1471            0 :     hashp->frozen = true;
    1472            0 : }
    1473              : 
    1474              : 
    1475              : /********************************* UTILITIES ************************/
    1476              : 
    1477              : /*
    1478              :  * Expand the table by adding one more hash bucket.
    1479              :  */
static bool
expand_table(HTAB *hashp)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT old_seg,
				new_seg;
	int64		old_bucket,
				new_bucket;
	int64		new_segnum,
				new_segndx;
	int64		old_segnum,
				old_segndx;
	HASHBUCKET *oldlink,
			   *newlink;
	HASHBUCKET	currElement,
				nextElement;

	Assert(!IS_PARTITIONED(hctl));

#ifdef HASH_STATISTICS
	hctl->expansions++;
#endif

	/* locate where the new bucket will live in the directory */
	new_bucket = hctl->max_bucket + 1;
	new_segnum = new_bucket >> HASH_SEGSIZE_SHIFT;
	new_segndx = MOD(new_bucket, HASH_SEGSIZE);

	if (new_segnum >= hctl->nsegs)
	{
		/* Allocate new segment if necessary -- could fail if dir full */
		if (new_segnum >= hctl->dsize)
			if (!dir_realloc(hashp))
				return false;
		if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
			return false;
		hctl->nsegs++;
	}

	/* OK, we created a new bucket */
	hctl->max_bucket++;

	/*
	 * *Before* changing masks, find old bucket corresponding to same hash
	 * values; values in that bucket may need to be relocated to new bucket.
	 * Note that new_bucket is certainly larger than low_mask at this point,
	 * so we can skip the first step of the regular hash mask calc.
	 */
	old_bucket = (new_bucket & hctl->low_mask);

	/*
	 * If we crossed a power of 2, readjust masks.
	 */
	if ((uint32) new_bucket > hctl->high_mask)
	{
		hctl->low_mask = hctl->high_mask;
		hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
	}

	/*
	 * Relocate records to the new bucket.  NOTE: because of the way the hash
	 * masking is done in calc_bucket, only one old bucket can need to be
	 * split at this point.  With a different way of reducing the hash value,
	 * that might not be true!
	 */
	old_segnum = old_bucket >> HASH_SEGSIZE_SHIFT;
	old_segndx = MOD(old_bucket, HASH_SEGSIZE);

	old_seg = hashp->dir[old_segnum];
	new_seg = hashp->dir[new_segnum];

	oldlink = &old_seg[old_segndx];
	newlink = &new_seg[new_segndx];

	/* partition the old chain in place between the two buckets */
	for (currElement = *oldlink;
		 currElement != NULL;
		 currElement = nextElement)
	{
		nextElement = currElement->link;
		if ((int64) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
		{
			/* element stays: append it to the rebuilt old chain */
			*oldlink = currElement;
			oldlink = &currElement->link;
		}
		else
		{
			/* element rehashes to the new bucket: append it there */
			*newlink = currElement;
			newlink = &currElement->link;
		}
	}
	/* don't forget to terminate the rebuilt hash chains... */
	*oldlink = NULL;
	*newlink = NULL;

	return true;
}
    1575              : 
    1576              : 
    1577              : static bool
    1578            0 : dir_realloc(HTAB *hashp)
    1579              : {
    1580              :     HASHSEGMENT *p;
    1581              :     HASHSEGMENT *old_p;
    1582              :     int64       new_dsize;
    1583              :     int64       old_dirsize;
    1584              :     int64       new_dirsize;
    1585              : 
    1586            0 :     if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
    1587            0 :         return false;
    1588              : 
    1589              :     /* Reallocate directory */
    1590            0 :     new_dsize = hashp->hctl->dsize << 1;
    1591            0 :     old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
    1592            0 :     new_dirsize = new_dsize * sizeof(HASHSEGMENT);
    1593              : 
    1594            0 :     old_p = hashp->dir;
    1595            0 :     p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize, hashp->alloc_arg);
    1596              : 
    1597            0 :     if (p != NULL)
    1598              :     {
    1599            0 :         memcpy(p, old_p, old_dirsize);
    1600            0 :         MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
    1601            0 :         hashp->hctl->dir = p;
    1602            0 :         hashp->dir = p;
    1603            0 :         hashp->hctl->dsize = new_dsize;
    1604              : 
    1605              :         /* XXX assume the allocator is palloc, so we know how to free */
    1606              :         Assert(hashp->alloc == DynaHashAlloc);
    1607            0 :         pfree(old_p);
    1608              : 
    1609            0 :         return true;
    1610              :     }
    1611              : 
    1612            0 :     return false;
    1613              : }
    1614              : 
    1615              : 
    1616              : static HASHSEGMENT
    1617       974215 : seg_alloc(HTAB *hashp)
    1618              : {
    1619              :     HASHSEGMENT segp;
    1620              : 
    1621       974215 :     segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * HASH_SEGSIZE, hashp->alloc_arg);
    1622              : 
    1623       974215 :     if (!segp)
    1624            0 :         return NULL;
    1625              : 
    1626       974215 :     MemSet(segp, 0, sizeof(HASHBUCKET) * HASH_SEGSIZE);
    1627              : 
    1628       974215 :     return segp;
    1629              : }
    1630              : 
    1631              : /*
    1632              :  * allocate some new elements and link them into the indicated free list
    1633              :  */
    1634              : static bool
    1635       788867 : element_alloc(HTAB *hashp, int nelem, int freelist_idx)
    1636              : {
    1637       788867 :     HASHHDR    *hctl = hashp->hctl;
    1638              :     Size        elementSize;
    1639              :     Size        requestSize;
    1640              :     char       *allocedBlock;
    1641              :     HASHELEMENT *firstElement;
    1642              :     HASHELEMENT *tmpElement;
    1643              :     HASHELEMENT *prevElement;
    1644              :     int         i;
    1645              : 
    1646       788867 :     if (hctl->isfixed)
    1647        38167 :         return false;
    1648              : 
    1649              :     /* Each element has a HASHELEMENT header plus user data. */
    1650       750700 :     elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
    1651              : 
    1652       750700 :     requestSize = nelem * elementSize;
    1653              : 
    1654              :     /* Add space for slist_node list link if we need one. */
    1655              : #ifdef USE_VALGRIND
    1656              :     if (!hashp->isshared)
    1657              :         requestSize += MAXALIGN(sizeof(slist_node));
    1658              : #endif
    1659              : 
    1660              :     /* Allocate the memory. */
    1661       750700 :     allocedBlock = hashp->alloc(requestSize, hashp->alloc_arg);
    1662              : 
    1663       750700 :     if (!allocedBlock)
    1664            0 :         return false;
    1665              : 
    1666              :     /*
    1667              :      * If USE_VALGRIND, each allocated block of elements of a non-shared
    1668              :      * hashtable is chained into a list, so that Valgrind won't think it's
    1669              :      * been leaked.
    1670              :      */
    1671              : #ifdef USE_VALGRIND
    1672              :     if (hashp->isshared)
    1673              :         firstElement = (HASHELEMENT *) allocedBlock;
    1674              :     else
    1675              :     {
    1676              :         slist_push_head(&hashp->element_blocks, (slist_node *) allocedBlock);
    1677              :         firstElement = (HASHELEMENT *) (allocedBlock + MAXALIGN(sizeof(slist_node)));
    1678              :     }
    1679              : #else
    1680       750700 :     firstElement = (HASHELEMENT *) allocedBlock;
    1681              : #endif
    1682              : 
    1683              :     /* prepare to link all the new entries into the freelist */
    1684       750700 :     prevElement = NULL;
    1685       750700 :     tmpElement = firstElement;
    1686     98183652 :     for (i = 0; i < nelem; i++)
    1687              :     {
    1688     97432952 :         tmpElement->link = prevElement;
    1689     97432952 :         prevElement = tmpElement;
    1690     97432952 :         tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
    1691              :     }
    1692              : 
    1693              :     /* if partitioned, must lock to touch freeList */
    1694       750700 :     if (IS_PARTITIONED(hctl))
    1695       196960 :         SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
    1696              : 
    1697              :     /* freelist could be nonempty if two backends did this concurrently */
    1698       750700 :     firstElement->link = hctl->freeList[freelist_idx].freeList;
    1699       750700 :     hctl->freeList[freelist_idx].freeList = prevElement;
    1700              : 
    1701       750700 :     if (IS_PARTITIONED(hctl))
    1702       196960 :         SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
    1703              : 
    1704       750700 :     return true;
    1705              : }
    1706              : 
    1707              : /*
    1708              :  * Do initial lookup of a bucket for the given hash value, retrieving its
    1709              :  * bucket number and its hash bucket.
    1710              :  */
    1711              : static inline uint32
    1712    244947189 : hash_initial_lookup(HTAB *hashp, uint32 hashvalue, HASHBUCKET **bucketptr)
    1713              : {
    1714    244947189 :     HASHHDR    *hctl = hashp->hctl;
    1715              :     HASHSEGMENT segp;
    1716              :     int64       segment_num;
    1717              :     int64       segment_ndx;
    1718              :     uint32      bucket;
    1719              : 
    1720    244947189 :     bucket = calc_bucket(hctl, hashvalue);
    1721              : 
    1722    244947189 :     segment_num = bucket >> HASH_SEGSIZE_SHIFT;
    1723    244947189 :     segment_ndx = MOD(bucket, HASH_SEGSIZE);
    1724              : 
    1725    244947189 :     segp = hashp->dir[segment_num];
    1726              : 
    1727    244947189 :     if (segp == NULL)
    1728            0 :         hash_corrupted(hashp);
    1729              : 
    1730    244947189 :     *bucketptr = &segp[segment_ndx];
    1731    244947189 :     return bucket;
    1732              : }
    1733              : 
    1734              : /* complain when we have detected a corrupted hashtable */
    1735              : static void
    1736            0 : hash_corrupted(HTAB *hashp)
    1737              : {
    1738              :     /*
    1739              :      * If the corruption is in a shared hashtable, we'd better force a
    1740              :      * systemwide restart.  Otherwise, just shut down this one backend.
    1741              :      */
    1742            0 :     if (hashp->isshared)
    1743            0 :         elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
    1744              :     else
    1745            0 :         elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
    1746              : }
    1747              : 
    1748              : /* calculate ceil(log base 2) of num */
    1749              : static int
    1750       886730 : my_log2(int64 num)
    1751              : {
    1752              :     /*
    1753              :      * guard against too-large input, which would be invalid for
    1754              :      * pg_ceil_log2_*()
    1755              :      */
    1756       886730 :     if (num > PG_INT64_MAX / 2)
    1757            0 :         num = PG_INT64_MAX / 2;
    1758              : 
    1759       886730 :     return pg_ceil_log2_64(num);
    1760              : }
    1761              : 
    1762              : /* calculate first power of 2 >= num, bounded to what will fit in a int64 */
    1763              : static int64
    1764        26810 : next_pow2_int64(int64 num)
    1765              : {
    1766              :     /* my_log2's internal range check is sufficient */
    1767        26810 :     return 1L << my_log2(num);
    1768              : }
    1769              : 
    1770              : /* calculate first power of 2 >= num, bounded to what will fit in an int */
    1771              : static int
    1772       859920 : next_pow2_int(int64 num)
    1773              : {
    1774       859920 :     if (num > INT_MAX / 2)
    1775            0 :         num = INT_MAX / 2;
    1776       859920 :     return 1 << my_log2(num);
    1777              : }
    1778              : 
    1779              : 
    1780              : /************************* SEQ SCAN TRACKING ************************/
    1781              : 
    1782              : /*
    1783              :  * We track active hash_seq_search scans here.  The need for this mechanism
    1784              :  * comes from the fact that a scan will get confused if a bucket split occurs
    1785              :  * while it's in progress: it might visit entries twice, or even miss some
    1786              :  * entirely (if it's partway through the same bucket that splits).  Hence
    1787              :  * we want to inhibit bucket splits if there are any active scans on the
    1788              :  * table being inserted into.  This is a fairly rare case in current usage,
    1789              :  * so just postponing the split until the next insertion seems sufficient.
    1790              :  *
    1791              :  * Given present usages of the function, only a few scans are likely to be
    1792              :  * open concurrently; so a finite-size stack of open scans seems sufficient,
    1793              :  * and we don't worry that linear search is too slow.  Note that we do
    1794              :  * allow multiple scans of the same hashtable to be open concurrently.
    1795              :  *
    1796              :  * This mechanism can support concurrent scan and insertion in a shared
    1797              :  * hashtable if it's the same backend doing both.  It would fail otherwise,
    1798              :  * but locking reasons seem to preclude any such scenario anyway, so we don't
    1799              :  * worry.
    1800              :  *
    1801              :  * This arrangement is reasonably robust if a transient hashtable is deleted
    1802              :  * without notifying us.  The absolute worst case is we might inhibit splits
    1803              :  * in another table created later at exactly the same address.  We will give
    1804              :  * a warning at transaction end for reference leaks, so any bugs leading to
    1805              :  * lack of notification should be easy to catch.
    1806              :  */
    1807              : 
    1808              : #define MAX_SEQ_SCANS 100
    1809              : 
    1810              : static HTAB *seq_scan_tables[MAX_SEQ_SCANS];    /* tables being scanned */
    1811              : static int  seq_scan_level[MAX_SEQ_SCANS];  /* subtransaction nest level */
    1812              : static int  num_seq_scans = 0;
    1813              : 
    1814              : 
    1815              : /* Register a table as having an active hash_seq_search scan */
    1816              : static void
    1817      3432932 : register_seq_scan(HTAB *hashp)
    1818              : {
    1819      3432932 :     if (num_seq_scans >= MAX_SEQ_SCANS)
    1820            0 :         elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
    1821              :              hashp->tabname);
    1822      3432932 :     seq_scan_tables[num_seq_scans] = hashp;
    1823      3432932 :     seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
    1824      3432932 :     num_seq_scans++;
    1825      3432932 : }
    1826              : 
    1827              : /* Deregister an active scan */
    1828              : static void
    1829      3432638 : deregister_seq_scan(HTAB *hashp)
    1830              : {
    1831              :     int         i;
    1832              : 
    1833              :     /* Search backward since it's most likely at the stack top */
    1834      3432638 :     for (i = num_seq_scans - 1; i >= 0; i--)
    1835              :     {
    1836      3432638 :         if (seq_scan_tables[i] == hashp)
    1837              :         {
    1838      3432638 :             seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
    1839      3432638 :             seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
    1840      3432638 :             num_seq_scans--;
    1841      3432638 :             return;
    1842              :         }
    1843              :     }
    1844            0 :     elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
    1845              :          hashp->tabname);
    1846              : }
    1847              : 
    1848              : /* Check if a table has any active scan */
    1849              : static bool
    1850       457364 : has_seq_scans(HTAB *hashp)
    1851              : {
    1852              :     int         i;
    1853              : 
    1854       457576 :     for (i = 0; i < num_seq_scans; i++)
    1855              :     {
    1856          212 :         if (seq_scan_tables[i] == hashp)
    1857            0 :             return true;
    1858              :     }
    1859       457364 :     return false;
    1860              : }
    1861              : 
    1862              : /* Clean up any open scans at end of transaction */
    1863              : void
    1864       630754 : AtEOXact_HashTables(bool isCommit)
    1865              : {
    1866              :     /*
    1867              :      * During abort cleanup, open scans are expected; just silently clean 'em
    1868              :      * out.  An open scan at commit means someone forgot a hash_seq_term()
    1869              :      * call, so complain.
    1870              :      *
    1871              :      * Note: it's tempting to try to print the tabname here, but refrain for
    1872              :      * fear of touching deallocated memory.  This isn't a user-facing message
    1873              :      * anyway, so it needn't be pretty.
    1874              :      */
    1875       630754 :     if (isCommit)
    1876              :     {
    1877              :         int         i;
    1878              : 
    1879       595424 :         for (i = 0; i < num_seq_scans; i++)
    1880              :         {
    1881            0 :             elog(WARNING, "leaked hash_seq_search scan for hash table %p",
    1882              :                  seq_scan_tables[i]);
    1883              :         }
    1884              :     }
    1885       630754 :     num_seq_scans = 0;
    1886       630754 : }
    1887              : 
    1888              : /* Clean up any open scans at end of subtransaction */
    1889              : void
    1890        11910 : AtEOSubXact_HashTables(bool isCommit, int nestDepth)
    1891              : {
    1892              :     int         i;
    1893              : 
    1894              :     /*
    1895              :      * Search backward to make cleanup easy.  Note we must check all entries,
    1896              :      * not only those at the end of the array, because deletion technique
    1897              :      * doesn't keep them in order.
    1898              :      */
    1899        11913 :     for (i = num_seq_scans - 1; i >= 0; i--)
    1900              :     {
    1901            3 :         if (seq_scan_level[i] >= nestDepth)
    1902              :         {
    1903            3 :             if (isCommit)
    1904            0 :                 elog(WARNING, "leaked hash_seq_search scan for hash table %p",
    1905              :                      seq_scan_tables[i]);
    1906            3 :             seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
    1907            3 :             seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
    1908            3 :             num_seq_scans--;
    1909              :         }
    1910              :     }
    1911        11910 : }
        

Generated by: LCOV version 2.0-1