Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * dynahash.c
4 : * dynamic chained hash tables
5 : *
6 : * dynahash.c supports both local-to-a-backend hash tables and hash tables in
7 : * shared memory. For shared hash tables, it is the caller's responsibility
8 : * to provide appropriate access interlocking. The simplest convention is
9 : * that a single LWLock protects the whole hash table. Searches (HASH_FIND or
10 : * hash_seq_search) need only shared lock, but any update requires exclusive
11 : * lock. For heavily-used shared tables, the single-lock approach creates a
12 : * concurrency bottleneck, so we also support "partitioned" locking wherein
13 : * there are multiple LWLocks guarding distinct subsets of the table. To use
14 : * a hash table in partitioned mode, the HASH_PARTITION flag must be given
15 : * to hash_create. This prevents any attempt to split buckets on-the-fly.
16 : * Therefore, each hash bucket chain operates independently, and no fields
17 : * of the hash header change after init except nentries and freeList.
18 : * (A partitioned table uses multiple copies of those fields, guarded by
19 : * spinlocks, for additional concurrency.)
20 : * This lets any subset of the hash buckets be treated as a separately
21 : * lockable partition. We expect callers to use the low-order bits of a
22 : * lookup key's hash value as a partition number --- this will work because
23 : * of the way calc_bucket() maps hash values to bucket numbers.
24 : *
25 : * The memory allocator function should match malloc's semantics of returning
26 : * NULL on failure. (This is essential for hash tables in shared memory.
27 : * For hash tables in local memory, we used to use palloc() which will throw
28 : * error on failure; but we no longer do, so it's untested whether this
29 : * module will still cope with that behavior.)
30 : *
31 : * dynahash.c provides support for these types of lookup keys:
32 : *
33 : * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
34 : * compared as though by strcmp(). This is selected by specifying the
35 : * HASH_STRINGS flag to hash_create.
36 : *
37 : * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
38 : * (Caller must ensure there are no undefined padding bits in the keys!)
39 : * This is selected by specifying the HASH_BLOBS flag to hash_create.
40 : *
41 : * 3. More complex key behavior can be selected by specifying user-supplied
42 : * hashing, comparison, and/or key-copying functions. At least a hashing
43 : * function must be supplied; comparison defaults to memcmp() and key copying
44 : * to memcpy() when a user-defined hashing function is selected.
45 : *
46 : * Compared to simplehash, dynahash has the following benefits:
47 : *
48 : * - It supports partitioning, which is useful for shared memory access using
49 : * locks.
50 : * - Shared memory hashes are allocated in a fixed size area at startup and
51 : * are discoverable by name from other processes.
52 : * - Because entries don't need to be moved in the case of hash conflicts,
53 : * dynahash has better performance for large entries.
54 : * - Guarantees stable pointers to entries.
55 : *
56 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
57 : * Portions Copyright (c) 1994, Regents of the University of California
58 : *
59 : *
60 : * IDENTIFICATION
61 : * src/backend/utils/hash/dynahash.c
62 : *
63 : *-------------------------------------------------------------------------
64 : */
65 :
66 : /*
67 : * Original comments:
68 : *
69 : * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
70 : * Coded into C, with minor code improvements, and with hsearch(3) interface,
71 : * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
72 : * also, hcreate/hdestroy routines added to simulate hsearch(3).
73 : *
74 : * These routines simulate hsearch(3) and family, with the important
75 : * difference that the hash table is dynamic - can grow indefinitely
76 : * beyond its original size (as supplied to hcreate()).
77 : *
78 : * Performance appears to be comparable to that of hsearch(3).
79 : * The 'source-code' options referred to in hsearch(3)'s 'man' page
80 : * are not implemented; otherwise functionality is identical.
81 : *
82 : * Compilation controls:
83 : * HASH_STATISTICS causes some usage statistics to be maintained, which can be
84 : * logged by calling hash_stats().
85 : *
86 : * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
87 : * concatenation property, in probably unnecessary code 'optimization'.
88 : *
89 : * Modified margo@postgres.berkeley.edu February 1990
90 : * added multiple table interface
91 : * Modified by sullivan@postgres.berkeley.edu April 1990
92 : * changed ctl structure for shared memory
93 : */
94 :
95 : #include "postgres.h"
96 :
97 : #include <limits.h>
98 :
99 : #include "access/xact.h"
100 : #include "common/hashfn.h"
101 : #include "lib/ilist.h"
102 : #include "port/pg_bitutils.h"
103 : #include "storage/shmem.h"
104 : #include "storage/spin.h"
105 : #include "utils/dynahash.h"
106 : #include "utils/memutils.h"
107 :
108 :
109 : /*
110 : * Constants
111 : *
112 : * A hash table has a top-level "directory", each of whose entries points
113 : * to a "segment" of ssize bucket headers. The maximum number of hash
114 : * buckets is thus dsize * ssize (but dsize may be expansible). Of course,
115 : * the number of records in the table can be larger, but we don't want a
116 : * whole lot of records per bucket or performance goes down.
117 : *
118 : * In a hash table allocated in shared memory, the directory cannot be
119 : * expanded because it must stay at a fixed address. The directory size
120 : * should be selected using hash_select_dirsize (and you'd better have
121 : * a good idea of the maximum number of entries!). For non-shared hash
122 : * tables, the initial directory size can be left at the default.
123 : */
124 : #define DEF_SEGSIZE 256
125 : #define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */
126 : #define DEF_DIRSIZE 256
127 :
128 : /* Number of freelists to be used for a partitioned hash table. */
129 : #define NUM_FREELISTS 32
130 :
131 : /* A hash bucket is a linked list of HASHELEMENTs */
132 : typedef HASHELEMENT *HASHBUCKET;
133 :
134 : /* A hash segment is an array of bucket headers */
135 : typedef HASHBUCKET *HASHSEGMENT;
136 :
137 : /*
138 : * Per-freelist data.
139 : *
140 : * In a partitioned hash table, each freelist is associated with a specific
141 : * set of hashcodes, as determined by the FREELIST_IDX() macro below.
142 : * nentries tracks the number of live hashtable entries having those hashcodes
143 : * (NOT the number of entries in the freelist, as you might expect).
144 : *
145 : * The coverage of a freelist might be more or less than one partition, so it
146 : * needs its own lock rather than relying on caller locking. Relying on that
147 : * wouldn't work even if the coverage was the same, because of the occasional
148 : * need to "borrow" entries from another freelist; see get_hash_entry().
149 : *
150 : * Using an array of FreeListData instead of separate arrays of mutexes,
151 : * nentries and freeLists helps to reduce sharing of cache lines between
152 : * different mutexes.
153 : */
typedef struct
{
	/* NB: if the table is not partitioned, only freeList[0] is used and its
	 * mutex is never touched (see HASHHDR comments below). */
	slock_t		mutex;			/* spinlock for this freelist */
	int64		nentries;		/* number of entries in associated buckets */
	HASHELEMENT *freeList;		/* chain of free elements */
} FreeListData;
160 :
161 : /*
162 : * Header structure for a hash table --- contains all changeable info
163 : *
164 : * In a shared-memory hash table, the HASHHDR is in shared memory, while
165 : * each backend has a local HTAB struct. For a non-shared table, there isn't
166 : * any functional difference between HASHHDR and HTAB, but we separate them
167 : * anyway to share code between shared and non-shared tables.
168 : */
struct HASHHDR
{
	/*
	 * The freelist can become a point of contention in high-concurrency hash
	 * tables, so we use an array of freelists, each with its own mutex and
	 * nentries count, instead of just a single one.  Although the freelists
	 * normally operate independently, we will scavenge entries from freelists
	 * other than a hashcode's default freelist when necessary.
	 *
	 * If the hash table is not partitioned, only freeList[0] is used and its
	 * spinlock is not used at all; callers' locking is assumed sufficient.
	 * (The hashcode-to-freelist mapping is FREELIST_IDX(), below.)
	 */
	FreeListData freeList[NUM_FREELISTS];

	/* These fields can change, but not in a partitioned table */
	/* Also, dsize can't change in a shared table, even if unpartitioned */
	int64		dsize;			/* directory size */
	int64		nsegs;			/* number of allocated segments (<= dsize) */
	uint32		max_bucket;		/* ID of maximum bucket in use */
	uint32		high_mask;		/* mask to modulo into entire table */
	uint32		low_mask;		/* mask to modulo into lower half of table */

	/* These fields are fixed at hashtable creation */
	Size		keysize;		/* hash key length in bytes */
	Size		entrysize;		/* total user element size in bytes */
	int64		num_partitions; /* # partitions (must be power of 2), or 0 */
	int64		max_dsize;		/* 'dsize' limit if directory is fixed size */
	int64		ssize;			/* segment size --- must be power of 2 */
	int			sshift;			/* segment shift = log2(ssize) */
	int			nelem_alloc;	/* number of entries to allocate at once */
	bool		isfixed;		/* if true, don't enlarge; set only after the
								 * initial entries are built (see
								 * hash_create's HASH_FIXED_SIZE handling) */

#ifdef HASH_STATISTICS

	/*
	 * Count statistics here.  NB: stats code doesn't bother with mutex, so
	 * counts could be corrupted a bit in a partitioned table.
	 */
	uint64		accesses;
	uint64		collisions;
	uint64		expansions;
#endif
};
212 :
213 : #define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
214 :
215 : #define FREELIST_IDX(hctl, hashcode) \
216 : (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
217 :
218 : /*
219 : * Top control structure for a hashtable --- in a shared table, each backend
220 : * has its own copy (OK since no fields change at runtime)
221 : */
struct HTAB
{
	HASHHDR    *hctl;			/* => shared control information */
	HASHSEGMENT *dir;			/* directory of segment starts */
	HashValueFunc hash;			/* hash function */
	HashCompareFunc match;		/* key comparison function */
	HashCopyFunc keycopy;		/* key copying function */
	HashAllocFunc alloc;		/* memory allocator */
	MemoryContext hcxt;			/* memory context if default allocator used;
								 * NULL for shared tables (see hash_create) */
	char	   *tabname;		/* table name (for error messages); stored
								 * immediately after the HTAB struct itself */
	bool		isshared;		/* true if table is in shared memory */

	/* freezing a shared table isn't allowed, so we can keep state here */
	bool		frozen;			/* true = no more inserts allowed */

	/* We keep local copies of these fixed values to reduce contention */
	Size		keysize;		/* hash key length in bytes */
	int64		ssize;			/* segment size --- must be power of 2 */
	int			sshift;			/* segment shift = log2(ssize) */

	/*
	 * In a USE_VALGRIND build, non-shared hashtables keep an slist chain of
	 * all the element blocks they have allocated.  This pacifies Valgrind,
	 * which would otherwise often claim that the element blocks are "possibly
	 * lost" for lack of any non-interior pointers to their starts.
	 */
#ifdef USE_VALGRIND
	slist_head	element_blocks;
#endif
};
252 :
253 : /*
254 : * Key (also entry) part of a HASHELEMENT
255 : */
256 : #define ELEMENTKEY(helem) (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
257 :
258 : /*
259 : * Obtain element pointer given pointer to key
260 : */
261 : #define ELEMENT_FROM_KEY(key) \
262 : ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))
263 :
264 : /*
265 : * Fast MOD arithmetic, assuming that y is a power of 2 !
266 : */
267 : #define MOD(x,y) ((x) & ((y)-1))
268 :
269 : /*
270 : * Private function prototypes
271 : */
272 : static void *DynaHashAlloc(Size size);
273 : static HASHSEGMENT seg_alloc(HTAB *hashp);
274 : static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
275 : static bool dir_realloc(HTAB *hashp);
276 : static bool expand_table(HTAB *hashp);
277 : static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
278 : static void hdefault(HTAB *hashp);
279 : static int choose_nelem_alloc(Size entrysize);
280 : static bool init_htab(HTAB *hashp, int64 nelem);
281 : pg_noreturn static void hash_corrupted(HTAB *hashp);
282 : static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue,
283 : HASHBUCKET **bucketptr);
284 : static int64 next_pow2_int64(int64 num);
285 : static int next_pow2_int(int64 num);
286 : static void register_seq_scan(HTAB *hashp);
287 : static void deregister_seq_scan(HTAB *hashp);
288 : static bool has_seq_scans(HTAB *hashp);
289 :
290 :
291 : /*
292 : * memory allocation support
293 : */
294 : static MemoryContext CurrentDynaHashCxt = NULL;
295 :
296 : static void *
297 3126362 : DynaHashAlloc(Size size)
298 : {
299 : Assert(MemoryContextIsValid(CurrentDynaHashCxt));
300 3126362 : return MemoryContextAllocExtended(CurrentDynaHashCxt, size,
301 : MCXT_ALLOC_NO_OOM);
302 : }
303 :
304 :
305 : /*
306 : * HashCompareFunc for string keys
307 : *
308 : * Because we copy keys with strlcpy(), they will be truncated at keysize-1
309 : * bytes, so we can only compare that many ... hence strncmp is almost but
310 : * not quite the right thing.
311 : */
312 : static int
313 981536 : string_compare(const char *key1, const char *key2, Size keysize)
314 : {
315 981536 : return strncmp(key1, key2, keysize - 1);
316 : }
317 :
318 :
319 : /************************** CREATE ROUTINES **********************/
320 :
/*
 * hash_create -- create a new dynamic hash table
 *
 *	tabname: a name for the table (for debugging purposes)
 *	nelem: maximum number of elements expected
 *	*info: additional table parameters, as indicated by flags
 *	flags: bitmask indicating which parameters to take from *info
 *
 * The flags value *must* include HASH_ELEM.  (Formerly, this was nominally
 * optional, but the default keysize and entrysize values were useless.)
 * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS,
 * or HASH_FUNCTION, to define the key hashing semantics (C strings,
 * binary blobs, or custom, respectively).  Callers specifying a custom
 * hash function will likely also want to use HASH_COMPARE, and perhaps
 * also HASH_KEYCOPY, to control key comparison and copying.
 * Another often-used flag is HASH_CONTEXT, to allocate the hash table
 * under info->hcxt rather than under TopMemoryContext; the default
 * behavior is only suitable for session-lifespan hash tables.
 * Other flags bits are special-purpose and seldom used, except for those
 * associated with shared-memory hash tables, for which see ShmemInitHash().
 *
 * Fields in *info are read only when the associated flags bit is set.
 * It is not necessary to initialize other fields of *info.
 * Neither tabname nor *info need persist after the hash_create() call.
 *
 * Note: It is deprecated for callers of hash_create() to explicitly specify
 * string_hash, tag_hash, uint32_hash, or oid_hash.  Just set HASH_STRINGS or
 * HASH_BLOBS.  Use HASH_FUNCTION only when you want something other than
 * one of these.
 *
 * Note: for a shared-memory hashtable, nelem needs to be a pretty good
 * estimate, since we can't expand the table on the fly.  But an unshared
 * hashtable can be expanded on-the-fly, so it's better for nelem to be
 * on the small side and let the table grow if it's exceeded.  An overly
 * large nelem will penalize hash_seq_search speed without buying much.
 */
HTAB *
hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;

	/*
	 * Hash tables now allocate space for key and data, but you have to say
	 * how much space to allocate.
	 */
	Assert(flags & HASH_ELEM);
	Assert(info->keysize > 0);
	Assert(info->entrysize >= info->keysize);

	/*
	 * For shared hash tables, we have a local hash header (HTAB struct) that
	 * we allocate in TopMemoryContext; all else is in shared memory.
	 *
	 * For non-shared hash tables, everything including the hash header is in
	 * a memory context created specially for the hash table --- this makes
	 * hash_destroy very simple.  The memory context is made a child of either
	 * a context specified by the caller, or TopMemoryContext if nothing is
	 * specified.
	 *
	 * Note: CurrentDynaHashCxt is the context DynaHashAlloc() draws from, so
	 * setting it here directs all subsequent default allocations.
	 */
	if (flags & HASH_SHARED_MEM)
	{
		/* Set up to allocate the hash header */
		CurrentDynaHashCxt = TopMemoryContext;
	}
	else
	{
		/* Create the hash table's private memory context */
		if (flags & HASH_CONTEXT)
			CurrentDynaHashCxt = info->hcxt;
		else
			CurrentDynaHashCxt = TopMemoryContext;
		CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
												   "dynahash",
												   ALLOCSET_DEFAULT_SIZES);
	}

	/* Initialize the hash header, plus a copy of the table name */
	/* (the name is stored right after the HTAB struct, in one palloc chunk) */
	hashp = (HTAB *) MemoryContextAlloc(CurrentDynaHashCxt,
										sizeof(HTAB) + strlen(tabname) + 1);
	MemSet(hashp, 0, sizeof(HTAB));

	hashp->tabname = (char *) (hashp + 1);
	strcpy(hashp->tabname, tabname);

	/* If we have a private context, label it with hashtable's name */
	if (!(flags & HASH_SHARED_MEM))
		MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname);

	/*
	 * Select the appropriate hash function (see comments at head of file).
	 */
	if (flags & HASH_FUNCTION)
	{
		Assert(!(flags & (HASH_BLOBS | HASH_STRINGS)));
		hashp->hash = info->hash;
	}
	else if (flags & HASH_BLOBS)
	{
		Assert(!(flags & HASH_STRINGS));
		/* We can optimize hashing for common key sizes */
		if (info->keysize == sizeof(uint32))
			hashp->hash = uint32_hash;
		else
			hashp->hash = tag_hash;
	}
	else
	{
		/*
		 * string_hash used to be considered the default hash method, and in a
		 * non-assert build it effectively still is.  But we now consider it
		 * an assertion error to not say HASH_STRINGS explicitly.  To help
		 * catch mistaken usage of HASH_STRINGS, we also insist on a
		 * reasonably long string length: if the keysize is only 4 or 8 bytes,
		 * it's almost certainly an integer or pointer not a string.
		 */
		Assert(flags & HASH_STRINGS);
		Assert(info->keysize > 8);

		hashp->hash = string_hash;
	}

	/*
	 * If you don't specify a match function, it defaults to string_compare if
	 * you used string_hash, and to memcmp otherwise.
	 *
	 * Note: explicitly specifying string_hash is deprecated, because this
	 * might not work for callers in loadable modules on some platforms due to
	 * referencing a trampoline instead of the string_hash function proper.
	 * Specify HASH_STRINGS instead.
	 */
	if (flags & HASH_COMPARE)
		hashp->match = info->match;
	else if (hashp->hash == string_hash)
		hashp->match = (HashCompareFunc) string_compare;
	else
		hashp->match = memcmp;

	/*
	 * Similarly, the key-copying function defaults to strlcpy or memcpy.
	 */
	if (flags & HASH_KEYCOPY)
		hashp->keycopy = info->keycopy;
	else if (hashp->hash == string_hash)
	{
		/*
		 * The signature of keycopy is meant for memcpy(), which returns
		 * void*, but strlcpy() returns size_t.  Since we never use the return
		 * value of keycopy, and size_t is pretty much always the same size as
		 * void *, this should be safe.  The extra cast in the middle is to
		 * avoid warnings from -Wcast-function-type.
		 */
		hashp->keycopy = (HashCopyFunc) (pg_funcptr_t) strlcpy;
	}
	else
		hashp->keycopy = memcpy;

	/* And select the entry allocation function, too. */
	if (flags & HASH_ALLOC)
		hashp->alloc = info->alloc;
	else
		hashp->alloc = DynaHashAlloc;

	if (flags & HASH_SHARED_MEM)
	{
		/*
		 * ctl structure and directory are preallocated for shared memory
		 * tables.  Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
		 * well.
		 */
		hashp->hctl = info->hctl;
		/* directory immediately follows the HASHHDR in the shared chunk */
		hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
		hashp->hcxt = NULL;
		hashp->isshared = true;

		/* hash table already exists, we're just attaching to it */
		if (flags & HASH_ATTACH)
		{
			/* make local copies of some heavily-used values */
			hctl = hashp->hctl;
			hashp->keysize = hctl->keysize;
			hashp->ssize = hctl->ssize;
			hashp->sshift = hctl->sshift;

			/* nothing more to do: no init of already-shared state */
			return hashp;
		}
	}
	else
	{
		/* setup hash table defaults */
		hashp->hctl = NULL;
		hashp->dir = NULL;
		hashp->hcxt = CurrentDynaHashCxt;
		hashp->isshared = false;
	}

	/* non-shared case: allocate the header via the chosen allocator */
	if (!hashp->hctl)
	{
		hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
		if (!hashp->hctl)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

	hashp->frozen = false;

	hdefault(hashp);

	hctl = hashp->hctl;

	if (flags & HASH_PARTITION)
	{
		/* Doesn't make sense to partition a local hash table */
		Assert(flags & HASH_SHARED_MEM);

		/*
		 * The number of partitions had better be a power of 2. Also, it must
		 * be less than INT_MAX (see init_htab()), so call the int version of
		 * next_pow2.
		 */
		Assert(info->num_partitions == next_pow2_int(info->num_partitions));

		hctl->num_partitions = info->num_partitions;
	}

	if (flags & HASH_SEGMENT)
	{
		hctl->ssize = info->ssize;
		hctl->sshift = my_log2(info->ssize);
		/* ssize had better be a power of 2 */
		Assert(hctl->ssize == (1L << hctl->sshift));
	}

	/*
	 * SHM hash tables have fixed directory size passed by the caller.
	 */
	if (flags & HASH_DIRSIZE)
	{
		hctl->max_dsize = info->max_dsize;
		hctl->dsize = info->dsize;
	}

	/* remember the entry sizes, too */
	hctl->keysize = info->keysize;
	hctl->entrysize = info->entrysize;

	/* make local copies of heavily-used constant fields */
	hashp->keysize = hctl->keysize;
	hashp->ssize = hctl->ssize;
	hashp->sshift = hctl->sshift;

	/* Build the hash directory structure */
	if (!init_htab(hashp, nelem))
		elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);

	/*
	 * For a shared hash table, preallocate the requested number of elements.
	 * This reduces problems with run-time out-of-shared-memory conditions.
	 *
	 * For a non-shared hash table, preallocate the requested number of
	 * elements if it's less than our chosen nelem_alloc.  This avoids wasting
	 * space if the caller correctly estimates a small table size.
	 */
	if ((flags & HASH_SHARED_MEM) ||
		nelem < hctl->nelem_alloc)
	{
		int			i,
					freelist_partitions,
					nelem_alloc,
					nelem_alloc_first;

		/*
		 * If hash table is partitioned, give each freelist an equal share of
		 * the initial allocation.  Otherwise only freeList[0] is used.
		 */
		if (IS_PARTITIONED(hashp->hctl))
			freelist_partitions = NUM_FREELISTS;
		else
			freelist_partitions = 1;

		nelem_alloc = nelem / freelist_partitions;
		if (nelem_alloc <= 0)
			nelem_alloc = 1;

		/*
		 * Make sure we'll allocate all the requested elements; freeList[0]
		 * gets the excess if the request isn't divisible by NUM_FREELISTS.
		 */
		if (nelem_alloc * freelist_partitions < nelem)
			nelem_alloc_first =
				nelem - nelem_alloc * (freelist_partitions - 1);
		else
			nelem_alloc_first = nelem_alloc;

		for (i = 0; i < freelist_partitions; i++)
		{
			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;

			if (!element_alloc(hashp, temp, i))
				ereport(ERROR,
						(errcode(ERRCODE_OUT_OF_MEMORY),
						 errmsg("out of memory")));
		}
	}

	/* Set isfixed if requested, but not till after we build initial entries */
	if (flags & HASH_FIXED_SIZE)
		hctl->isfixed = true;

	return hashp;
}
633 :
634 : /*
635 : * Set default HASHHDR parameters.
636 : */
637 : static void
638 725712 : hdefault(HTAB *hashp)
639 : {
640 725712 : HASHHDR *hctl = hashp->hctl;
641 :
642 78376896 : MemSet(hctl, 0, sizeof(HASHHDR));
643 :
644 725712 : hctl->dsize = DEF_DIRSIZE;
645 725712 : hctl->nsegs = 0;
646 :
647 725712 : hctl->num_partitions = 0; /* not partitioned */
648 :
649 : /* table has no fixed maximum size */
650 725712 : hctl->max_dsize = NO_MAX_DSIZE;
651 :
652 725712 : hctl->ssize = DEF_SEGSIZE;
653 725712 : hctl->sshift = DEF_SEGSIZE_SHIFT;
654 :
655 725712 : hctl->isfixed = false; /* can be enlarged */
656 :
657 : #ifdef HASH_STATISTICS
658 : hctl->accesses = hctl->collisions = hctl->expansions = 0;
659 : #endif
660 725712 : }
661 :
662 : /*
663 : * Given the user-specified entry size, choose nelem_alloc, ie, how many
664 : * elements to add to the hash table when we need more.
665 : */
666 : static int
667 761762 : choose_nelem_alloc(Size entrysize)
668 : {
669 : int nelem_alloc;
670 : Size elementSize;
671 : Size allocSize;
672 :
673 : /* Each element has a HASHELEMENT header plus user data. */
674 : /* NB: this had better match element_alloc() */
675 761762 : elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
676 :
677 : /*
678 : * The idea here is to choose nelem_alloc at least 32, but round up so
679 : * that the allocation request will be a power of 2 or just less. This
680 : * makes little difference for hash tables in shared memory, but for hash
681 : * tables managed by palloc, the allocation request will be rounded up to
682 : * a power of 2 anyway. If we fail to take this into account, we'll waste
683 : * as much as half the allocated space.
684 : */
685 761762 : allocSize = 32 * 4; /* assume elementSize at least 8 */
686 : do
687 : {
688 2932754 : allocSize <<= 1;
689 2932754 : nelem_alloc = allocSize / elementSize;
690 2932754 : } while (nelem_alloc < 32);
691 :
692 761762 : return nelem_alloc;
693 : }
694 :
/*
 * Compute derived fields of hctl and build the initial directory/segment
 * arrays.
 *
 * Returns false if a required allocation fails or a preallocated directory
 * is too small; hash_create() turns that into an ERROR.
 */
static bool
init_htab(HTAB *hashp, int64 nelem)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT *segp;
	int			nbuckets;
	int			nsegs;
	int			i;

	/*
	 * initialize mutexes if it's a partitioned table
	 */
	if (IS_PARTITIONED(hctl))
		for (i = 0; i < NUM_FREELISTS; i++)
			SpinLockInit(&(hctl->freeList[i].mutex));

	/*
	 * Allocate space for the next greater power of two number of buckets,
	 * assuming a desired maximum load factor of 1.
	 */
	nbuckets = next_pow2_int(nelem);

	/*
	 * In a partitioned table, nbuckets must be at least equal to
	 * num_partitions; were it less, keys with apparently different partition
	 * numbers would map to the same bucket, breaking partition independence.
	 * (Normally nbuckets will be much bigger; this is just a safety check.)
	 */
	while (nbuckets < hctl->num_partitions)
		nbuckets <<= 1;

	/*
	 * Set up the expandable-hashing state: buckets 0..max_bucket are valid,
	 * and the two masks bracket the current table size.  These are the
	 * inputs to the hash-value-to-bucket mapping (calc_bucket, per the
	 * comments at the head of this file).
	 */
	hctl->max_bucket = hctl->low_mask = nbuckets - 1;
	hctl->high_mask = (nbuckets << 1) - 1;

	/*
	 * Figure number of directory segments needed, round up to a power of 2
	 */
	nsegs = (nbuckets - 1) / hctl->ssize + 1;
	nsegs = next_pow2_int(nsegs);

	/*
	 * Make sure directory is big enough. If pre-allocated directory is too
	 * small, choke (caller screwed up).
	 */
	if (nsegs > hctl->dsize)
	{
		if (!(hashp->dir))
			hctl->dsize = nsegs;
		else
			return false;
	}

	/* Allocate a directory (not needed for shared tables: preallocated) */
	if (!(hashp->dir))
	{
		CurrentDynaHashCxt = hashp->hcxt;
		hashp->dir = (HASHSEGMENT *)
			hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
		if (!hashp->dir)
			return false;
	}

	/* Allocate initial segments */
	for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
	{
		*segp = seg_alloc(hashp);
		if (*segp == NULL)
			return false;
	}

	/* Choose number of entries to allocate at a time */
	hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);

	return true;
}
774 :
775 : /*
776 : * Estimate the space needed for a hashtable containing the given number
777 : * of entries of given size.
778 : * NOTE: this is used to estimate the footprint of hashtables in shared
779 : * memory; therefore it does not count HTAB which is in local memory.
780 : * NB: assumes that all hash structure parameters have default values!
781 : */
782 : Size
783 36050 : hash_estimate_size(int64 num_entries, Size entrysize)
784 : {
785 : Size size;
786 : int64 nBuckets,
787 : nSegments,
788 : nDirEntries,
789 : nElementAllocs,
790 : elementSize,
791 : elementAllocCnt;
792 :
793 : /* estimate number of buckets wanted */
794 36050 : nBuckets = next_pow2_int64(num_entries);
795 : /* # of segments needed for nBuckets */
796 36050 : nSegments = next_pow2_int64((nBuckets - 1) / DEF_SEGSIZE + 1);
797 : /* directory entries */
798 36050 : nDirEntries = DEF_DIRSIZE;
799 36050 : while (nDirEntries < nSegments)
800 0 : nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */
801 :
802 : /* fixed control info */
803 36050 : size = MAXALIGN(sizeof(HASHHDR)); /* but not HTAB, per above */
804 : /* directory */
805 36050 : size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
806 : /* segments */
807 36050 : size = add_size(size, mul_size(nSegments,
808 : MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
809 : /* elements --- allocated in groups of choose_nelem_alloc() entries */
810 36050 : elementAllocCnt = choose_nelem_alloc(entrysize);
811 36050 : nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
812 36050 : elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
813 36050 : size = add_size(size,
814 : mul_size(nElementAllocs,
815 : mul_size(elementAllocCnt, elementSize)));
816 :
817 36050 : return size;
818 : }
819 :
820 : /*
821 : * Select an appropriate directory size for a hashtable with the given
822 : * maximum number of entries.
823 : * This is only needed for hashtables in shared memory, whose directories
824 : * cannot be expanded dynamically.
825 : * NB: assumes that all hash structure parameters have default values!
826 : *
827 : * XXX this had better agree with the behavior of init_htab()...
828 : */
829 : int64
830 19382 : hash_select_dirsize(int64 num_entries)
831 : {
832 : int64 nBuckets,
833 : nSegments,
834 : nDirEntries;
835 :
836 : /* estimate number of buckets wanted */
837 19382 : nBuckets = next_pow2_int64(num_entries);
838 : /* # of segments needed for nBuckets */
839 19382 : nSegments = next_pow2_int64((nBuckets - 1) / DEF_SEGSIZE + 1);
840 : /* directory entries */
841 19382 : nDirEntries = DEF_DIRSIZE;
842 19382 : while (nDirEntries < nSegments)
843 0 : nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */
844 :
845 19382 : return nDirEntries;
846 : }
847 :
848 : /*
849 : * Compute the required initial memory allocation for a shared-memory
850 : * hashtable with the given parameters. We need space for the HASHHDR
851 : * and for the (non expansible) directory.
852 : */
853 : Size
854 19382 : hash_get_shared_size(HASHCTL *info, int flags)
855 : {
856 : Assert(flags & HASH_DIRSIZE);
857 : Assert(info->dsize == info->max_dsize);
858 19382 : return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
859 : }
860 :
861 :
862 : /********************** DESTROY ROUTINES ************************/
863 :
864 : void
865 124800 : hash_destroy(HTAB *hashp)
866 : {
867 124800 : if (hashp != NULL)
868 : {
869 : /* allocation method must be one we know how to free, too */
870 : Assert(hashp->alloc == DynaHashAlloc);
871 : /* so this hashtable must have its own context */
872 : Assert(hashp->hcxt != NULL);
873 :
874 124800 : hash_stats(__func__, hashp);
875 :
876 : /*
877 : * Free everything by destroying the hash table's memory context.
878 : */
879 124800 : MemoryContextDelete(hashp->hcxt);
880 : }
881 124800 : }
882 :
/*
 * hash_stats -- when built with HASH_STATISTICS, log the table's usage
 * counters (accesses, collisions, expansions, entry count, key size,
 * max bucket, segment count) at DEBUG4 level.  "caller" identifies the
 * reporting call site in the message; NULL is tolerated.  Without
 * HASH_STATISTICS this compiles to a no-op.
 */
void
hash_stats(const char *caller, HTAB *hashp)
{
#ifdef HASH_STATISTICS
	HASHHDR    *hctl = hashp->hctl;

	elog(DEBUG4,
		 "hash_stats: Caller: %s Table Name: \"%s\" Accesses: " UINT64_FORMAT " Collisions: " UINT64_FORMAT " Expansions: " UINT64_FORMAT " Entries: " INT64_FORMAT " Key Size: %zu Max Bucket: %u Segment Count: " INT64_FORMAT,
		 caller != NULL ? caller : "(unknown)", hashp->tabname, hctl->accesses,
		 hctl->collisions, hctl->expansions, hash_get_num_entries(hashp),
		 hctl->keysize, hctl->max_bucket, hctl->nsegs);
#endif
}
896 :
897 : /*******************************SEARCH ROUTINES *****************************/
898 :
899 :
900 : /*
901 : * get_hash_value -- exported routine to calculate a key's hash value
902 : *
903 : * We export this because for partitioned tables, callers need to compute
904 : * the partition number (from the low-order bits of the hash value) before
905 : * searching.
906 : */
907 : uint32
908 166659438 : get_hash_value(HTAB *hashp, const void *keyPtr)
909 : {
910 166659438 : return hashp->hash(keyPtr, hashp->keysize);
911 : }
912 :
913 : /* Convert a hash value to a bucket number */
914 : static inline uint32
915 387232920 : calc_bucket(HASHHDR *hctl, uint32 hash_val)
916 : {
917 : uint32 bucket;
918 :
919 387232920 : bucket = hash_val & hctl->high_mask;
920 387232920 : if (bucket > hctl->max_bucket)
921 180817250 : bucket = bucket & hctl->low_mask;
922 :
923 387232920 : return bucket;
924 : }
925 :
926 : /*
927 : * hash_search -- look up key in table and perform action
928 : * hash_search_with_hash_value -- same, with key's hash value already computed
929 : *
930 : * action is one of:
931 : * HASH_FIND: look up key in table
932 : * HASH_ENTER: look up key in table, creating entry if not present
933 : * HASH_ENTER_NULL: same, but return NULL if out of memory
934 : * HASH_REMOVE: look up key in table, remove entry if present
935 : *
936 : * Return value is a pointer to the element found/entered/removed if any,
937 : * or NULL if no match was found. (NB: in the case of the REMOVE action,
938 : * the result is a dangling pointer that shouldn't be dereferenced!)
939 : *
940 : * HASH_ENTER will normally ereport a generic "out of memory" error if
941 : * it is unable to create a new entry. The HASH_ENTER_NULL operation is
942 : * the same except it will return NULL if out of memory.
943 : *
944 : * If foundPtr isn't NULL, then *foundPtr is set true if we found an
945 : * existing entry in the table, false otherwise. This is needed in the
946 : * HASH_ENTER case, but is redundant with the return value otherwise.
947 : *
948 : * For hash_search_with_hash_value, the hashvalue parameter must have been
949 : * calculated with get_hash_value().
950 : */
951 : void *
952 235930356 : hash_search(HTAB *hashp,
953 : const void *keyPtr,
954 : HASHACTION action,
955 : bool *foundPtr)
956 : {
957 235930356 : return hash_search_with_hash_value(hashp,
958 : keyPtr,
959 235930356 : hashp->hash(keyPtr, hashp->keysize),
960 : action,
961 : foundPtr);
962 : }
963 :
/*
 * hash_search_with_hash_value -- perform the given action on the key whose
 * hash value has already been computed with get_hash_value().  See the
 * comments above hash_search for the action semantics and return value.
 */
void *
hash_search_with_hash_value(HTAB *hashp,
							const void *keyPtr,
							uint32 hashvalue,
							HASHACTION action,
							bool *foundPtr)
{
	HASHHDR    *hctl = hashp->hctl;
	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
	Size		keysize;
	HASHBUCKET	currBucket;
	HASHBUCKET *prevBucketPtr;
	HashCompareFunc match;

#ifdef HASH_STATISTICS
	hctl->accesses++;
#endif

	/*
	 * If inserting, check if it is time to split a bucket.
	 *
	 * NOTE: failure to expand table is not a fatal error, it just means we
	 * have to run at higher fill factor than we wanted.  However, if we're
	 * using the palloc allocator then it will throw error anyway on
	 * out-of-memory, so we must do this before modifying the table.
	 */
	if (action == HASH_ENTER || action == HASH_ENTER_NULL)
	{
		/*
		 * Can't split if running in partitioned mode, nor if frozen, nor if
		 * table is the subject of any active hash_seq_search scans.
		 *
		 * Checking only freeList[0].nentries is sufficient here: for a
		 * non-partitioned table all entries are counted there, and a
		 * partitioned table never splits anyway (IS_PARTITIONED bails out).
		 */
		if (hctl->freeList[0].nentries > (int64) hctl->max_bucket &&
			!IS_PARTITIONED(hctl) && !hashp->frozen &&
			!has_seq_scans(hashp))
			(void) expand_table(hashp);
	}

	/*
	 * Do the initial lookup
	 */
	(void) hash_initial_lookup(hashp, hashvalue, &prevBucketPtr);
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		/* cheap hash comparison first, full key comparison only on match */
		if (currBucket->hashvalue == hashvalue &&
			match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hctl->collisions++;
#endif
	}

	if (foundPtr)
		*foundPtr = (bool) (currBucket != NULL);

	/*
	 * OK, now what?
	 */
	switch (action)
	{
		case HASH_FIND:
			if (currBucket != NULL)
				return ELEMENTKEY(currBucket);
			return NULL;

		case HASH_REMOVE:
			if (currBucket != NULL)
			{
				/* if partitioned, must lock to touch nentries and freeList */
				if (IS_PARTITIONED(hctl))
					SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));

				/* delete the record from the appropriate nentries counter. */
				Assert(hctl->freeList[freelist_idx].nentries > 0);
				hctl->freeList[freelist_idx].nentries--;

				/* remove record from hash bucket's chain. */
				*prevBucketPtr = currBucket->link;

				/* add the record to the appropriate freelist. */
				currBucket->link = hctl->freeList[freelist_idx].freeList;
				hctl->freeList[freelist_idx].freeList = currBucket;

				if (IS_PARTITIONED(hctl))
					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

				/*
				 * better hope the caller is synchronizing access to this
				 * element, because someone else is going to reuse it the next
				 * time something is added to the table
				 */
				return ELEMENTKEY(currBucket);
			}
			return NULL;

		case HASH_ENTER:
		case HASH_ENTER_NULL:
			/* Return existing element if found, else create one */
			if (currBucket != NULL)
				return ELEMENTKEY(currBucket);

			/* disallow inserts if frozen */
			if (hashp->frozen)
				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
					 hashp->tabname);

			currBucket = get_hash_entry(hashp, freelist_idx);
			if (currBucket == NULL)
			{
				/* out of memory */
				if (action == HASH_ENTER_NULL)
					return NULL;
				/* report a generic message */
				if (hashp->isshared)
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of shared memory")));
				else
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of memory")));
			}

			/* link into hashbucket chain (prevBucketPtr ends the chain) */
			*prevBucketPtr = currBucket;
			currBucket->link = NULL;

			/* copy key into record */
			currBucket->hashvalue = hashvalue;
			hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);

			/*
			 * Caller is expected to fill the data field on return.  DO NOT
			 * insert any code that could possibly throw error here, as doing
			 * so would leave the table entry incomplete and hence corrupt the
			 * caller's data structure.
			 */

			return ELEMENTKEY(currBucket);
	}

	elog(ERROR, "unrecognized hash action code: %d", (int) action);

	return NULL;				/* keep compiler quiet */
}
1119 :
1120 : /*
1121 : * hash_update_hash_key -- change the hash key of an existing table entry
1122 : *
1123 : * This is equivalent to removing the entry, making a new entry, and copying
1124 : * over its data, except that the entry never goes to the table's freelist.
1125 : * Therefore this cannot suffer an out-of-memory failure, even if there are
1126 : * other processes operating in other partitions of the hashtable.
1127 : *
1128 : * Returns true if successful, false if the requested new hash key is already
1129 : * present. Throws error if the specified entry pointer isn't actually a
1130 : * table member.
1131 : *
1132 : * NB: currently, there is no special case for old and new hash keys being
1133 : * identical, which means we'll report false for that situation. This is
1134 : * preferable for existing uses.
1135 : *
1136 : * NB: for a partitioned hashtable, caller must hold lock on both relevant
1137 : * partitions, if the new hash key would belong to a different partition.
1138 : */
bool
hash_update_hash_key(HTAB *hashp,
					 void *existingEntry,
					 const void *newKeyPtr)
{
	/* recover the element header from the caller-visible entry pointer */
	HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
	uint32		newhashvalue;
	Size		keysize;
	uint32		bucket;
	uint32		newbucket;
	HASHBUCKET	currBucket;
	HASHBUCKET *prevBucketPtr;
	HASHBUCKET *oldPrevPtr;
	HashCompareFunc match;

#ifdef HASH_STATISTICS
	HASHHDR    *hctl = hashp->hctl;

	hctl->accesses++;
#endif

	/* disallow updates if frozen */
	if (hashp->frozen)
		elog(ERROR, "cannot update in frozen hashtable \"%s\"",
			 hashp->tabname);

	/*
	 * Lookup the existing element using its saved hash value.  We need to do
	 * this to be able to unlink it from its hash chain, but as a side benefit
	 * we can verify the validity of the passed existingEntry pointer.
	 */
	bucket = hash_initial_lookup(hashp, existingElement->hashvalue,
								 &prevBucketPtr);
	currBucket = *prevBucketPtr;

	/* walk the chain by pointer identity, not key comparison */
	while (currBucket != NULL)
	{
		if (currBucket == existingElement)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
	}

	if (currBucket == NULL)
		elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
			 hashp->tabname);

	/* remember where the old chain points at the entry, for unlinking */
	oldPrevPtr = prevBucketPtr;

	/*
	 * Now perform the equivalent of a HASH_ENTER operation to locate the hash
	 * chain we want to put the entry into.
	 */
	newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);
	newbucket = hash_initial_lookup(hashp, newhashvalue, &prevBucketPtr);
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		if (currBucket->hashvalue == newhashvalue &&
			match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hctl->collisions++;
#endif
	}

	if (currBucket != NULL)
		return false;			/* collision with an existing entry */

	currBucket = existingElement;

	/*
	 * If old and new hash values belong to the same bucket, we need not
	 * change any chain links, and indeed should not since this simplistic
	 * update will corrupt the list if currBucket is the last element.  (We
	 * cannot fall out earlier, however, since we need to scan the bucket to
	 * check for duplicate keys.)
	 */
	if (bucket != newbucket)
	{
		/* OK to remove record from old hash bucket's chain. */
		*oldPrevPtr = currBucket->link;

		/* link into new hashbucket chain (prevBucketPtr ends the chain) */
		*prevBucketPtr = currBucket;
		currBucket->link = NULL;
	}

	/* copy new key into record */
	currBucket->hashvalue = newhashvalue;
	hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);

	/* rest of record is untouched */

	return true;
}
1244 :
/*
 * Allocate a new hashtable entry if possible; return NULL if out of memory.
 * (Or, if the underlying space allocator throws error for out-of-memory,
 * we won't return at all.)
 *
 * freelist_idx selects which freelist (and, for partitioned tables, which
 * spinlock-guarded nentries counter) the entry is charged against.
 */
static HASHBUCKET
get_hash_entry(HTAB *hashp, int freelist_idx)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHBUCKET	newElement;

	for (;;)
	{
		/* if partitioned, must lock to touch nentries and freeList */
		if (IS_PARTITIONED(hctl))
			SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

		/* try to get an entry from the freelist */
		newElement = hctl->freeList[freelist_idx].freeList;

		/* NB: on success we break out still holding the spinlock */
		if (newElement != NULL)
			break;

		if (IS_PARTITIONED(hctl))
			SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

		/*
		 * No free elements in this freelist.  In a partitioned table, there
		 * might be entries in other freelists, but to reduce contention we
		 * prefer to first try to get another chunk of buckets from the main
		 * shmem allocator.  If that fails, though, we *MUST* root through all
		 * the other freelists before giving up.  There are multiple callers
		 * that assume that they can allocate every element in the initially
		 * requested table size, or that deleting an element guarantees they
		 * can insert a new element, even if shared memory is entirely full.
		 * Failing because the needed element is in a different freelist is
		 * not acceptable.
		 */
		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
		{
			int			borrow_from_idx;

			if (!IS_PARTITIONED(hctl))
				return NULL;	/* out of memory */

			/* try to borrow element from another freelist */
			borrow_from_idx = freelist_idx;
			for (;;)
			{
				/* cycle through all freelists, wrapping around */
				borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
				if (borrow_from_idx == freelist_idx)
					break;		/* examined all freelists, fail */

				SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
				newElement = hctl->freeList[borrow_from_idx].freeList;

				if (newElement != NULL)
				{
					hctl->freeList[borrow_from_idx].freeList = newElement->link;
					SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));

					/* careful: count the new element in its proper freelist */
					SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
					hctl->freeList[freelist_idx].nentries++;
					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

					return newElement;
				}

				SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
			}

			/* no elements available to borrow either, so out of memory */
			return NULL;
		}
	}

	/* remove entry from freelist, bump nentries */
	hctl->freeList[freelist_idx].freeList = newElement->link;
	hctl->freeList[freelist_idx].nentries++;

	if (IS_PARTITIONED(hctl))
		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

	return newElement;
}
1331 :
1332 : /*
1333 : * hash_get_num_entries -- get the number of entries in a hashtable
1334 : */
1335 : int64
1336 120950 : hash_get_num_entries(HTAB *hashp)
1337 : {
1338 : int i;
1339 120950 : int64 sum = hashp->hctl->freeList[0].nentries;
1340 :
1341 : /*
1342 : * We currently don't bother with acquiring the mutexes; it's only
1343 : * sensible to call this function if you've got lock on all partitions of
1344 : * the table.
1345 : */
1346 120950 : if (IS_PARTITIONED(hashp->hctl))
1347 : {
1348 117504 : for (i = 1; i < NUM_FREELISTS; i++)
1349 113832 : sum += hashp->hctl->freeList[i].nentries;
1350 : }
1351 :
1352 120950 : return sum;
1353 : }
1354 :
1355 : /*
1356 : * hash_seq_init/_search/_term
1357 : * Sequentially search through hash table and return
1358 : * all the elements one by one, return NULL when no more.
1359 : *
1360 : * hash_seq_term should be called if and only if the scan is abandoned before
1361 : * completion; if hash_seq_search returns NULL then it has already done the
1362 : * end-of-scan cleanup.
1363 : *
1364 : * NOTE: caller may delete the returned element before continuing the scan.
1365 : * However, deleting any other element while the scan is in progress is
1366 : * UNDEFINED (it might be the one that curIndex is pointing at!). Also,
1367 : * if elements are added to the table while the scan is in progress, it is
1368 : * unspecified whether they will be visited by the scan or not.
1369 : *
1370 : * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
1371 : * worry about hash_seq_term cleanup, if the hashtable is first locked against
1372 : * further insertions by calling hash_freeze.
1373 : *
1374 : * NOTE: to use this with a partitioned hashtable, caller had better hold
1375 : * at least shared lock on all partitions of the table throughout the scan!
1376 : * We can cope with insertions or deletions by our own backend, but *not*
1377 : * with concurrent insertions or deletions by another.
1378 : */
1379 : void
1380 5597888 : hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
1381 : {
1382 5597888 : status->hashp = hashp;
1383 5597888 : status->curBucket = 0;
1384 5597888 : status->curEntry = NULL;
1385 5597888 : status->hasHashvalue = false;
1386 5597888 : if (!hashp->frozen)
1387 5597888 : register_seq_scan(hashp);
1388 5597888 : }
1389 :
1390 : /*
1391 : * Same as above but scan by the given hash value.
1392 : * See also hash_seq_search().
1393 : *
1394 : * NOTE: the default hash function doesn't match syscache hash function.
1395 : * Thus, if you're going to use this function in syscache callback, make sure
1396 : * you're using custom hash function. See relatt_cache_syshash()
1397 : * for example.
1398 : */
1399 : void
1400 1571638 : hash_seq_init_with_hash_value(HASH_SEQ_STATUS *status, HTAB *hashp,
1401 : uint32 hashvalue)
1402 : {
1403 : HASHBUCKET *bucketPtr;
1404 :
1405 1571638 : hash_seq_init(status, hashp);
1406 :
1407 1571638 : status->hasHashvalue = true;
1408 1571638 : status->hashvalue = hashvalue;
1409 :
1410 1571638 : status->curBucket = hash_initial_lookup(hashp, hashvalue, &bucketPtr);
1411 1571638 : status->curEntry = *bucketPtr;
1412 1571638 : }
1413 :
/*
 * hash_seq_search -- return the next element of the scan, or NULL when the
 * scan is complete (in which case end-of-scan cleanup has been done).
 * See the usage notes above hash_seq_init.
 */
void *
hash_seq_search(HASH_SEQ_STATUS *status)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;
	uint32		max_bucket;
	int64		ssize;
	int64		segment_num;
	int64		segment_ndx;
	HASHSEGMENT segp;
	uint32		curBucket;
	HASHELEMENT *curElem;

	if (status->hasHashvalue)
	{
		/*
		 * Scan entries only in the current bucket because only this bucket
		 * can contain entries with the given hash value.
		 */
		while ((curElem = status->curEntry) != NULL)
		{
			/* advance before returning, so next call resumes past this one */
			status->curEntry = curElem->link;
			if (status->hashvalue != curElem->hashvalue)
				continue;
			return (void *) ELEMENTKEY(curElem);
		}

		hash_seq_term(status);
		return NULL;
	}

	if ((curElem = status->curEntry) != NULL)
	{
		/* Continuing scan of curBucket... */
		status->curEntry = curElem->link;
		if (status->curEntry == NULL)	/* end of this bucket */
			++status->curBucket;
		return ELEMENTKEY(curElem);
	}

	/*
	 * Search for next nonempty bucket starting at curBucket.
	 */
	curBucket = status->curBucket;
	hashp = status->hashp;
	hctl = hashp->hctl;
	ssize = hashp->ssize;
	max_bucket = hctl->max_bucket;

	if (curBucket > max_bucket)
	{
		hash_seq_term(status);
		return NULL;			/* search is done */
	}

	/*
	 * first find the right segment in the table directory.
	 */
	segment_num = curBucket >> hashp->sshift;
	segment_ndx = MOD(curBucket, ssize);

	segp = hashp->dir[segment_num];

	/*
	 * Pick up the first item in this bucket's chain.  If chain is not empty
	 * we can begin searching it.  Otherwise we have to advance to find the
	 * next nonempty bucket.  We try to optimize that case since searching a
	 * near-empty hashtable has to iterate this loop a lot.
	 */
	while ((curElem = segp[segment_ndx]) == NULL)
	{
		/* empty bucket, advance to next */
		if (++curBucket > max_bucket)
		{
			status->curBucket = curBucket;
			hash_seq_term(status);
			return NULL;		/* search is done */
		}
		/* step into the next segment when we run off the end of this one */
		if (++segment_ndx >= ssize)
		{
			segment_num++;
			segment_ndx = 0;
			segp = hashp->dir[segment_num];
		}
	}

	/* Begin scan of curBucket... */
	status->curEntry = curElem->link;
	if (status->curEntry == NULL)	/* end of this bucket */
		++curBucket;
	status->curBucket = curBucket;
	return ELEMENTKEY(curElem);
}
1507 :
1508 : void
1509 5597868 : hash_seq_term(HASH_SEQ_STATUS *status)
1510 : {
1511 5597868 : if (!status->hashp->frozen)
1512 5597868 : deregister_seq_scan(status->hashp);
1513 5597868 : }
1514 :
1515 : /*
1516 : * hash_freeze
1517 : * Freeze a hashtable against future insertions (deletions are
1518 : * still allowed)
1519 : *
1520 : * The reason for doing this is that by preventing any more bucket splits,
1521 : * we no longer need to worry about registering hash_seq_search scans,
1522 : * and thus caller need not be careful about ensuring hash_seq_term gets
1523 : * called at the right times.
1524 : *
1525 : * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
1526 : * with active scans (since hash_seq_term would then do the wrong thing).
1527 : */
void
hash_freeze(HTAB *hashp)
{
	/* shared tables can't be frozen: other backends wouldn't see the flag */
	if (hashp->isshared)
		elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
	/* active scans were registered expecting hash_seq_term cleanup */
	if (!hashp->frozen && has_seq_scans(hashp))
		elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
			 hashp->tabname);
	hashp->frozen = true;
}
1538 :
1539 :
1540 : /********************************* UTILITIES ************************/
1541 :
/*
 * Expand the table by adding one more hash bucket.
 *
 * Returns false only if a needed directory or segment allocation fails;
 * the table remains valid (just more heavily loaded) in that case.
 */
static bool
expand_table(HTAB *hashp)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT old_seg,
				new_seg;
	int64		old_bucket,
				new_bucket;
	int64		new_segnum,
				new_segndx;
	int64		old_segnum,
				old_segndx;
	HASHBUCKET *oldlink,
			   *newlink;
	HASHBUCKET	currElement,
				nextElement;

	/* partitioned tables never split buckets (see file header comments) */
	Assert(!IS_PARTITIONED(hctl));

#ifdef HASH_STATISTICS
	hctl->expansions++;
#endif

	new_bucket = hctl->max_bucket + 1;
	new_segnum = new_bucket >> hashp->sshift;
	new_segndx = MOD(new_bucket, hashp->ssize);

	if (new_segnum >= hctl->nsegs)
	{
		/* Allocate new segment if necessary -- could fail if dir full */
		if (new_segnum >= hctl->dsize)
			if (!dir_realloc(hashp))
				return false;
		if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
			return false;
		hctl->nsegs++;
	}

	/* OK, we created a new bucket */
	hctl->max_bucket++;

	/*
	 * *Before* changing masks, find old bucket corresponding to same hash
	 * values; values in that bucket may need to be relocated to new bucket.
	 * Note that new_bucket is certainly larger than low_mask at this point,
	 * so we can skip the first step of the regular hash mask calc.
	 */
	old_bucket = (new_bucket & hctl->low_mask);

	/*
	 * If we crossed a power of 2, readjust masks.
	 */
	if ((uint32) new_bucket > hctl->high_mask)
	{
		hctl->low_mask = hctl->high_mask;
		hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
	}

	/*
	 * Relocate records to the new bucket.  NOTE: because of the way the hash
	 * masking is done in calc_bucket, only one old bucket can need to be
	 * split at this point.  With a different way of reducing the hash value,
	 * that might not be true!
	 */
	old_segnum = old_bucket >> hashp->sshift;
	old_segndx = MOD(old_bucket, hashp->ssize);

	old_seg = hashp->dir[old_segnum];
	new_seg = hashp->dir[new_segnum];

	oldlink = &old_seg[old_segndx];
	newlink = &new_seg[new_segndx];

	/*
	 * Walk the old chain once, appending each element to either the rebuilt
	 * old chain or the new bucket's chain according to its recomputed bucket.
	 */
	for (currElement = *oldlink;
		 currElement != NULL;
		 currElement = nextElement)
	{
		nextElement = currElement->link;
		if ((int64) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
		{
			*oldlink = currElement;
			oldlink = &currElement->link;
		}
		else
		{
			*newlink = currElement;
			newlink = &currElement->link;
		}
	}
	/* don't forget to terminate the rebuilt hash chains... */
	*oldlink = NULL;
	*newlink = NULL;

	return true;
}
1640 :
1641 :
1642 : static bool
1643 0 : dir_realloc(HTAB *hashp)
1644 : {
1645 : HASHSEGMENT *p;
1646 : HASHSEGMENT *old_p;
1647 : int64 new_dsize;
1648 : int64 old_dirsize;
1649 : int64 new_dirsize;
1650 :
1651 0 : if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
1652 0 : return false;
1653 :
1654 : /* Reallocate directory */
1655 0 : new_dsize = hashp->hctl->dsize << 1;
1656 0 : old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
1657 0 : new_dirsize = new_dsize * sizeof(HASHSEGMENT);
1658 :
1659 0 : old_p = hashp->dir;
1660 0 : CurrentDynaHashCxt = hashp->hcxt;
1661 0 : p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);
1662 :
1663 0 : if (p != NULL)
1664 : {
1665 0 : memcpy(p, old_p, old_dirsize);
1666 0 : MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
1667 0 : hashp->dir = p;
1668 0 : hashp->hctl->dsize = new_dsize;
1669 :
1670 : /* XXX assume the allocator is palloc, so we know how to free */
1671 : Assert(hashp->alloc == DynaHashAlloc);
1672 0 : pfree(old_p);
1673 :
1674 0 : return true;
1675 : }
1676 :
1677 0 : return false;
1678 : }
1679 :
1680 :
1681 : static HASHSEGMENT
1682 1326272 : seg_alloc(HTAB *hashp)
1683 : {
1684 : HASHSEGMENT segp;
1685 :
1686 1326272 : CurrentDynaHashCxt = hashp->hcxt;
1687 1326272 : segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
1688 :
1689 1326272 : if (!segp)
1690 0 : return NULL;
1691 :
1692 1326272 : MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
1693 :
1694 1326272 : return segp;
1695 : }
1696 :
/*
 * allocate some new elements and link them into the indicated free list
 *
 * Returns false if the table is marked fixed-size or the allocation fails;
 * on success the new elements have been pushed onto freeList[freelist_idx].
 */
static bool
element_alloc(HTAB *hashp, int nelem, int freelist_idx)
{
	HASHHDR    *hctl = hashp->hctl;
	Size		elementSize;
	Size		requestSize;
	char	   *allocedBlock;
	HASHELEMENT *firstElement;
	HASHELEMENT *tmpElement;
	HASHELEMENT *prevElement;
	int			i;

	/* a fixed-size table is never allowed to grow */
	if (hctl->isfixed)
		return false;

	/* Each element has a HASHELEMENT header plus user data. */
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);

	requestSize = nelem * elementSize;

	/* Add space for slist_node list link if we need one. */
#ifdef USE_VALGRIND
	if (!hashp->isshared)
		requestSize += MAXALIGN(sizeof(slist_node));
#endif

	/* Allocate the memory. */
	CurrentDynaHashCxt = hashp->hcxt;
	allocedBlock = hashp->alloc(requestSize);

	if (!allocedBlock)
		return false;

	/*
	 * If USE_VALGRIND, each allocated block of elements of a non-shared
	 * hashtable is chained into a list, so that Valgrind won't think it's
	 * been leaked.
	 */
#ifdef USE_VALGRIND
	if (hashp->isshared)
		firstElement = (HASHELEMENT *) allocedBlock;
	else
	{
		slist_push_head(&hashp->element_blocks, (slist_node *) allocedBlock);
		firstElement = (HASHELEMENT *) (allocedBlock + MAXALIGN(sizeof(slist_node)));
	}
#else
	firstElement = (HASHELEMENT *) allocedBlock;
#endif

	/*
	 * prepare to link all the new entries into the freelist: each element's
	 * link points at the previous one, so after the loop prevElement is the
	 * head of the new chain and firstElement is its tail.
	 */
	prevElement = NULL;
	tmpElement = firstElement;
	for (i = 0; i < nelem; i++)
	{
		tmpElement->link = prevElement;
		prevElement = tmpElement;
		tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
	}

	/* if partitioned, must lock to touch freeList */
	if (IS_PARTITIONED(hctl))
		SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

	/* freelist could be nonempty if two backends did this concurrently */
	firstElement->link = hctl->freeList[freelist_idx].freeList;
	hctl->freeList[freelist_idx].freeList = prevElement;

	if (IS_PARTITIONED(hctl))
		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

	return true;
}
1773 :
1774 : /*
1775 : * Do initial lookup of a bucket for the given hash value, retrieving its
1776 : * bucket number and its hash bucket.
1777 : */
1778 : static inline uint32
1779 386173512 : hash_initial_lookup(HTAB *hashp, uint32 hashvalue, HASHBUCKET **bucketptr)
1780 : {
1781 386173512 : HASHHDR *hctl = hashp->hctl;
1782 : HASHSEGMENT segp;
1783 : int64 segment_num;
1784 : int64 segment_ndx;
1785 : uint32 bucket;
1786 :
1787 386173512 : bucket = calc_bucket(hctl, hashvalue);
1788 :
1789 386173512 : segment_num = bucket >> hashp->sshift;
1790 386173512 : segment_ndx = MOD(bucket, hashp->ssize);
1791 :
1792 386173512 : segp = hashp->dir[segment_num];
1793 :
1794 386173512 : if (segp == NULL)
1795 0 : hash_corrupted(hashp);
1796 :
1797 386173512 : *bucketptr = &segp[segment_ndx];
1798 386173512 : return bucket;
1799 : }
1800 :
1801 : /* complain when we have detected a corrupted hashtable */
1802 : static void
1803 0 : hash_corrupted(HTAB *hashp)
1804 : {
1805 : /*
1806 : * If the corruption is in a shared hashtable, we'd better force a
1807 : * systemwide restart. Otherwise, just shut down this one backend.
1808 : */
1809 0 : if (hashp->isshared)
1810 0 : elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
1811 : else
1812 0 : elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
1813 : }
1814 :
1815 : /* calculate ceil(log base 2) of num */
1816 : int
1817 1605070 : my_log2(int64 num)
1818 : {
1819 : /*
1820 : * guard against too-large input, which would be invalid for
1821 : * pg_ceil_log2_*()
1822 : */
1823 1605070 : if (num > PG_INT64_MAX / 2)
1824 0 : num = PG_INT64_MAX / 2;
1825 :
1826 1605070 : return pg_ceil_log2_64(num);
1827 : }
1828 :
1829 : /* calculate first power of 2 >= num, bounded to what will fit in a int64 */
1830 : static int64
1831 110864 : next_pow2_int64(int64 num)
1832 : {
1833 : /* my_log2's internal range check is sufficient */
1834 110864 : return 1L << my_log2(num);
1835 : }
1836 :
1837 : /* calculate first power of 2 >= num, bounded to what will fit in an int */
1838 : static int
1839 1451424 : next_pow2_int(int64 num)
1840 : {
1841 1451424 : if (num > INT_MAX / 2)
1842 0 : num = INT_MAX / 2;
1843 1451424 : return 1 << my_log2(num);
1844 : }
1845 :
1846 :
1847 : /************************* SEQ SCAN TRACKING ************************/
1848 :
1849 : /*
1850 : * We track active hash_seq_search scans here. The need for this mechanism
1851 : * comes from the fact that a scan will get confused if a bucket split occurs
1852 : * while it's in progress: it might visit entries twice, or even miss some
1853 : * entirely (if it's partway through the same bucket that splits). Hence
1854 : * we want to inhibit bucket splits if there are any active scans on the
1855 : * table being inserted into. This is a fairly rare case in current usage,
1856 : * so just postponing the split until the next insertion seems sufficient.
1857 : *
1858 : * Given present usages of the function, only a few scans are likely to be
1859 : * open concurrently; so a finite-size stack of open scans seems sufficient,
1860 : * and we don't worry that linear search is too slow. Note that we do
1861 : * allow multiple scans of the same hashtable to be open concurrently.
1862 : *
1863 : * This mechanism can support concurrent scan and insertion in a shared
1864 : * hashtable if it's the same backend doing both. It would fail otherwise,
1865 : * but locking reasons seem to preclude any such scenario anyway, so we don't
1866 : * worry.
1867 : *
1868 : * This arrangement is reasonably robust if a transient hashtable is deleted
1869 : * without notifying us. The absolute worst case is we might inhibit splits
1870 : * in another table created later at exactly the same address. We will give
1871 : * a warning at transaction end for reference leaks, so any bugs leading to
1872 : * lack of notification should be easy to catch.
1873 : */
1874 :
/* Maximum number of hash_seq_search scans that may be open at once. */
#define MAX_SEQ_SCANS 100

/*
 * Stack of active scans; entries [0 .. num_seq_scans-1] are valid.  The two
 * arrays are parallel: seq_scan_level[i] records the subtransaction nest
 * level at which the scan of seq_scan_tables[i] was started.
 */
static HTAB *seq_scan_tables[MAX_SEQ_SCANS];	/* tables being scanned */
static int	seq_scan_level[MAX_SEQ_SCANS];	/* subtransaction nest level */
static int	num_seq_scans = 0;
1880 :
1881 :
1882 : /* Register a table as having an active hash_seq_search scan */
1883 : static void
1884 5597888 : register_seq_scan(HTAB *hashp)
1885 : {
1886 5597888 : if (num_seq_scans >= MAX_SEQ_SCANS)
1887 0 : elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
1888 : hashp->tabname);
1889 5597888 : seq_scan_tables[num_seq_scans] = hashp;
1890 5597888 : seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
1891 5597888 : num_seq_scans++;
1892 5597888 : }
1893 :
1894 : /* Deregister an active scan */
1895 : static void
1896 5597868 : deregister_seq_scan(HTAB *hashp)
1897 : {
1898 : int i;
1899 :
1900 : /* Search backward since it's most likely at the stack top */
1901 5597868 : for (i = num_seq_scans - 1; i >= 0; i--)
1902 : {
1903 5597868 : if (seq_scan_tables[i] == hashp)
1904 : {
1905 5597868 : seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1906 5597868 : seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1907 5597868 : num_seq_scans--;
1908 5597868 : return;
1909 : }
1910 : }
1911 0 : elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
1912 : hashp->tabname);
1913 : }
1914 :
1915 : /* Check if a table has any active scan */
1916 : static bool
1917 756190 : has_seq_scans(HTAB *hashp)
1918 : {
1919 : int i;
1920 :
1921 756190 : for (i = 0; i < num_seq_scans; i++)
1922 : {
1923 0 : if (seq_scan_tables[i] == hashp)
1924 0 : return true;
1925 : }
1926 756190 : return false;
1927 : }
1928 :
1929 : /* Clean up any open scans at end of transaction */
1930 : void
1931 1195080 : AtEOXact_HashTables(bool isCommit)
1932 : {
1933 : /*
1934 : * During abort cleanup, open scans are expected; just silently clean 'em
1935 : * out. An open scan at commit means someone forgot a hash_seq_term()
1936 : * call, so complain.
1937 : *
1938 : * Note: it's tempting to try to print the tabname here, but refrain for
1939 : * fear of touching deallocated memory. This isn't a user-facing message
1940 : * anyway, so it needn't be pretty.
1941 : */
1942 1195080 : if (isCommit)
1943 : {
1944 : int i;
1945 :
1946 1145400 : for (i = 0; i < num_seq_scans; i++)
1947 : {
1948 0 : elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1949 : seq_scan_tables[i]);
1950 : }
1951 : }
1952 1195080 : num_seq_scans = 0;
1953 1195080 : }
1954 :
1955 : /* Clean up any open scans at end of subtransaction */
1956 : void
1957 20102 : AtEOSubXact_HashTables(bool isCommit, int nestDepth)
1958 : {
1959 : int i;
1960 :
1961 : /*
1962 : * Search backward to make cleanup easy. Note we must check all entries,
1963 : * not only those at the end of the array, because deletion technique
1964 : * doesn't keep them in order.
1965 : */
1966 20102 : for (i = num_seq_scans - 1; i >= 0; i--)
1967 : {
1968 0 : if (seq_scan_level[i] >= nestDepth)
1969 : {
1970 0 : if (isCommit)
1971 0 : elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1972 : seq_scan_tables[i]);
1973 0 : seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1974 0 : seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1975 0 : num_seq_scans--;
1976 : }
1977 : }
1978 20102 : }
|