1 : /*-------------------------------------------------------------------------
2 : *
3 : * dynahash.c
4 : * dynamic chained hash tables
5 : *
6 : * dynahash.c supports both local-to-a-backend hash tables and hash tables in
7 : * shared memory. For shared hash tables, it is the caller's responsibility
8 : * to provide appropriate access interlocking. The simplest convention is
9 : * that a single LWLock protects the whole hash table. Searches (HASH_FIND or
10 : * hash_seq_search) need only shared lock, but any update requires exclusive
11 : * lock. For heavily-used shared tables, the single-lock approach creates a
12 : * concurrency bottleneck, so we also support "partitioned" locking wherein
13 : * there are multiple LWLocks guarding distinct subsets of the table. To use
14 : * a hash table in partitioned mode, the HASH_PARTITION flag must be given
15 : * to hash_create. This prevents any attempt to split buckets on-the-fly.
16 : * Therefore, each hash bucket chain operates independently, and no fields
17 : * of the hash header change after init except nentries and freeList.
18 : * (A partitioned table uses multiple copies of those fields, guarded by
19 : * spinlocks, for additional concurrency.)
20 : * This lets any subset of the hash buckets be treated as a separately
21 : * lockable partition. We expect callers to use the low-order bits of a
22 : * lookup key's hash value as a partition number --- this will work because
23 : * of the way calc_bucket() maps hash values to bucket numbers.
24 : *
25 : * For hash tables in shared memory, the memory allocator function should
26 : * match malloc's semantics of returning NULL on failure. For hash tables
27 : * in local memory, we typically use palloc() which will throw error on
28 : * failure. The code in this file has to cope with both cases.
29 : *
30 : * dynahash.c provides support for these types of lookup keys:
31 : *
32 : * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
33 : * compared as though by strcmp(). This is selected by specifying the
34 : * HASH_STRINGS flag to hash_create.
35 : *
36 : * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
37 : * (Caller must ensure there are no undefined padding bits in the keys!)
38 : * This is selected by specifying the HASH_BLOBS flag to hash_create.
39 : *
40 : * 3. More complex key behavior can be selected by specifying user-supplied
41 : * hashing, comparison, and/or key-copying functions. At least a hashing
42 : * function must be supplied; comparison defaults to memcmp() and key copying
43 : * to memcpy() when a user-defined hashing function is selected.
44 : *
45 : * Compared to simplehash, dynahash has the following benefits:
46 : *
47 : * - It supports partitioning, which is useful for shared memory access using
48 : * locks.
49 : * - Shared memory hashes are allocated in a fixed size area at startup and
50 : * are discoverable by name from other processes.
51 : * - Because entries don't need to be moved in the case of hash conflicts,
52 : * dynahash has better performance for large entries.
53 : * - Guarantees stable pointers to entries.
54 : *
55 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
56 : * Portions Copyright (c) 1994, Regents of the University of California
57 : *
58 : *
59 : * IDENTIFICATION
60 : * src/backend/utils/hash/dynahash.c
61 : *
62 : *-------------------------------------------------------------------------
63 : */
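/*
 * Editor's illustration (a hedged sketch, not code from this file) of the
 * partitioned-locking convention described above.  Assuming a caller with a
 * hypothetical array of per-partition LWLocks ("partition_locks") and a
 * table created with HASH_PARTITION and num_partitions = 16, an update
 * might look like:
 *
 *		uint32		hashcode = get_hash_value(htab, &key);
 *		int			partition = hashcode & (16 - 1);	/* low-order bits */
 *
 *		LWLockAcquire(partition_locks[partition], LW_EXCLUSIVE);
 *		entry = hash_search_with_hash_value(htab, &key, hashcode,
 *											HASH_ENTER, &found);
 *		... modify entry ...
 *		LWLockRelease(partition_locks[partition]);
 */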
64 :
65 : /*
66 : * Original comments:
67 : *
68 : * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
69 : * Coded into C, with minor code improvements, and with hsearch(3) interface,
70 : * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
71 : * also, hcreate/hdestroy routines added to simulate hsearch(3).
72 : *
73 : * These routines simulate hsearch(3) and family, with the important
74 : * difference that the hash table is dynamic - can grow indefinitely
75 : * beyond its original size (as supplied to hcreate()).
76 : *
77 : * Performance appears to be comparable to that of hsearch(3).
78 : * The 'source-code' options referred to in hsearch(3)'s 'man' page
79 : * are not implemented; otherwise functionality is identical.
80 : *
81 : * Compilation controls:
82 : * HASH_DEBUG controls some informative traces, mainly for debugging.
83 : * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
84 : * when combined with HASH_DEBUG, these are displayed by hdestroy().
85 : *
86 : * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
87 : * concatenation property, in probably unnecessary code 'optimization'.
88 : *
89 : * Modified margo@postgres.berkeley.edu February 1990
90 : * added multiple table interface
91 : * Modified by sullivan@postgres.berkeley.edu April 1990
92 : * changed ctl structure for shared memory
93 : */
94 :
95 : #include "postgres.h"
96 :
97 : #include <limits.h>
98 :
99 : #include "access/xact.h"
100 : #include "common/hashfn.h"
101 : #include "port/pg_bitutils.h"
102 : #include "storage/shmem.h"
103 : #include "storage/spin.h"
104 : #include "utils/dynahash.h"
105 : #include "utils/memutils.h"
106 :
107 :
108 : /*
109 : * Constants
110 : *
111 : * A hash table has a top-level "directory", each of whose entries points
112 : * to a "segment" of ssize bucket headers. The maximum number of hash
113 : * buckets is thus dsize * ssize (but dsize may be expansible). Of course,
114 : * the number of records in the table can be larger, but we don't want a
115 : * whole lot of records per bucket or performance goes down.
116 : *
117 : * In a hash table allocated in shared memory, the directory cannot be
118 : * expanded because it must stay at a fixed address. The directory size
119 : * should be selected using hash_select_dirsize (and you'd better have
120 : * a good idea of the maximum number of entries!). For non-shared hash
121 : * tables, the initial directory size can be left at the default.
122 : */
123 : #define DEF_SEGSIZE 256
124 : #define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */
125 : #define DEF_DIRSIZE 256
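/*
 * Editor's note (worked example): with the defaults above, the initial
 * directory can address DEF_DIRSIZE * DEF_SEGSIZE = 256 * 256 = 65536
 * buckets; a non-shared table grows beyond that only by doubling dsize in
 * dir_realloc().
 */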
126 :
127 : /* Number of freelists to be used for a partitioned hash table. */
128 : #define NUM_FREELISTS 32
129 :
130 : /* A hash bucket is a linked list of HASHELEMENTs */
131 : typedef HASHELEMENT *HASHBUCKET;
132 :
133 : /* A hash segment is an array of bucket headers */
134 : typedef HASHBUCKET *HASHSEGMENT;
135 :
136 : /*
137 : * Per-freelist data.
138 : *
139 : * In a partitioned hash table, each freelist is associated with a specific
140 : * set of hashcodes, as determined by the FREELIST_IDX() macro below.
141 : * nentries tracks the number of live hashtable entries having those hashcodes
142 : * (NOT the number of entries in the freelist, as you might expect).
143 : *
144 : * The coverage of a freelist might be more or less than one partition, so it
145 : * needs its own lock rather than relying on caller locking. Relying on that
146 : * wouldn't work even if the coverage was the same, because of the occasional
147 : * need to "borrow" entries from another freelist; see get_hash_entry().
148 : *
149 : * Using an array of FreeListData instead of separate arrays of mutexes,
150 : * nentries and freeLists helps to reduce sharing of cache lines between
151 : * different mutexes.
152 : */
153 : typedef struct
154 : {
155 : slock_t mutex; /* spinlock for this freelist */
156 : long nentries; /* number of entries in associated buckets */
157 : HASHELEMENT *freeList; /* chain of free elements */
158 : } FreeListData;
159 :
160 : /*
161 : * Header structure for a hash table --- contains all changeable info
162 : *
163 : * In a shared-memory hash table, the HASHHDR is in shared memory, while
164 : * each backend has a local HTAB struct. For a non-shared table, there isn't
165 : * any functional difference between HASHHDR and HTAB, but we separate them
166 : * anyway to share code between shared and non-shared tables.
167 : */
168 : struct HASHHDR
169 : {
170 : /*
171 : * The freelist can become a point of contention in high-concurrency hash
172 : * tables, so we use an array of freelists, each with its own mutex and
173 : * nentries count, instead of just a single one. Although the freelists
174 : * normally operate independently, we will scavenge entries from freelists
175 : * other than a hashcode's default freelist when necessary.
176 : *
177 : * If the hash table is not partitioned, only freeList[0] is used and its
178 : * spinlock is not used at all; callers' locking is assumed sufficient.
179 : */
180 : FreeListData freeList[NUM_FREELISTS];
181 :
182 : /* These fields can change, but not in a partitioned table */
183 : /* Also, dsize can't change in a shared table, even if unpartitioned */
184 : long dsize; /* directory size */
185 : long nsegs; /* number of allocated segments (<= dsize) */
186 : uint32 max_bucket; /* ID of maximum bucket in use */
187 : uint32 high_mask; /* mask to modulo into entire table */
188 : uint32 low_mask; /* mask to modulo into lower half of table */
189 :
190 : /* These fields are fixed at hashtable creation */
191 : Size keysize; /* hash key length in bytes */
192 : Size entrysize; /* total user element size in bytes */
193 : long num_partitions; /* # partitions (must be power of 2), or 0 */
194 : long max_dsize; /* 'dsize' limit if directory is fixed size */
195 : long ssize; /* segment size --- must be power of 2 */
196 : int sshift; /* segment shift = log2(ssize) */
197 : int nelem_alloc; /* number of entries to allocate at once */
198 : bool isfixed; /* if true, don't enlarge */
199 :
200 : #ifdef HASH_STATISTICS
201 :
202 : /*
203 : * Count statistics here. NB: stats code doesn't bother with mutex, so
204 : * counts could be corrupted a bit in a partitioned table.
205 : */
206 : long accesses;
207 : long collisions;
208 : #endif
209 : };
210 :
211 : #define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
212 :
213 : #define FREELIST_IDX(hctl, hashcode) \
214 : (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
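/*
 * Editor's note (illustration): in a partitioned table, every hashcode maps
 * to a freelist via hashcode % NUM_FREELISTS, i.e. its low five bits.  That
 * granularity generally differs from the caller's partition numbering, so a
 * freelist's coverage may span several partitions or a fraction of one;
 * hence each freelist carries its own spinlock rather than relying on the
 * caller's partition locks.
 */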
215 :
216 : /*
217 : * Top control structure for a hashtable --- in a shared table, each backend
218 : * has its own copy (OK since no fields change at runtime)
219 : */
220 : struct HTAB
221 : {
222 : HASHHDR *hctl; /* => shared control information */
223 : HASHSEGMENT *dir; /* directory of segment starts */
224 : HashValueFunc hash; /* hash function */
225 : HashCompareFunc match; /* key comparison function */
226 : HashCopyFunc keycopy; /* key copying function */
227 : HashAllocFunc alloc; /* memory allocator */
228 : MemoryContext hcxt; /* memory context if default allocator used */
229 : char *tabname; /* table name (for error messages) */
230 : bool isshared; /* true if table is in shared memory */
231 :
232 : /* freezing a shared table isn't allowed, so we can keep state here */
233 : bool frozen; /* true = no more inserts allowed */
234 :
235 : /* We keep local copies of these fixed values to reduce contention */
236 : Size keysize; /* hash key length in bytes */
237 : long ssize; /* segment size --- must be power of 2 */
238 : int sshift; /* segment shift = log2(ssize) */
239 : };
240 :
241 : /*
242 : * Key (also entry) part of a HASHELEMENT
243 : */
244 : #define ELEMENTKEY(helem) (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
245 :
246 : /*
247 : * Obtain element pointer given pointer to key
248 : */
249 : #define ELEMENT_FROM_KEY(key) \
250 : ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))
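/*
 * Editor's illustration of the entry layout implied by these macros: each
 * allocated element is a HASHELEMENT header, padded to a MAXALIGN boundary,
 * followed by the caller-visible entry, which begins with the key:
 *
 *		+-------------+---------+-----------------------------+
 *		| HASHELEMENT | padding | key ... rest of user entry  |
 *		+-------------+---------+-----------------------------+
 *		^ helem                 ^ ELEMENTKEY(helem)
 */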
251 :
252 : /*
253 : * Fast MOD arithmetic, assuming that y is a power of 2 !
254 : */
255 : #define MOD(x,y) ((x) & ((y)-1))
256 :
257 : #ifdef HASH_STATISTICS
258 : static long hash_accesses,
259 : hash_collisions,
260 : hash_expansions;
261 : #endif
262 :
263 : /*
264 : * Private function prototypes
265 : */
266 : static void *DynaHashAlloc(Size size);
267 : static HASHSEGMENT seg_alloc(HTAB *hashp);
268 : static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
269 : static bool dir_realloc(HTAB *hashp);
270 : static bool expand_table(HTAB *hashp);
271 : static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
272 : static void hdefault(HTAB *hashp);
273 : static int choose_nelem_alloc(Size entrysize);
274 : static bool init_htab(HTAB *hashp, long nelem);
275 : pg_noreturn static void hash_corrupted(HTAB *hashp);
276 : static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue,
277 : HASHBUCKET **bucketptr);
278 : static long next_pow2_long(long num);
279 : static int next_pow2_int(long num);
280 : static void register_seq_scan(HTAB *hashp);
281 : static void deregister_seq_scan(HTAB *hashp);
282 : static bool has_seq_scans(HTAB *hashp);
283 :
284 :
285 : /*
286 : * memory allocation support
287 : */
288 : static MemoryContext CurrentDynaHashCxt = NULL;
289 :
290 : static void *
291 3140586 : DynaHashAlloc(Size size)
292 : {
293 : Assert(MemoryContextIsValid(CurrentDynaHashCxt));
294 3140586 : return MemoryContextAllocExtended(CurrentDynaHashCxt, size,
295 : MCXT_ALLOC_NO_OOM);
296 : }
297 :
298 :
299 : /*
300 : * HashCompareFunc for string keys
301 : *
302 : * Because we copy keys with strlcpy(), they will be truncated at keysize-1
303 : * bytes, so we can only compare that many ... hence strncmp is almost but
304 : * not quite the right thing.
305 : */
306 : static int
307 1028474 : string_compare(const char *key1, const char *key2, Size keysize)
308 : {
309 1028474 : return strncmp(key1, key2, keysize - 1);
310 : }
311 :
312 :
313 : /************************** CREATE ROUTINES **********************/
314 :
315 : /*
316 : * hash_create -- create a new dynamic hash table
317 : *
318 : * tabname: a name for the table (for debugging purposes)
319 : * nelem: maximum number of elements expected
320 : * *info: additional table parameters, as indicated by flags
321 : * flags: bitmask indicating which parameters to take from *info
322 : *
323 : * The flags value *must* include HASH_ELEM. (Formerly, this was nominally
324 : * optional, but the default keysize and entrysize values were useless.)
325 : * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS,
326 : * or HASH_FUNCTION, to define the key hashing semantics (C strings,
327 : * binary blobs, or custom, respectively). Callers specifying a custom
328 : * hash function will likely also want to use HASH_COMPARE, and perhaps
329 : * also HASH_KEYCOPY, to control key comparison and copying.
330 : * Another often-used flag is HASH_CONTEXT, to allocate the hash table
331 : * under info->hcxt rather than under TopMemoryContext; the default
332 : * behavior is only suitable for session-lifespan hash tables.
333 : * Other flags bits are special-purpose and seldom used, except for those
334 : * associated with shared-memory hash tables, for which see ShmemInitHash().
335 : *
336 : * Fields in *info are read only when the associated flags bit is set.
337 : * It is not necessary to initialize other fields of *info.
338 : * Neither tabname nor *info need persist after the hash_create() call.
339 : *
340 : * Note: It is deprecated for callers of hash_create() to explicitly specify
341 : * string_hash, tag_hash, uint32_hash, or oid_hash. Just set HASH_STRINGS or
342 : * HASH_BLOBS. Use HASH_FUNCTION only when you want something other than
343 : * one of these.
344 : *
345 : * Note: for a shared-memory hashtable, nelem needs to be a pretty good
346 : * estimate, since we can't expand the table on the fly. But an unshared
347 : * hashtable can be expanded on-the-fly, so it's better for nelem to be
348 : * on the small side and let the table grow if it's exceeded. An overly
349 : * large nelem will penalize hash_seq_search speed without buying much.
350 : */
351 : HTAB *
352 734494 : hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
353 : {
354 : HTAB *hashp;
355 : HASHHDR *hctl;
356 :
357 : /*
358 : * Hash tables now allocate space for key and data, but you have to say
359 : * how much space to allocate.
360 : */
361 : Assert(flags & HASH_ELEM);
362 : Assert(info->keysize > 0);
363 : Assert(info->entrysize >= info->keysize);
364 :
365 : /*
366 : * For shared hash tables, we have a local hash header (HTAB struct) that
367 : * we allocate in TopMemoryContext; all else is in shared memory.
368 : *
369 : * For non-shared hash tables, everything including the hash header is in
370 : * a memory context created specially for the hash table --- this makes
371 : * hash_destroy very simple. The memory context is made a child of either
372 : * a context specified by the caller, or TopMemoryContext if nothing is
373 : * specified.
374 : */
375 734494 : if (flags & HASH_SHARED_MEM)
376 : {
377 : /* Set up to allocate the hash header */
378 19382 : CurrentDynaHashCxt = TopMemoryContext;
379 : }
380 : else
381 : {
382 : /* Create the hash table's private memory context */
383 715112 : if (flags & HASH_CONTEXT)
384 415066 : CurrentDynaHashCxt = info->hcxt;
385 : else
386 300046 : CurrentDynaHashCxt = TopMemoryContext;
387 715112 : CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
388 : "dynahash",
389 : ALLOCSET_DEFAULT_SIZES);
390 : }
391 :
392 : /* Initialize the hash header, plus a copy of the table name */
393 734494 : hashp = (HTAB *) MemoryContextAlloc(CurrentDynaHashCxt,
394 734494 : sizeof(HTAB) + strlen(tabname) + 1);
395 9548422 : MemSet(hashp, 0, sizeof(HTAB));
396 :
397 734494 : hashp->tabname = (char *) (hashp + 1);
398 734494 : strcpy(hashp->tabname, tabname);
399 :
400 : /* If we have a private context, label it with hashtable's name */
401 734494 : if (!(flags & HASH_SHARED_MEM))
402 715112 : MemoryContextSetIdentifier(CurrentDynaHashCxt, hashp->tabname);
403 :
404 : /*
405 : * Select the appropriate hash function (see comments at head of file).
406 : */
407 734494 : if (flags & HASH_FUNCTION)
408 : {
409 : Assert(!(flags & (HASH_BLOBS | HASH_STRINGS)));
410 27872 : hashp->hash = info->hash;
411 : }
412 706622 : else if (flags & HASH_BLOBS)
413 : {
414 : Assert(!(flags & HASH_STRINGS));
415 : /* We can optimize hashing for common key sizes */
416 598670 : if (info->keysize == sizeof(uint32))
417 421192 : hashp->hash = uint32_hash;
418 : else
419 177478 : hashp->hash = tag_hash;
420 : }
421 : else
422 : {
423 : /*
424 : * string_hash used to be considered the default hash method, and in a
425 : * non-assert build it effectively still is. But we now consider it
426 : * an assertion error to not say HASH_STRINGS explicitly. To help
427 : * catch mistaken usage of HASH_STRINGS, we also insist on a
428 : * reasonably long string length: if the keysize is only 4 or 8 bytes,
429 : * it's almost certainly an integer or pointer not a string.
430 : */
431 : Assert(flags & HASH_STRINGS);
432 : Assert(info->keysize > 8);
433 :
434 107952 : hashp->hash = string_hash;
435 : }
436 :
437 : /*
438 : * If you don't specify a match function, it defaults to string_compare if
439 : * you used string_hash, and to memcmp otherwise.
440 : *
441 : * Note: explicitly specifying string_hash is deprecated, because this
442 : * might not work for callers in loadable modules on some platforms due to
443 : * referencing a trampoline instead of the string_hash function proper.
444 : * Specify HASH_STRINGS instead.
445 : */
446 734494 : if (flags & HASH_COMPARE)
447 14710 : hashp->match = info->match;
448 719784 : else if (hashp->hash == string_hash)
449 107952 : hashp->match = (HashCompareFunc) string_compare;
450 : else
451 611832 : hashp->match = memcmp;
452 :
453 : /*
454 : * Similarly, the key-copying function defaults to strlcpy or memcpy.
455 : */
456 734494 : if (flags & HASH_KEYCOPY)
457 0 : hashp->keycopy = info->keycopy;
458 734494 : else if (hashp->hash == string_hash)
459 : {
460 : /*
461 : * The signature of keycopy is meant for memcpy(), which returns
462 : * void*, but strlcpy() returns size_t. Since we never use the return
463 : * value of keycopy, and size_t is pretty much always the same size as
464 : * void *, this should be safe. The extra cast in the middle is to
465 : * avoid warnings from -Wcast-function-type.
466 : */
467 107952 : hashp->keycopy = (HashCopyFunc) (pg_funcptr_t) strlcpy;
468 : }
469 : else
470 626542 : hashp->keycopy = memcpy;
471 :
472 : /* And select the entry allocation function, too. */
473 734494 : if (flags & HASH_ALLOC)
474 19382 : hashp->alloc = info->alloc;
475 : else
476 715112 : hashp->alloc = DynaHashAlloc;
477 :
478 734494 : if (flags & HASH_SHARED_MEM)
479 : {
480 : /*
481 : * ctl structure and directory are preallocated for shared memory
482 : * tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
483 : * well.
484 : */
485 19382 : hashp->hctl = info->hctl;
486 19382 : hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
487 19382 : hashp->hcxt = NULL;
488 19382 : hashp->isshared = true;
489 :
490 : /* hash table already exists, we're just attaching to it */
491 19382 : if (flags & HASH_ATTACH)
492 : {
493 : /* make local copies of some heavily-used values */
494 0 : hctl = hashp->hctl;
495 0 : hashp->keysize = hctl->keysize;
496 0 : hashp->ssize = hctl->ssize;
497 0 : hashp->sshift = hctl->sshift;
498 :
499 0 : return hashp;
500 : }
501 : }
502 : else
503 : {
504 : /* setup hash table defaults */
505 715112 : hashp->hctl = NULL;
506 715112 : hashp->dir = NULL;
507 715112 : hashp->hcxt = CurrentDynaHashCxt;
508 715112 : hashp->isshared = false;
509 : }
510 :
511 734494 : if (!hashp->hctl)
512 : {
513 715112 : hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
514 715112 : if (!hashp->hctl)
515 0 : ereport(ERROR,
516 : (errcode(ERRCODE_OUT_OF_MEMORY),
517 : errmsg("out of memory")));
518 : }
519 :
520 734494 : hashp->frozen = false;
521 :
522 734494 : hdefault(hashp);
523 :
524 734494 : hctl = hashp->hctl;
525 :
526 734494 : if (flags & HASH_PARTITION)
527 : {
528 : /* Doesn't make sense to partition a local hash table */
529 : Assert(flags & HASH_SHARED_MEM);
530 :
531 : /*
532 : * The number of partitions had better be a power of 2. Also, it must
533 : * be less than INT_MAX (see init_htab()), so call the int version of
534 : * next_pow2.
535 : */
536 : Assert(info->num_partitions == next_pow2_int(info->num_partitions));
537 :
538 10760 : hctl->num_partitions = info->num_partitions;
539 : }
540 :
541 734494 : if (flags & HASH_SEGMENT)
542 : {
543 0 : hctl->ssize = info->ssize;
544 0 : hctl->sshift = my_log2(info->ssize);
545 : /* ssize had better be a power of 2 */
546 : Assert(hctl->ssize == (1L << hctl->sshift));
547 : }
548 :
549 : /*
550 : * SHM hash tables have fixed directory size passed by the caller.
551 : */
552 734494 : if (flags & HASH_DIRSIZE)
553 : {
554 19382 : hctl->max_dsize = info->max_dsize;
555 19382 : hctl->dsize = info->dsize;
556 : }
557 :
558 : /* remember the entry sizes, too */
559 734494 : hctl->keysize = info->keysize;
560 734494 : hctl->entrysize = info->entrysize;
561 :
562 : /* make local copies of heavily-used constant fields */
563 734494 : hashp->keysize = hctl->keysize;
564 734494 : hashp->ssize = hctl->ssize;
565 734494 : hashp->sshift = hctl->sshift;
566 :
567 : /* Build the hash directory structure */
568 734494 : if (!init_htab(hashp, nelem))
569 0 : elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
570 :
571 : /*
572 : * For a shared hash table, preallocate the requested number of elements.
573 : * This reduces problems with run-time out-of-shared-memory conditions.
574 : *
575 : * For a non-shared hash table, preallocate the requested number of
576 : * elements if it's less than our chosen nelem_alloc. This avoids wasting
577 : * space if the caller correctly estimates a small table size.
578 : */
579 734494 : if ((flags & HASH_SHARED_MEM) ||
580 715112 : nelem < hctl->nelem_alloc)
581 : {
582 : int i,
583 : freelist_partitions,
584 : nelem_alloc,
585 : nelem_alloc_first;
586 :
587 : /*
588 : * If hash table is partitioned, give each freelist an equal share of
589 : * the initial allocation. Otherwise only freeList[0] is used.
590 : */
591 266060 : if (IS_PARTITIONED(hashp->hctl))
592 10760 : freelist_partitions = NUM_FREELISTS;
593 : else
594 255300 : freelist_partitions = 1;
595 :
596 266060 : nelem_alloc = nelem / freelist_partitions;
597 266060 : if (nelem_alloc <= 0)
598 0 : nelem_alloc = 1;
599 :
600 : /*
601 : * Make sure we'll allocate all the requested elements; freeList[0]
602 : * gets the excess if the request isn't divisible by NUM_FREELISTS.
603 : */
604 266060 : if (nelem_alloc * freelist_partitions < nelem)
605 106 : nelem_alloc_first =
606 106 : nelem - nelem_alloc * (freelist_partitions - 1);
607 : else
608 265954 : nelem_alloc_first = nelem_alloc;
609 :
610 865680 : for (i = 0; i < freelist_partitions; i++)
611 : {
612 599620 : int temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
613 :
614 599620 : if (!element_alloc(hashp, temp, i))
615 0 : ereport(ERROR,
616 : (errcode(ERRCODE_OUT_OF_MEMORY),
617 : errmsg("out of memory")));
618 : }
619 : }
620 :
621 : /* Set isfixed if requested, but not till after we build initial entries */
622 734494 : if (flags & HASH_FIXED_SIZE)
623 6456 : hctl->isfixed = true;
624 :
625 734494 : return hashp;
626 : }
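/*
 * Editor's illustration (a hedged sketch, not code from this file): typical
 * creation of a backend-local table with binary keys.  "MyEntry" and the
 * context choice are hypothetical.
 *
 *		HASHCTL		info;
 *		HTAB	   *htab;
 *
 *		info.keysize = sizeof(Oid);
 *		info.entrysize = sizeof(MyEntry);	/* entry begins with the Oid key */
 *		info.hcxt = CurrentMemoryContext;
 *		htab = hash_create("My Oid lookup table", 128, &info,
 *						   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
 */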
627 :
628 : /*
629 : * Set default HASHHDR parameters.
630 : */
631 : static void
632 734494 : hdefault(HTAB *hashp)
633 : {
634 734494 : HASHHDR *hctl = hashp->hctl;
635 :
636 79325352 : MemSet(hctl, 0, sizeof(HASHHDR));
637 :
638 734494 : hctl->dsize = DEF_DIRSIZE;
639 734494 : hctl->nsegs = 0;
640 :
641 734494 : hctl->num_partitions = 0; /* not partitioned */
642 :
643 : /* table has no fixed maximum size */
644 734494 : hctl->max_dsize = NO_MAX_DSIZE;
645 :
646 734494 : hctl->ssize = DEF_SEGSIZE;
647 734494 : hctl->sshift = DEF_SEGSIZE_SHIFT;
648 :
649 734494 : hctl->isfixed = false; /* can be enlarged */
650 :
651 : #ifdef HASH_STATISTICS
652 : hctl->accesses = hctl->collisions = 0;
653 : #endif
654 734494 : }
655 :
656 : /*
657 : * Given the user-specified entry size, choose nelem_alloc, i.e., how many
658 : * elements to add to the hash table when we need more.
659 : */
660 : static int
661 770490 : choose_nelem_alloc(Size entrysize)
662 : {
663 : int nelem_alloc;
664 : Size elementSize;
665 : Size allocSize;
666 :
667 : /* Each element has a HASHELEMENT header plus user data. */
668 : /* NB: this had better match element_alloc() */
669 770490 : elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
670 :
671 : /*
672 : * The idea here is to choose nelem_alloc at least 32, but round up so
673 : * that the allocation request will be a power of 2 or just less. This
674 : * makes little difference for hash tables in shared memory, but for hash
675 : * tables managed by palloc, the allocation request will be rounded up to
676 : * a power of 2 anyway. If we fail to take this into account, we'll waste
677 : * as much as half the allocated space.
678 : */
679 770490 : allocSize = 32 * 4; /* assume elementSize at least 8 */
680 : do
681 : {
682 2952782 : allocSize <<= 1;
683 2952782 : nelem_alloc = allocSize / elementSize;
684 2952782 : } while (nelem_alloc < 32);
685 :
686 770490 : return nelem_alloc;
687 : }
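/*
 * Editor's worked example (assuming 64-bit MAXALIGN = 8 and
 * sizeof(HASHELEMENT) = 16): for entrysize = 40, elementSize = 16 + 40 = 56.
 * The loop doubles allocSize 128 -> 256 -> 512 -> 1024 -> 2048, stopping at
 * nelem_alloc = 2048 / 56 = 36, the first value >= 32; 36 * 56 = 2016 bytes,
 * just under the power-of-2 request that palloc would round up to anyway.
 */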
688 :
689 : /*
690 : * Compute derived fields of hctl and build the initial directory/segment
691 : * arrays
692 : */
693 : static bool
694 734494 : init_htab(HTAB *hashp, long nelem)
695 : {
696 734494 : HASHHDR *hctl = hashp->hctl;
697 : HASHSEGMENT *segp;
698 : int nbuckets;
699 : int nsegs;
700 : int i;
701 :
702 : /*
703 : * initialize mutexes if it's a partitioned table
704 : */
705 734494 : if (IS_PARTITIONED(hctl))
706 355080 : for (i = 0; i < NUM_FREELISTS; i++)
707 344320 : SpinLockInit(&(hctl->freeList[i].mutex));
708 :
709 : /*
710 : * Allocate space for the next greater power of two number of buckets,
711 : * assuming a desired maximum load factor of 1.
712 : */
713 734494 : nbuckets = next_pow2_int(nelem);
714 :
715 : /*
716 : * In a partitioned table, nbuckets must be at least equal to
717 : * num_partitions; were it less, keys with apparently different partition
718 : * numbers would map to the same bucket, breaking partition independence.
719 : * (Normally nbuckets will be much bigger; this is just a safety check.)
720 : */
721 734494 : while (nbuckets < hctl->num_partitions)
722 0 : nbuckets <<= 1;
723 :
724 734494 : hctl->max_bucket = hctl->low_mask = nbuckets - 1;
725 734494 : hctl->high_mask = (nbuckets << 1) - 1;
726 :
727 : /*
728 : * Figure number of directory segments needed, round up to a power of 2
729 : */
730 734494 : nsegs = (nbuckets - 1) / hctl->ssize + 1;
731 734494 : nsegs = next_pow2_int(nsegs);
732 :
733 : /*
734 : * Make sure directory is big enough. If pre-allocated directory is too
735 : * small, choke (caller screwed up).
736 : */
737 734494 : if (nsegs > hctl->dsize)
738 : {
739 0 : if (!(hashp->dir))
740 0 : hctl->dsize = nsegs;
741 : else
742 0 : return false;
743 : }
744 :
745 : /* Allocate a directory */
746 734494 : if (!(hashp->dir))
747 : {
748 715112 : CurrentDynaHashCxt = hashp->hcxt;
749 715112 : hashp->dir = (HASHSEGMENT *)
750 715112 : hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
751 715112 : if (!hashp->dir)
752 0 : return false;
753 : }
754 :
755 : /* Allocate initial segments */
756 2068414 : for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
757 : {
758 1333920 : *segp = seg_alloc(hashp);
759 1333920 : if (*segp == NULL)
760 0 : return false;
761 : }
762 :
763 : /* Choose number of entries to allocate at a time */
764 734494 : hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
765 :
766 : #ifdef HASH_DEBUG
767 : fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
768 : "TABLE POINTER ", hashp,
769 : "DIRECTORY SIZE ", hctl->dsize,
770 : "SEGMENT SIZE ", hctl->ssize,
771 : "SEGMENT SHIFT ", hctl->sshift,
772 : "MAX BUCKET ", hctl->max_bucket,
773 : "HIGH MASK ", hctl->high_mask,
774 : "LOW MASK ", hctl->low_mask,
775 : "NSEGS ", hctl->nsegs);
776 : #endif
777 734494 : return true;
778 : }
779 :
780 : /*
781 : * Estimate the space needed for a hashtable containing the given number
782 : * of entries of given size.
783 : * NOTE: this is used to estimate the footprint of hashtables in shared
784 : * memory; therefore it does not count HTAB which is in local memory.
785 : * NB: assumes that all hash structure parameters have default values!
786 : */
787 : Size
788 35996 : hash_estimate_size(long num_entries, Size entrysize)
789 : {
790 : Size size;
791 : long nBuckets,
792 : nSegments,
793 : nDirEntries,
794 : nElementAllocs,
795 : elementSize,
796 : elementAllocCnt;
797 :
798 : /* estimate number of buckets wanted */
799 35996 : nBuckets = next_pow2_long(num_entries);
800 : /* # of segments needed for nBuckets */
801 35996 : nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
802 : /* directory entries */
803 35996 : nDirEntries = DEF_DIRSIZE;
804 35996 : while (nDirEntries < nSegments)
805 0 : nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */
806 :
807 : /* fixed control info */
808 35996 : size = MAXALIGN(sizeof(HASHHDR)); /* but not HTAB, per above */
809 : /* directory */
810 35996 : size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
811 : /* segments */
812 35996 : size = add_size(size, mul_size(nSegments,
813 : MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
814 : /* elements --- allocated in groups of choose_nelem_alloc() entries */
815 35996 : elementAllocCnt = choose_nelem_alloc(entrysize);
816 35996 : nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
817 35996 : elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
818 35996 : size = add_size(size,
819 : mul_size(nElementAllocs,
820 : mul_size(elementAllocCnt, elementSize)));
821 :
822 35996 : return size;
823 : }
824 :
825 : /*
826 : * Select an appropriate directory size for a hashtable with the given
827 : * maximum number of entries.
828 : * This is only needed for hashtables in shared memory, whose directories
829 : * cannot be expanded dynamically.
830 : * NB: assumes that all hash structure parameters have default values!
831 : *
832 : * XXX this had better agree with the behavior of init_htab()...
833 : */
834 : long
835 19382 : hash_select_dirsize(long num_entries)
836 : {
837 : long nBuckets,
838 : nSegments,
839 : nDirEntries;
840 :
841 : /* estimate number of buckets wanted */
842 19382 : nBuckets = next_pow2_long(num_entries);
843 : /* # of segments needed for nBuckets */
844 19382 : nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
845 : /* directory entries */
846 19382 : nDirEntries = DEF_DIRSIZE;
847 19382 : while (nDirEntries < nSegments)
848 0 : nDirEntries <<= 1; /* dir_alloc doubles dsize at each call */
849 :
850 19382 : return nDirEntries;
851 : }
852 :
853 : /*
854 : * Compute the required initial memory allocation for a shared-memory
855 : * hashtable with the given parameters. We need space for the HASHHDR
856 : * and for the (non-expansible) directory.
857 : */
858 : Size
859 19382 : hash_get_shared_size(HASHCTL *info, int flags)
860 : {
861 : Assert(flags & HASH_DIRSIZE);
862 : Assert(info->dsize == info->max_dsize);
863 19382 : return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
864 : }
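/*
 * Editor's illustration (a hedged sketch): shared tables are normally built
 * via ShmemInitHash(), which sizes the directory with hash_select_dirsize()
 * and reserves space per hash_get_shared_size(), roughly:
 *
 *		HASHCTL		info;
 *
 *		info.keysize = sizeof(MyKey);		/* MyKey/MyEntry hypothetical */
 *		info.entrysize = sizeof(MyEntry);
 *		info.num_partitions = 16;
 *		htab = ShmemInitHash("my shared table", init_size, max_size,
 *							 &info, HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
 */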
865 :
866 :
867 : /********************** DESTROY ROUTINES ************************/
868 :
869 : void
870 136440 : hash_destroy(HTAB *hashp)
871 : {
872 136440 : if (hashp != NULL)
873 : {
874 : /* allocation method must be one we know how to free, too */
875 : Assert(hashp->alloc == DynaHashAlloc);
876 : /* so this hashtable must have its own context */
877 : Assert(hashp->hcxt != NULL);
878 :
879 136440 : hash_stats("destroy", hashp);
880 :
881 : /*
882 : * Free everything by destroying the hash table's memory context.
883 : */
884 136440 : MemoryContextDelete(hashp->hcxt);
885 : }
886 136440 : }
887 :
888 : void
889 136440 : hash_stats(const char *where, HTAB *hashp)
890 : {
891 : #ifdef HASH_STATISTICS
892 : fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
893 : where, hashp->hctl->accesses, hashp->hctl->collisions);
894 :
895 : fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
896 : hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
897 : hashp->hctl->max_bucket, hashp->hctl->nsegs);
898 : fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
899 : where, hash_accesses, hash_collisions);
900 : fprintf(stderr, "hash_stats: total expansions %ld\n",
901 : hash_expansions);
902 : #endif
903 136440 : }
904 :
905 : /***************************** SEARCH ROUTINES *****************************/
906 :
907 :
908 : /*
909 : * get_hash_value -- exported routine to calculate a key's hash value
910 : *
911 : * We export this because for partitioned tables, callers need to compute
912 : * the partition number (from the low-order bits of the hash value) before
913 : * searching.
914 : */
915 : uint32
916 163316064 : get_hash_value(HTAB *hashp, const void *keyPtr)
917 : {
918 163316064 : return hashp->hash(keyPtr, hashp->keysize);
919 : }
920 :
921 : /* Convert a hash value to a bucket number */
922 : static inline uint32
923 379315904 : calc_bucket(HASHHDR *hctl, uint32 hash_val)
924 : {
925 : uint32 bucket;
926 :
927 379315904 : bucket = hash_val & hctl->high_mask;
928 379315904 : if (bucket > hctl->max_bucket)
929 180291464 : bucket = bucket & hctl->low_mask;
930 :
931 379315904 : return bucket;
932 : }
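/*
 * Editor's worked example: suppose max_bucket = 10, so low_mask = 7 and
 * high_mask = 15.  A hash value of 13 gives 13 & 15 = 13, which exceeds
 * max_bucket, so it is reduced to 13 & 7 = 5; a hash value of 9 gives
 * 9 & 15 = 9 <= 10 and is used directly.  This is the classic linear-hashing
 * address calculation from the Larson paper cited at the top of the file.
 */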
933 :
934 : /*
935 : * hash_search -- look up key in table and perform action
936 : * hash_search_with_hash_value -- same, with key's hash value already computed
937 : *
938 : * action is one of:
939 : * HASH_FIND: look up key in table
940 : * HASH_ENTER: look up key in table, creating entry if not present
941 : * HASH_ENTER_NULL: same, but return NULL if out of memory
942 : * HASH_REMOVE: look up key in table, remove entry if present
943 : *
944 : * Return value is a pointer to the element found/entered/removed if any,
945 : * or NULL if no match was found. (NB: in the case of the REMOVE action,
946 : * the result is a dangling pointer that shouldn't be dereferenced!)
947 : *
948 : * HASH_ENTER will normally ereport a generic "out of memory" error if
949 : * it is unable to create a new entry. The HASH_ENTER_NULL operation is
950 : * the same except it will return NULL if out of memory.
951 : *
952 : * If foundPtr isn't NULL, then *foundPtr is set true if we found an
953 : * existing entry in the table, false otherwise. This is needed in the
954 : * HASH_ENTER case, but is redundant with the return value otherwise.
955 : *
956 : * For hash_search_with_hash_value, the hashvalue parameter must have been
957 : * calculated with get_hash_value().
958 : */
959 : void *
960 230313916 : hash_search(HTAB *hashp,
961 : const void *keyPtr,
962 : HASHACTION action,
963 : bool *foundPtr)
964 : {
965 230313916 : return hash_search_with_hash_value(hashp,
966 : keyPtr,
967 230313916 : hashp->hash(keyPtr, hashp->keysize),
968 : action,
969 : foundPtr);
970 : }
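/*
 * Editor's illustration (a hedged sketch; "MyEntry" and "key" are
 * hypothetical): the usual HASH_ENTER pattern.  Only the key is filled in
 * when an entry is created; the caller initializes the rest.
 *
 *		bool		found;
 *		MyEntry    *entry;
 *
 *		entry = (MyEntry *) hash_search(htab, &key, HASH_ENTER, &found);
 *		if (!found)
 *			entry->count = 0;	/* fresh entry: initialize non-key fields */
 *		entry->count++;
 */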
971 :
972 : void *
973 376635166 : hash_search_with_hash_value(HTAB *hashp,
974 : const void *keyPtr,
975 : uint32 hashvalue,
976 : HASHACTION action,
977 : bool *foundPtr)
978 : {
979 376635166 : HASHHDR *hctl = hashp->hctl;
980 376635166 : int freelist_idx = FREELIST_IDX(hctl, hashvalue);
981 : Size keysize;
982 : HASHBUCKET currBucket;
983 : HASHBUCKET *prevBucketPtr;
984 : HashCompareFunc match;
985 :
986 : #ifdef HASH_STATISTICS
987 : hash_accesses++;
988 : hctl->accesses++;
989 : #endif
990 :
991 : /*
992 : * If inserting, check if it is time to split a bucket.
993 : *
994 : * NOTE: failure to expand table is not a fatal error, it just means we
995 : * have to run at higher fill factor than we wanted. However, if we're
996 : * using the palloc allocator then it will throw error anyway on
997 : * out-of-memory, so we must do this before modifying the table.
998 : */
999 376635166 : if (action == HASH_ENTER || action == HASH_ENTER_NULL)
1000 : {
1001 : /*
1002 : * Can't split if running in partitioned mode, nor if frozen, nor if
1003 : * table is the subject of any active hash_seq_search scans.
1004 : */
1005 100848874 : if (hctl->freeList[0].nentries > (long) hctl->max_bucket &&
1006 781220 : !IS_PARTITIONED(hctl) && !hashp->frozen &&
1007 781220 : !has_seq_scans(hashp))
1008 781220 : (void) expand_table(hashp);
1009 : }
1010 :
1011 : /*
1012 : * Do the initial lookup
1013 : */
1014 376635166 : (void) hash_initial_lookup(hashp, hashvalue, &prevBucketPtr);
1015 376635166 : currBucket = *prevBucketPtr;
1016 :
1017 : /*
1018 : * Follow collision chain looking for matching key
1019 : */
1020 376635166 : match = hashp->match; /* save one fetch in inner loop */
1021 376635166 : keysize = hashp->keysize; /* ditto */
1022 :
1023 462629360 : while (currBucket != NULL)
1024 : {
1025 688188976 : if (currBucket->hashvalue == hashvalue &&
1026 301100148 : match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
1027 301094634 : break;
1028 85994194 : prevBucketPtr = &(currBucket->link);
1029 85994194 : currBucket = *prevBucketPtr;
1030 : #ifdef HASH_STATISTICS
1031 : hash_collisions++;
1032 : hctl->collisions++;
1033 : #endif
1034 : }
1035 :
1036 376635166 : if (foundPtr)
1037 103650848 : *foundPtr = (bool) (currBucket != NULL);
1038 :
1039 : /*
1040 : * OK, now what?
1041 : */
1042 376635166 : switch (action)
1043 : {
1044 223519104 : case HASH_FIND:
1045 223519104 : if (currBucket != NULL)
1046 210947768 : return ELEMENTKEY(currBucket);
1047 12571336 : return NULL;
1048 :
1049 52267188 : case HASH_REMOVE:
1050 52267188 : if (currBucket != NULL)
1051 : {
1052 : /* if partitioned, must lock to touch nentries and freeList */
1053 52257760 : if (IS_PARTITIONED(hctl))
1054 11346494 : SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
1055 :
1056 : /* delete the record from the appropriate nentries counter. */
1057 : Assert(hctl->freeList[freelist_idx].nentries > 0);
1058 52257760 : hctl->freeList[freelist_idx].nentries--;
1059 :
1060 : /* remove record from hash bucket's chain. */
1061 52257760 : *prevBucketPtr = currBucket->link;
1062 :
1063 : /* add the record to the appropriate freelist. */
1064 52257760 : currBucket->link = hctl->freeList[freelist_idx].freeList;
1065 52257760 : hctl->freeList[freelist_idx].freeList = currBucket;
1066 :
1067 52257760 : if (IS_PARTITIONED(hctl))
1068 11346494 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1069 :
1070 : /*
1071 : * better hope the caller is synchronizing access to this
1072 : * element, because someone else is going to reuse it the next
1073 : * time something is added to the table
1074 : */
1075 52257760 : return ELEMENTKEY(currBucket);
1076 : }
1077 9428 : return NULL;
1078 :
1079 100848874 : case HASH_ENTER:
1080 : case HASH_ENTER_NULL:
1081 : /* Return existing element if found, else create one */
1082 100848874 : if (currBucket != NULL)
1083 37889106 : return ELEMENTKEY(currBucket);
1084 :
1085 : /* disallow inserts if frozen */
1086 62959768 : if (hashp->frozen)
1087 0 : elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
1088 : hashp->tabname);
1089 :
1090 62959768 : currBucket = get_hash_entry(hashp, freelist_idx);
1091 62959768 : if (currBucket == NULL)
1092 : {
1093 : /* out of memory */
1094 0 : if (action == HASH_ENTER_NULL)
1095 0 : return NULL;
1096 : /* report a generic message */
1097 0 : if (hashp->isshared)
1098 0 : ereport(ERROR,
1099 : (errcode(ERRCODE_OUT_OF_MEMORY),
1100 : errmsg("out of shared memory")));
1101 : else
1102 0 : ereport(ERROR,
1103 : (errcode(ERRCODE_OUT_OF_MEMORY),
1104 : errmsg("out of memory")));
1105 : }
1106 :
1107 : /* link into hashbucket chain */
1108 62959768 : *prevBucketPtr = currBucket;
1109 62959768 : currBucket->link = NULL;
1110 :
1111 : /* copy key into record */
1112 62959768 : currBucket->hashvalue = hashvalue;
1113 62959768 : hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);
1114 :
1115 : /*
1116 : * Caller is expected to fill the data field on return. DO NOT
1117 : * insert any code that could possibly throw error here, as doing
1118 : * so would leave the table entry incomplete and hence corrupt the
1119 : * caller's data structure.
1120 : */
1121 :
1122 62959768 : return ELEMENTKEY(currBucket);
1123 : }
1124 :
1125 0 : elog(ERROR, "unrecognized hash action code: %d", (int) action);
1126 :
1127 : return NULL; /* keep compiler quiet */
1128 : }
1129 :
1130 : /*
1131 : * hash_update_hash_key -- change the hash key of an existing table entry
1132 : *
1133 : * This is equivalent to removing the entry, making a new entry, and copying
1134 : * over its data, except that the entry never goes to the table's freelist.
1135 : * Therefore this cannot suffer an out-of-memory failure, even if there are
1136 : * other processes operating in other partitions of the hashtable.
1137 : *
1138 : * Returns true if successful, false if the requested new hash key is already
1139 : * present. Throws error if the specified entry pointer isn't actually a
1140 : * table member.
1141 : *
1142 : * NB: currently, there is no special case for old and new hash keys being
1143 : * identical, which means we'll report false for that situation. This is
1144 : * preferable for existing uses.
1145 : *
1146 : * NB: for a partitioned hashtable, caller must hold lock on both relevant
1147 : * partitions, if the new hash key would belong to a different partition.
1148 : */
1149 : bool
1150 1430 : hash_update_hash_key(HTAB *hashp,
1151 : void *existingEntry,
1152 : const void *newKeyPtr)
1153 : {
1154 1430 : HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
1155 : uint32 newhashvalue;
1156 : Size keysize;
1157 : uint32 bucket;
1158 : uint32 newbucket;
1159 : HASHBUCKET currBucket;
1160 : HASHBUCKET *prevBucketPtr;
1161 : HASHBUCKET *oldPrevPtr;
1162 : HashCompareFunc match;
1163 :
1164 : #ifdef HASH_STATISTICS
1165 : hash_accesses++;
1166 : hctl->accesses++;
1167 : #endif
1168 :
1169 : /* disallow updates if frozen */
1170 1430 : if (hashp->frozen)
1171 0 : elog(ERROR, "cannot update in frozen hashtable \"%s\"",
1172 : hashp->tabname);
1173 :
1174 : /*
1175 : * Lookup the existing element using its saved hash value. We need to do
1176 : * this to be able to unlink it from its hash chain, but as a side benefit
1177 : * we can verify the validity of the passed existingEntry pointer.
1178 : */
1179 1430 : bucket = hash_initial_lookup(hashp, existingElement->hashvalue,
1180 : &prevBucketPtr);
1181 1430 : currBucket = *prevBucketPtr;
1182 :
1183 1436 : while (currBucket != NULL)
1184 : {
1185 1436 : if (currBucket == existingElement)
1186 1430 : break;
1187 6 : prevBucketPtr = &(currBucket->link);
1188 6 : currBucket = *prevBucketPtr;
1189 : }
1190 :
1191 1430 : if (currBucket == NULL)
1192 0 : elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
1193 : hashp->tabname);
1194 :
1195 1430 : oldPrevPtr = prevBucketPtr;
1196 :
1197 : /*
1198 : * Now perform the equivalent of a HASH_ENTER operation to locate the hash
1199 : * chain we want to put the entry into.
1200 : */
1201 1430 : newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);
1202 1430 : newbucket = hash_initial_lookup(hashp, newhashvalue, &prevBucketPtr);
1203 1430 : currBucket = *prevBucketPtr;
1204 :
1205 : /*
1206 : * Follow collision chain looking for matching key
1207 : */
1208 1430 : match = hashp->match; /* save one fetch in inner loop */
1209 1430 : keysize = hashp->keysize; /* ditto */
1210 :
1211 1662 : while (currBucket != NULL)
1212 : {
1213 232 : if (currBucket->hashvalue == newhashvalue &&
1214 0 : match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
1215 0 : break;
1216 232 : prevBucketPtr = &(currBucket->link);
1217 232 : currBucket = *prevBucketPtr;
1218 : #ifdef HASH_STATISTICS
1219 : hash_collisions++;
1220 : hctl->collisions++;
1221 : #endif
1222 : }
1223 :
1224 1430 : if (currBucket != NULL)
1225 0 : return false; /* collision with an existing entry */
1226 :
1227 1430 : currBucket = existingElement;
1228 :
1229 : /*
1230 : * If old and new hash values belong to the same bucket, we need not
1231 : * change any chain links, and indeed should not since this simplistic
1232 : * update will corrupt the list if currBucket is the last element. (We
1233 : * cannot fall out earlier, however, since we need to scan the bucket to
1234 : * check for duplicate keys.)
1235 : */
1236 1430 : if (bucket != newbucket)
1237 : {
1238 : /* OK to remove record from old hash bucket's chain. */
1239 1202 : *oldPrevPtr = currBucket->link;
1240 :
1241 : /* link into new hashbucket chain */
1242 1202 : *prevBucketPtr = currBucket;
1243 1202 : currBucket->link = NULL;
1244 : }
1245 :
1246 : /* copy new key into record */
1247 1430 : currBucket->hashvalue = newhashvalue;
1248 1430 : hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);
1249 :
1250 : /* rest of record is untouched */
1251 :
1252 1430 : return true;
1253 : }
1254 :
1255 : /*
1256 : * Allocate a new hashtable entry if possible; return NULL if out of memory.
1257 : * (Or, if the underlying space allocator throws error for out-of-memory,
1258 : * we won't return at all.)
1259 : */
1260 : static HASHBUCKET
1261 62959768 : get_hash_entry(HTAB *hashp, int freelist_idx)
1262 : {
1263 62959768 : HASHHDR *hctl = hashp->hctl;
1264 : HASHBUCKET newElement;
1265 :
1266 : for (;;)
1267 : {
1268 : /* if partitioned, must lock to touch nentries and freeList */
1269 63613894 : if (IS_PARTITIONED(hctl))
1270 12591938 : SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1271 :
1272 : /* try to get an entry from the freelist */
1273 63613894 : newElement = hctl->freeList[freelist_idx].freeList;
1274 :
1275 63613894 : if (newElement != NULL)
1276 62959768 : break;
1277 :
1278 654126 : if (IS_PARTITIONED(hctl))
1279 3140 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1280 :
1281 : /*
1282 : * No free elements in this freelist. In a partitioned table, there
1283 : * might be entries in other freelists, but to reduce contention we
1284 : * prefer to first try to get another chunk of buckets from the main
1285 : * shmem allocator. If that fails, though, we *MUST* root through all
1286 : * the other freelists before giving up. There are multiple callers
1287 : * that assume that they can allocate every element in the initially
1288 : * requested table size, or that deleting an element guarantees they
1289 : * can insert a new element, even if shared memory is entirely full.
1290 : * Failing because the needed element is in a different freelist is
1291 : * not acceptable.
1292 : */
1293 654126 : if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
1294 : {
1295 : int borrow_from_idx;
1296 :
1297 0 : if (!IS_PARTITIONED(hctl))
1298 0 : return NULL; /* out of memory */
1299 :
1300 : /* try to borrow element from another freelist */
1301 0 : borrow_from_idx = freelist_idx;
1302 : for (;;)
1303 : {
1304 0 : borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
1305 0 : if (borrow_from_idx == freelist_idx)
1306 0 : break; /* examined all freelists, fail */
1307 :
1308 0 : SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
1309 0 : newElement = hctl->freeList[borrow_from_idx].freeList;
1310 :
1311 0 : if (newElement != NULL)
1312 : {
1313 0 : hctl->freeList[borrow_from_idx].freeList = newElement->link;
1314 0 : SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
1315 :
1316 : /* careful: count the new element in its proper freelist */
1317 0 : SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1318 0 : hctl->freeList[freelist_idx].nentries++;
1319 0 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1320 :
1321 0 : return newElement;
1322 : }
1323 :
1324 0 : SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
1325 : }
1326 :
1327 : /* no elements available to borrow either, so out of memory */
1328 0 : return NULL;
1329 : }
1330 : }
1331 :
1332 : /* remove entry from freelist, bump nentries */
1333 62959768 : hctl->freeList[freelist_idx].freeList = newElement->link;
1334 62959768 : hctl->freeList[freelist_idx].nentries++;
1335 :
1336 62959768 : if (IS_PARTITIONED(hctl))
1337 12588798 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1338 :
1339 62959768 : return newElement;
1340 : }
1341 :
1342 : /*
1343 : * hash_get_num_entries -- get the number of entries in a hashtable
1344 : */
1345 : long
1346 120606 : hash_get_num_entries(HTAB *hashp)
1347 : {
1348 : int i;
1349 120606 : long sum = hashp->hctl->freeList[0].nentries;
1350 :
1351 : /*
1352 : * We currently don't bother with acquiring the mutexes; it's only
1353 : * sensible to call this function if you've got lock on all partitions of
1354 : * the table.
1355 : */
1356 120606 : if (IS_PARTITIONED(hashp->hctl))
1357 : {
1358 124736 : for (i = 1; i < NUM_FREELISTS; i++)
1359 120838 : sum += hashp->hctl->freeList[i].nentries;
1360 : }
1361 :
1362 120606 : return sum;
1363 : }
1364 :
1365 : /*
1366 : * hash_seq_init/_search/_term
1367 : * Sequentially search through hash table and return
1368 : * all the elements one by one, return NULL when no more.
1369 : *
1370 : * hash_seq_term should be called if and only if the scan is abandoned before
1371 : * completion; if hash_seq_search returns NULL then it has already done the
1372 : * end-of-scan cleanup.
1373 : *
1374 : * NOTE: caller may delete the returned element before continuing the scan.
1375 : * However, deleting any other element while the scan is in progress is
1376 : * UNDEFINED (it might be the one that curIndex is pointing at!). Also,
1377 : * if elements are added to the table while the scan is in progress, it is
1378 : * unspecified whether they will be visited by the scan or not.
1379 : *
1380 : * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
1381 : * worry about hash_seq_term cleanup, if the hashtable is first locked against
1382 : * further insertions by calling hash_freeze.
1383 : *
1384 : * NOTE: to use this with a partitioned hashtable, caller had better hold
1385 : * at least shared lock on all partitions of the table throughout the scan!
1386 : * We can cope with insertions or deletions by our own backend, but *not*
1387 : * with concurrent insertions or deletions by another.
1388 : */
1389 : void
1390 5001490 : hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
1391 : {
1392 5001490 : status->hashp = hashp;
1393 5001490 : status->curBucket = 0;
1394 5001490 : status->curEntry = NULL;
1395 5001490 : status->hasHashvalue = false;
1396 5001490 : if (!hashp->frozen)
1397 5001490 : register_seq_scan(hashp);
1398 5001490 : }
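/*
 * Editor's illustration (a hedged sketch; "MyEntry" and "process_entry" are
 * hypothetical): a complete scan, which needs no explicit hash_seq_term()
 * because hash_seq_search() returning NULL already did the cleanup:
 *
 *		HASH_SEQ_STATUS status;
 *		MyEntry    *entry;
 *
 *		hash_seq_init(&status, htab);
 *		while ((entry = (MyEntry *) hash_seq_search(&status)) != NULL)
 *			process_entry(entry);
 */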
1399 :
1400 : /*
1401 : * Same as above but scan by the given hash value.
1402 : * See also hash_seq_search().
1403 : *
1404 : * NOTE: the default hash function doesn't match the syscache hash
1405 : * function. Thus, if you're going to use this function in a syscache
1406 : * callback, make sure you're using a custom hash function; see
1407 : * relatt_cache_syshash() for an example.
1408 : */
1409 : void
1410 1583580 : hash_seq_init_with_hash_value(HASH_SEQ_STATUS *status, HTAB *hashp,
1411 : uint32 hashvalue)
1412 : {
1413 : HASHBUCKET *bucketPtr;
1414 :
1415 1583580 : hash_seq_init(status, hashp);
1416 :
1417 1583580 : status->hasHashvalue = true;
1418 1583580 : status->hashvalue = hashvalue;
1419 :
1420 1583580 : status->curBucket = hash_initial_lookup(hashp, hashvalue, &bucketPtr);
1421 1583580 : status->curEntry = *bucketPtr;
1422 1583580 : }
1423 :
1424 : void *
1425 56514826 : hash_seq_search(HASH_SEQ_STATUS *status)
1426 : {
1427 : HTAB *hashp;
1428 : HASHHDR *hctl;
1429 : uint32 max_bucket;
1430 : long ssize;
1431 : long segment_num;
1432 : long segment_ndx;
1433 : HASHSEGMENT segp;
1434 : uint32 curBucket;
1435 : HASHELEMENT *curElem;
1436 :
1437 56514826 : if (status->hasHashvalue)
1438 : {
1439 : /*
1440 : * Scan entries only in the current bucket because only this bucket
1441 : * can contain entries with the given hash value.
1442 : */
1443 1792304 : while ((curElem = status->curEntry) != NULL)
1444 : {
1445 208724 : status->curEntry = curElem->link;
1446 208724 : if (status->hashvalue != curElem->hashvalue)
1447 199690 : continue;
1448 9034 : return (void *) ELEMENTKEY(curElem);
1449 : }
1450 :
1451 1583580 : hash_seq_term(status);
1452 1583580 : return NULL;
1453 : }
1454 :
1455 54922212 : if ((curElem = status->curEntry) != NULL)
1456 : {
1457 : /* Continuing scan of curBucket... */
1458 16087632 : status->curEntry = curElem->link;
1459 16087632 : if (status->curEntry == NULL) /* end of this bucket */
1460 11223902 : ++status->curBucket;
1461 16087632 : return ELEMENTKEY(curElem);
1462 : }
1463 :
1464 : /*
1465 : * Search for next nonempty bucket starting at curBucket.
1466 : */
1467 38834580 : curBucket = status->curBucket;
1468 38834580 : hashp = status->hashp;
1469 38834580 : hctl = hashp->hctl;
1470 38834580 : ssize = hashp->ssize;
1471 38834580 : max_bucket = hctl->max_bucket;
1472 :
1473 38834580 : if (curBucket > max_bucket)
1474 : {
1475 110220 : hash_seq_term(status);
1476 110220 : return NULL; /* search is done */
1477 : }
1478 :
1479 : /*
1480 : * first find the right segment in the table directory.
1481 : */
1482 38724360 : segment_num = curBucket >> hashp->sshift;
1483 38724360 : segment_ndx = MOD(curBucket, ssize);
1484 :
1485 38724360 : segp = hashp->dir[segment_num];
1486 :
1487 : /*
1488 : * Pick up the first item in this bucket's chain. If chain is not empty
1489 : * we can begin searching it. Otherwise we have to advance to find the
1490 : * next nonempty bucket. We try to optimize that case since searching a
1491 : * near-empty hashtable has to iterate this loop a lot.
1492 : */
1493 197781186 : while ((curElem = segp[segment_ndx]) == NULL)
1494 : {
1495 : /* empty bucket, advance to next */
1496 162325300 : if (++curBucket > max_bucket)
1497 : {
1498 3268474 : status->curBucket = curBucket;
1499 3268474 : hash_seq_term(status);
1500 3268474 : return NULL; /* search is done */
1501 : }
1502 159056826 : if (++segment_ndx >= ssize)
1503 : {
1504 296070 : segment_num++;
1505 296070 : segment_ndx = 0;
1506 296070 : segp = hashp->dir[segment_num];
1507 : }
1508 : }
1509 :
1510 : /* Begin scan of curBucket... */
1511 35455886 : status->curEntry = curElem->link;
1512 35455886 : if (status->curEntry == NULL) /* end of this bucket */
1513 24231474 : ++curBucket;
1514 35455886 : status->curBucket = curBucket;
1515 35455886 : return ELEMENTKEY(curElem);
1516 : }
1517 :
1518 : void
1519 5001470 : hash_seq_term(HASH_SEQ_STATUS *status)
1520 : {
1521 5001470 : if (!status->hashp->frozen)
1522 5001470 : deregister_seq_scan(status->hashp);
1523 5001470 : }
1524 :
1525 : /*
1526 : * hash_freeze
1527 : * Freeze a hashtable against future insertions (deletions are
1528 : * still allowed)
1529 : *
1530 : * The reason for doing this is that by preventing any more bucket splits,
1531 : * we no longer need to worry about registering hash_seq_search scans,
1532 : * and thus caller need not be careful about ensuring hash_seq_term gets
1533 :  * and thus the caller need not be careful about ensuring hash_seq_term gets
1534 : *
1535 : * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
1536 : * with active scans (since hash_seq_term would then do the wrong thing).
1537 : */
1538 : void
1539 0 : hash_freeze(HTAB *hashp)
1540 : {
1541 0 : if (hashp->isshared)
1542 0 : elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
1543 0 : if (!hashp->frozen && has_seq_scans(hashp))
1544 0 : elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
1545 : hashp->tabname);
1546 0 : hashp->frozen = true;
1547 0 : }
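/*
 * Usage sketch (hypothetical caller): freezing suits a local table that is
 * fully built before being read; afterwards readers may abandon scans at
 * any point without calling hash_seq_term.
 *
 *		... populate my_htab with hash_search(..., HASH_ENTER, ...) ...
 *		hash_freeze(my_htab);
 *		... hash_seq_init/hash_seq_search loops may now be abandoned
 *		    mid-scan with no hash_seq_term call ...
 */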
1548 :
1549 :
1550 : /********************************* UTILITIES ************************/
1551 :
1552 : /*
1553 : * Expand the table by adding one more hash bucket.
1554 : */
1555 : static bool
1556 781220 : expand_table(HTAB *hashp)
1557 : {
1558 781220 : HASHHDR *hctl = hashp->hctl;
1559 : HASHSEGMENT old_seg,
1560 : new_seg;
1561 : long old_bucket,
1562 : new_bucket;
1563 : long new_segnum,
1564 : new_segndx;
1565 : long old_segnum,
1566 : old_segndx;
1567 : HASHBUCKET *oldlink,
1568 : *newlink;
1569 : HASHBUCKET currElement,
1570 : nextElement;
1571 :
1572 : Assert(!IS_PARTITIONED(hctl));
1573 :
1574 : #ifdef HASH_STATISTICS
1575 : hash_expansions++;
1576 : #endif
1577 :
1578 781220 : new_bucket = hctl->max_bucket + 1;
1579 781220 : new_segnum = new_bucket >> hashp->sshift;
1580 781220 : new_segndx = MOD(new_bucket, hashp->ssize);
1581 :
1582 781220 : if (new_segnum >= hctl->nsegs)
1583 : {
1584 : /* Allocate new segment if necessary -- could fail if dir full */
1585 2504 : if (new_segnum >= hctl->dsize)
1586 0 : if (!dir_realloc(hashp))
1587 0 : return false;
1588 2504 : if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
1589 0 : return false;
1590 2504 : hctl->nsegs++;
1591 : }
1592 :
1593 : /* OK, we created a new bucket */
1594 781220 : hctl->max_bucket++;
1595 :
1596 : /*
1597 : * *Before* changing masks, find old bucket corresponding to same hash
1598 : * values; values in that bucket may need to be relocated to new bucket.
1599 : * Note that new_bucket is certainly larger than low_mask at this point,
1600 : * so we can skip the first step of the regular hash mask calc.
1601 : */
1602 781220 : old_bucket = (new_bucket & hctl->low_mask);
1603 :
1604 : /*
1605 : * If we crossed a power of 2, readjust masks.
1606 : */
1607 781220 : if ((uint32) new_bucket > hctl->high_mask)
1608 : {
1609 4962 : hctl->low_mask = hctl->high_mask;
1610 4962 : hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
1611 : }
1612 :
1613 : /*
1614 : * Relocate records to the new bucket. NOTE: because of the way the hash
1615 : * masking is done in calc_bucket, only one old bucket can need to be
1616 : * split at this point. With a different way of reducing the hash value,
1617 : * that might not be true!
1618 : */
1619 781220 : old_segnum = old_bucket >> hashp->sshift;
1620 781220 : old_segndx = MOD(old_bucket, hashp->ssize);
1621 :
1622 781220 : old_seg = hashp->dir[old_segnum];
1623 781220 : new_seg = hashp->dir[new_segnum];
1624 :
1625 781220 : oldlink = &old_seg[old_segndx];
1626 781220 : newlink = &new_seg[new_segndx];
1627 :
1628 781220 : for (currElement = *oldlink;
1629 1875518 : currElement != NULL;
1630 1094298 : currElement = nextElement)
1631 : {
1632 1094298 : nextElement = currElement->link;
1633 1094298 : if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
1634 : {
1635 536760 : *oldlink = currElement;
1636 536760 : oldlink = &currElement->link;
1637 : }
1638 : else
1639 : {
1640 557538 : *newlink = currElement;
1641 557538 : newlink = &currElement->link;
1642 : }
1643 : }
1644 : /* don't forget to terminate the rebuilt hash chains... */
1645 781220 : *oldlink = NULL;
1646 781220 : *newlink = NULL;
1647 :
1648 781220 : return true;
1649 : }
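/*
 * Worked example of the split arithmetic (illustrative numbers): suppose
 * max_bucket = 11, low_mask = 7, high_mask = 15 on entry.  Then
 * new_bucket = 12 and old_bucket = 12 & 7 = 4.  Under calc_bucket's rule
 * (bucket = hash & high_mask, masked again with low_mask if the result
 * exceeds max_bucket), an entry whose hash h satisfies (h & 15) == 12
 * previously fell back to bucket 4 but now belongs in bucket 12, while an
 * entry with (h & 15) == 4 stays in bucket 4.  Hence only the single
 * bucket old_bucket can contain entries that need relocating.
 */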
1650 :
1651 :
1652 : static bool
1653 0 : dir_realloc(HTAB *hashp)
1654 : {
1655 : HASHSEGMENT *p;
1656 : HASHSEGMENT *old_p;
1657 : long new_dsize;
1658 : long old_dirsize;
1659 : long new_dirsize;
1660 :
1661 0 : if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
1662 0 : return false;
1663 :
1664 : /* Reallocate directory */
1665 0 : new_dsize = hashp->hctl->dsize << 1;
1666 0 : old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
1667 0 : new_dirsize = new_dsize * sizeof(HASHSEGMENT);
1668 :
1669 0 : old_p = hashp->dir;
1670 0 : CurrentDynaHashCxt = hashp->hcxt;
1671 0 : p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);
1672 :
1673 0 : if (p != NULL)
1674 : {
1675 0 : memcpy(p, old_p, old_dirsize);
1676 0 : MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
1677 0 : hashp->dir = p;
1678 0 : hashp->hctl->dsize = new_dsize;
1679 :
1680 : /* XXX assume the allocator is palloc, so we know how to free */
1681 : Assert(hashp->alloc == DynaHashAlloc);
1682 0 : pfree(old_p);
1683 :
1684 0 : return true;
1685 : }
1686 :
1687 0 : return false;
1688 : }
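/*
 * Worked example (illustrative): on a 64-bit build, doubling a directory
 * of dsize = 256 segment pointers copies 256 * 8 = 2048 bytes into a new
 * 4096-byte allocation and zeroes the upper half, so the added slots read
 * as empty until seg_alloc populates them.
 */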
1689 :
1690 :
1691 : static HASHSEGMENT
1692 1336424 : seg_alloc(HTAB *hashp)
1693 : {
1694 : HASHSEGMENT segp;
1695 :
1696 1336424 : CurrentDynaHashCxt = hashp->hcxt;
1697 1336424 : segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
1698 :
1699 1336424 : if (!segp)
1700 0 : return NULL;
1701 :
1702 1336424 : MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
1703 :
1704 1336424 : return segp;
1705 : }
1706 :
1707 : /*
1708 : * allocate some new elements and link them into the indicated free list
1709 : */
1710 : static bool
1711 1253746 : element_alloc(HTAB *hashp, int nelem, int freelist_idx)
1712 : {
1713 1253746 : HASHHDR *hctl = hashp->hctl;
1714 : Size elementSize;
1715 : HASHELEMENT *firstElement;
1716 : HASHELEMENT *tmpElement;
1717 : HASHELEMENT *prevElement;
1718 : int i;
1719 :
1720 1253746 : if (hctl->isfixed)
1721 0 : return false;
1722 :
1723 : /* Each element has a HASHELEMENT header plus user data. */
1724 1253746 : elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
1725 :
1726 1253746 : CurrentDynaHashCxt = hashp->hcxt;
1727 1253746 : firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
1728 :
1729 1253746 : if (!firstElement)
1730 0 : return false;
1731 :
1732 : /* prepare to link all the new entries into the freelist */
1733 1253746 : prevElement = NULL;
1734 1253746 : tmpElement = firstElement;
1735 109553564 : for (i = 0; i < nelem; i++)
1736 : {
1737 108299818 : tmpElement->link = prevElement;
1738 108299818 : prevElement = tmpElement;
1739 108299818 : tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
1740 : }
1741 :
1742 : /* if partitioned, must lock to touch freeList */
1743 1253746 : if (IS_PARTITIONED(hctl))
1744 347460 : SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1745 :
1746 : /* freelist could be nonempty if two backends did this concurrently */
1747 1253746 : firstElement->link = hctl->freeList[freelist_idx].freeList;
1748 1253746 : hctl->freeList[freelist_idx].freeList = prevElement;
1749 :
1750 1253746 : if (IS_PARTITIONED(hctl))
1751 347460 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1752 :
1753 1253746 : return true;
1754 : }
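/*
 * Worked example of the size arithmetic (illustrative figures): on a
 * typical 64-bit build MAXALIGN rounds up to a multiple of 8 and
 * MAXALIGN(sizeof(HASHELEMENT)) is 16, so entrysize = 20 gives
 * elementSize = 16 + MAXALIGN(20) = 16 + 24 = 40 bytes.  A call with
 * nelem = 32 then carves one 1280-byte chunk into 32 elements at 40-byte
 * strides, pushing each onto the freelist; ELEMENTKEY(elt), the pointer
 * callers see, is the byte just past the MAXALIGN'd header.
 */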
1755 :
1756 : /*
1757 :  * Do the initial lookup of the bucket for a given hash value, returning
1758 :  * the bucket number and a pointer to the bucket's chain via *bucketptr.
1759 : */
1760 : static inline uint32
1761 378221606 : hash_initial_lookup(HTAB *hashp, uint32 hashvalue, HASHBUCKET **bucketptr)
1762 : {
1763 378221606 : HASHHDR *hctl = hashp->hctl;
1764 : HASHSEGMENT segp;
1765 : long segment_num;
1766 : long segment_ndx;
1767 : uint32 bucket;
1768 :
1769 378221606 : bucket = calc_bucket(hctl, hashvalue);
1770 :
1771 378221606 : segment_num = bucket >> hashp->sshift;
1772 378221606 : segment_ndx = MOD(bucket, hashp->ssize);
1773 :
1774 378221606 : segp = hashp->dir[segment_num];
1775 :
1776 378221606 : if (segp == NULL)
1777 0 : hash_corrupted(hashp);
1778 :
1779 378221606 : *bucketptr = &segp[segment_ndx];
1780 378221606 : return bucket;
1781 : }
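/*
 * Worked example of the directory math (illustrative values): with the
 * default ssize = 256 (so sshift = 8), bucket 1234 is found in directory
 * slot 1234 >> 8 = 4, at index MOD(1234, 256) = 210 within that segment.
 * Since ssize is always a power of 2, MOD reduces to a bitmask:
 * 1234 & 255 = 210.
 */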
1782 :
1783 : /* complain when we have detected a corrupted hashtable */
1784 : static void
1785 0 : hash_corrupted(HTAB *hashp)
1786 : {
1787 : /*
1788 : * If the corruption is in a shared hashtable, we'd better force a
1789 : * systemwide restart. Otherwise, just shut down this one backend.
1790 : */
1791 0 : if (hashp->isshared)
1792 0 : elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
1793 : else
1794 0 : elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
1795 : }
1796 :
1797 : /* calculate ceil(log base 2) of num */
1798 : int
1799 1620648 : my_log2(long num)
1800 : {
1801 : /*
1802 : * guard against too-large input, which would be invalid for
1803 : * pg_ceil_log2_*()
1804 : */
1805 1620648 : if (num > LONG_MAX / 2)
1806 0 : num = LONG_MAX / 2;
1807 :
1808 : #if SIZEOF_LONG < 8
1809 : return pg_ceil_log2_32(num);
1810 : #else
1811 1620648 : return pg_ceil_log2_64(num);
1812 : #endif
1813 : }
1814 :
1815 : /* calculate first power of 2 >= num, bounded to what will fit in a long */
1816 : static long
1817 110756 : next_pow2_long(long num)
1818 : {
1819 : /* my_log2's internal range check is sufficient */
1820 110756 : return 1L << my_log2(num);
1821 : }
1822 :
1823 : /* calculate first power of 2 >= num, bounded to what will fit in an int */
1824 : static int
1825 1468988 : next_pow2_int(long num)
1826 : {
1827 1468988 : if (num > INT_MAX / 2)
1828 0 : num = INT_MAX / 2;
1829 1468988 : return 1 << my_log2(num);
1830 : }
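/*
 * Worked examples (assuming 64-bit long): my_log2(1000) = 10, since
 * 2^10 = 1024 is the smallest power of two >= 1000; hence
 * next_pow2_long(1000) = 1024 and next_pow2_int(1000) = 1024.  Exact
 * powers map to themselves: my_log2(1024) = 10, next_pow2_int(1024) = 1024.
 */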
1831 :
1832 :
1833 : /************************* SEQ SCAN TRACKING ************************/
1834 :
1835 : /*
1836 : * We track active hash_seq_search scans here. The need for this mechanism
1837 : * comes from the fact that a scan will get confused if a bucket split occurs
1838 : * while it's in progress: it might visit entries twice, or even miss some
1839 : * entirely (if it's partway through the same bucket that splits). Hence
1840 : * we want to inhibit bucket splits if there are any active scans on the
1841 : * table being inserted into. This is a fairly rare case in current usage,
1842 : * so just postponing the split until the next insertion seems sufficient.
1843 : *
1844 :  * Given present usage, only a few scans are likely to be open
1845 :  * concurrently, so a finite-size stack of open scans seems sufficient,
1846 : * and we don't worry that linear search is too slow. Note that we do
1847 : * allow multiple scans of the same hashtable to be open concurrently.
1848 : *
1849 : * This mechanism can support concurrent scan and insertion in a shared
1850 : * hashtable if it's the same backend doing both. It would fail otherwise,
1851 : * but locking reasons seem to preclude any such scenario anyway, so we don't
1852 : * worry.
1853 : *
1854 : * This arrangement is reasonably robust if a transient hashtable is deleted
1855 : * without notifying us. The absolute worst case is we might inhibit splits
1856 : * in another table created later at exactly the same address. We will give
1857 : * a warning at transaction end for reference leaks, so any bugs leading to
1858 : * lack of notification should be easy to catch.
1859 : */
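/*
 * Illustrative scenario (hypothetical caller) of what this machinery
 * protects:
 *
 *		hash_seq_init(&status, htab);		-- registers the scan
 *		while ((e = hash_seq_search(&status)) != NULL)
 *			hash_search(htab, &newkey, HASH_ENTER, &found);
 *
 * The insertions may push the entry count past the split threshold, but
 * because the scan is registered, hash_search skips expand_table until
 * some later insertion happens with no scan active; thus the scan visits
 * each pre-existing entry exactly once.
 */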
1860 :
1861 : #define MAX_SEQ_SCANS 100
1862 :
1863 : static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */
1864 : static int seq_scan_level[MAX_SEQ_SCANS]; /* subtransaction nest level */
1865 : static int num_seq_scans = 0;
1866 :
1867 :
1868 : /* Register a table as having an active hash_seq_search scan */
1869 : static void
1870 5001490 : register_seq_scan(HTAB *hashp)
1871 : {
1872 5001490 : if (num_seq_scans >= MAX_SEQ_SCANS)
1873 0 : elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
1874 : hashp->tabname);
1875 5001490 : seq_scan_tables[num_seq_scans] = hashp;
1876 5001490 : seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
1877 5001490 : num_seq_scans++;
1878 5001490 : }
1879 :
1880 : /* Deregister an active scan */
1881 : static void
1882 5001470 : deregister_seq_scan(HTAB *hashp)
1883 : {
1884 : int i;
1885 :
1886 : /* Search backward since it's most likely at the stack top */
1887 5001470 : for (i = num_seq_scans - 1; i >= 0; i--)
1888 : {
1889 5001470 : if (seq_scan_tables[i] == hashp)
1890 : {
1891 5001470 : seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1892 5001470 : seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1893 5001470 : num_seq_scans--;
1894 5001470 : return;
1895 : }
1896 : }
1897 0 : elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
1898 : hashp->tabname);
1899 : }
1900 :
1901 : /* Check if a table has any active scan */
1902 : static bool
1903 781220 : has_seq_scans(HTAB *hashp)
1904 : {
1905 : int i;
1906 :
1907 781220 : for (i = 0; i < num_seq_scans; i++)
1908 : {
1909 0 : if (seq_scan_tables[i] == hashp)
1910 0 : return true;
1911 : }
1912 781220 : return false;
1913 : }
1914 :
1915 : /* Clean up any open scans at end of transaction */
1916 : void
1917 995822 : AtEOXact_HashTables(bool isCommit)
1918 : {
1919 : /*
1920 : * During abort cleanup, open scans are expected; just silently clean 'em
1921 : * out. An open scan at commit means someone forgot a hash_seq_term()
1922 : * call, so complain.
1923 : *
1924 : * Note: it's tempting to try to print the tabname here, but refrain for
1925 : * fear of touching deallocated memory. This isn't a user-facing message
1926 : * anyway, so it needn't be pretty.
1927 : */
1928 995822 : if (isCommit)
1929 : {
1930 : int i;
1931 :
1932 946420 : for (i = 0; i < num_seq_scans; i++)
1933 : {
1934 0 : elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1935 : seq_scan_tables[i]);
1936 : }
1937 : }
1938 995822 : num_seq_scans = 0;
1939 995822 : }
1940 :
1941 : /* Clean up any open scans at end of subtransaction */
1942 : void
1943 20064 : AtEOSubXact_HashTables(bool isCommit, int nestDepth)
1944 : {
1945 : int i;
1946 :
1947 : /*
1948 : * Search backward to make cleanup easy. Note we must check all entries,
1949 :      * not only those at the end of the array, because the deletion technique
1950 : * doesn't keep them in order.
1951 : */
1952 20064 : for (i = num_seq_scans - 1; i >= 0; i--)
1953 : {
1954 0 : if (seq_scan_level[i] >= nestDepth)
1955 : {
1956 0 : if (isCommit)
1957 0 : elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1958 : seq_scan_tables[i]);
1959 0 : seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1960 0 : seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1961 0 : num_seq_scans--;
1962 : }
1963 : }
1964 20064 : }
|