Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * dynahash.c
4 : * dynamic chained hash tables
5 : *
6 : * dynahash.c supports both local-to-a-backend hash tables and hash tables in
7 : * shared memory. For shared hash tables, it is the caller's responsibility
8 : * to provide appropriate access interlocking. The simplest convention is
9 : * that a single LWLock protects the whole hash table. Searches (HASH_FIND or
10 : * hash_seq_search) need only shared lock, but any update requires exclusive
11 : * lock. For heavily-used shared tables, the single-lock approach creates a
12 : * concurrency bottleneck, so we also support "partitioned" locking wherein
13 : * there are multiple LWLocks guarding distinct subsets of the table. To use
14 : * a hash table in partitioned mode, the HASH_PARTITION flag must be given
15 : * to hash_create. This prevents any attempt to split buckets on-the-fly.
16 : * Therefore, each hash bucket chain operates independently, and no fields
17 : * of the hash header change after init except nentries and freeList.
18 : * (A partitioned table uses multiple copies of those fields, guarded by
19 : * spinlocks, for additional concurrency.)
20 : * This lets any subset of the hash buckets be treated as a separately
21 : * lockable partition. We expect callers to use the low-order bits of a
22 : * lookup key's hash value as a partition number --- this will work because
23 : * of the way calc_bucket() maps hash values to bucket numbers.
24 : *
25 : * The memory allocator function should match malloc's semantics of returning
26 : * NULL on failure. (This is essential for hash tables in shared memory.
27 : * For hash tables in local memory, we used to use palloc() which will throw
28 : * error on failure; but we no longer do, so it's untested whether this
29 : * module will still cope with that behavior.)
30 : *
31 : * dynahash.c provides support for these types of lookup keys:
32 : *
33 : * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
34 : * compared as though by strcmp(). This is selected by specifying the
35 : * HASH_STRINGS flag to hash_create.
36 : *
37 : * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
38 : * (Caller must ensure there are no undefined padding bits in the keys!)
39 : * This is selected by specifying the HASH_BLOBS flag to hash_create.
40 : *
41 : * 3. More complex key behavior can be selected by specifying user-supplied
42 : * hashing, comparison, and/or key-copying functions. At least a hashing
43 : * function must be supplied; comparison defaults to memcmp() and key copying
44 : * to memcpy() when a user-defined hashing function is selected.
45 : *
46 : * Compared to simplehash, dynahash has the following benefits:
47 : *
48 : * - It supports partitioning, which is useful for shared memory access using
49 : * locks.
50 : * - Shared memory hashes are allocated in a fixed size area at startup and
51 : * are discoverable by name from other processes.
52 : * - Because entries don't need to be moved in the case of hash conflicts,
53 : * dynahash has better performance for large entries.
54 : * - Guarantees stable pointers to entries.
55 : *
56 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
57 : * Portions Copyright (c) 1994, Regents of the University of California
58 : *
59 : *
60 : * IDENTIFICATION
61 : * src/backend/utils/hash/dynahash.c
62 : *
63 : *-------------------------------------------------------------------------
64 : */
65 :
66 : /*
67 : * Original comments:
68 : *
69 : * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
70 : * Coded into C, with minor code improvements, and with hsearch(3) interface,
71 : * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
72 : * also, hcreate/hdestroy routines added to simulate hsearch(3).
73 : *
74 : * These routines simulate hsearch(3) and family, with the important
75 : * difference that the hash table is dynamic - can grow indefinitely
76 : * beyond its original size (as supplied to hcreate()).
77 : *
78 : * Performance appears to be comparable to that of hsearch(3).
79 : * The 'source-code' options referred to in hsearch(3)'s 'man' page
80 : * are not implemented; otherwise functionality is identical.
81 : *
82 : * Compilation controls:
83 : * HASH_STATISTICS causes some usage statistics to be maintained, which can be
84 : * logged by calling hash_stats().
85 : *
86 : * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
87 : * concatenation property, in probably unnecessary code 'optimization'.
88 : *
89 : * Modified margo@postgres.berkeley.edu February 1990
90 : * added multiple table interface
91 : * Modified by sullivan@postgres.berkeley.edu April 1990
92 : * changed ctl structure for shared memory
93 : */
94 :
95 : #include "postgres.h"
96 :
97 : #include <limits.h>
98 :
99 : #include "access/xact.h"
100 : #include "common/hashfn.h"
101 : #include "lib/ilist.h"
102 : #include "port/pg_bitutils.h"
103 : #include "storage/shmem.h"
104 : #include "storage/spin.h"
105 : #include "utils/memutils.h"
106 :
107 :
108 : /*
109 : * Constants
110 : *
111 : * A hash table has a top-level "directory", each of whose entries points to a
112 : * "segment" of HASH_SEGSIZE bucket headers. The maximum number of hash
113 : * buckets is thus dsize * HASH_SEGSIZE (but dsize may be expansible). Of
114 : * course, the number of records in the table can be larger, but we don't want
115 : * a whole lot of records per bucket or performance goes down.
116 : *
117 : * In a hash table allocated in shared memory, the directory cannot be
118 : * expanded because it must stay at a fixed address. The directory size is
119 : * chosen at creation based on the initial number of elements, so even though
120 : * we support allocating more elements later, performance will suffer if the
121 : * table grows much beyond the initial size. (Currently, shared memory hash
122 : * tables are only created by ShmemRequestHash()/ShmemInitHash() though, which
123 : * doesn't support growing at all.)
124 : */
125 : #define HASH_SEGSIZE 256
126 : #define HASH_SEGSIZE_SHIFT 8 /* must be log2(HASH_SEGSIZE) */
127 : #define DEF_DIRSIZE 256
128 :
129 : /* Number of freelists to be used for a partitioned hash table. */
130 : #define NUM_FREELISTS 32
131 :
132 : /* A hash bucket is a linked list of HASHELEMENTs */
133 : typedef HASHELEMENT *HASHBUCKET;
134 :
135 : /* A hash segment is an array of bucket headers */
136 : typedef HASHBUCKET *HASHSEGMENT;
137 :
138 : /*
139 : * Per-freelist data.
140 : *
141 : * In a partitioned hash table, each freelist is associated with a specific
142 : * set of hashcodes, as determined by the FREELIST_IDX() macro below.
143 : * nentries tracks the number of live hashtable entries having those hashcodes
144 : * (NOT the number of entries in the freelist, as you might expect).
145 : *
146 : * The coverage of a freelist might be more or less than one partition, so it
147 : * needs its own lock rather than relying on caller locking. Relying on that
148 : * wouldn't work even if the coverage was the same, because of the occasional
149 : * need to "borrow" entries from another freelist; see get_hash_entry().
150 : *
151 : * Using an array of FreeListData instead of separate arrays of mutexes,
152 : * nentries and freeLists helps to reduce sharing of cache lines between
153 : * different mutexes.
154 : */
155 : typedef struct
156 : {
157 : slock_t mutex; /* spinlock for this freelist */
158 : int64 nentries; /* number of entries in associated buckets */
159 : HASHELEMENT *freeList; /* chain of free elements */
160 : } FreeListData;
161 :
162 : /*
163 : * Header structure for a hash table --- contains all changeable info
164 : *
165 : * In a shared-memory hash table, the HASHHDR is in shared memory, while
166 : * each backend has a local HTAB struct. For a non-shared table, there isn't
167 : * any functional difference between HASHHDR and HTAB, but we separate them
168 : * anyway to share code between shared and non-shared tables.
169 : */
170 : struct HASHHDR
171 : {
172 : /*
173 : * The freelist can become a point of contention in high-concurrency hash
174 : * tables, so we use an array of freelists, each with its own mutex and
175 : * nentries count, instead of just a single one. Although the freelists
176 : * normally operate independently, we will scavenge entries from freelists
177 : * other than a hashcode's default freelist when necessary.
178 : *
179 : * If the hash table is not partitioned, only freeList[0] is used and its
180 : * spinlock is not used at all; callers' locking is assumed sufficient.
181 : */
182 : FreeListData freeList[NUM_FREELISTS];
183 :
184 : /* These fields can change, but not in a partitioned table */
185 : /* Also, dsize can't change in a shared table, even if unpartitioned */
186 : int64 dsize; /* directory size */
187 : int64 nsegs; /* number of allocated segments (<= dsize) */
188 : uint32 max_bucket; /* ID of maximum bucket in use */
189 : uint32 high_mask; /* mask to modulo into entire table */
190 : uint32 low_mask; /* mask to modulo into lower half of table */
191 :
192 : /* These fields are fixed at hashtable creation */
193 : Size keysize; /* hash key length in bytes */
194 : Size entrysize; /* total user element size in bytes */
195 : int64 num_partitions; /* # partitions (must be power of 2), or 0 */
196 : int64 max_dsize; /* 'dsize' limit if directory is fixed size */
197 : int nelem_alloc; /* number of entries to allocate at once */
198 : bool isfixed; /* if true, don't enlarge */
199 :
200 : /* Current directory. In shared tables, this doesn't change */
201 : HASHSEGMENT *dir;
202 :
203 : #ifdef HASH_STATISTICS
204 :
205 : /*
206 : * Count statistics here. NB: stats code doesn't bother with mutex, so
207 : * counts could be corrupted a bit in a partitioned table.
208 : */
209 : uint64 accesses;
210 : uint64 collisions;
211 : uint64 expansions;
212 : #endif
213 : };
214 :
215 : #define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
216 :
217 : #define FREELIST_IDX(hctl, hashcode) \
218 : (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
219 :
220 : /*
221 : * Top control structure for a hashtable --- in a shared table, each backend
222 : * has its own copy (OK since no fields change at runtime)
223 : */
224 : struct HTAB
225 : {
226 : HASHHDR *hctl; /* => shared control information */
227 : HASHSEGMENT *dir; /* directory of segment starts */
228 : HashValueFunc hash; /* hash function */
229 : HashCompareFunc match; /* key comparison function */
230 : HashCopyFunc keycopy; /* key copying function */
231 : HashAllocFunc alloc; /* memory allocator */
232 : void *alloc_arg; /* opaque argument passed to allocator */
233 : MemoryContext hcxt; /* memory context if default allocator used */
234 : char *tabname; /* table name (for error messages) */
235 : bool isshared; /* true if table is in shared memory */
236 :
237 : /* freezing a shared table isn't allowed, so we can keep state here */
238 : bool frozen; /* true = no more inserts allowed */
239 :
240 : /* We keep local copies of these fixed values to reduce contention */
241 : Size keysize; /* hash key length in bytes */
242 :
243 : /*
244 : * In a USE_VALGRIND build, non-shared hashtables keep an slist chain of
245 : * all the element blocks they have allocated. This pacifies Valgrind,
246 : * which would otherwise often claim that the element blocks are "possibly
247 : * lost" for lack of any non-interior pointers to their starts.
248 : */
249 : #ifdef USE_VALGRIND
250 : slist_head element_blocks;
251 : #endif
252 : };
253 :
254 : /*
255 : * Key (also entry) part of a HASHELEMENT
256 : */
257 : #define ELEMENTKEY(helem) (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
258 :
259 : /*
260 : * Obtain element pointer given pointer to key
261 : */
262 : #define ELEMENT_FROM_KEY(key) \
263 : ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))
264 :
265 : /*
266 : * Fast MOD arithmetic, assuming that y is a power of 2 !
267 : */
268 : #define MOD(x,y) ((x) & ((y)-1))
269 :
270 : /*
271 : * Private function prototypes
272 : */
273 : static void *DynaHashAlloc(Size size, void *alloc_arg);
274 : static HASHSEGMENT seg_alloc(HTAB *hashp);
275 : static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
276 : static bool dir_realloc(HTAB *hashp);
277 : static bool expand_table(HTAB *hashp);
278 : static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
279 : static void hdefault(HTAB *hashp);
280 : static int choose_nelem_alloc(Size entrysize);
281 : static bool init_htab(HTAB *hashp, int64 nelem);
282 : pg_noreturn static void hash_corrupted(HTAB *hashp);
283 : static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue,
284 : HASHBUCKET **bucketptr);
285 : static int my_log2(int64 num);
286 : static int64 next_pow2_int64(int64 num);
287 : static int next_pow2_int(int64 num);
288 : static void register_seq_scan(HTAB *hashp);
289 : static void deregister_seq_scan(HTAB *hashp);
290 : static bool has_seq_scans(HTAB *hashp);
291 :
292 :
293 : /*
294 : * memory allocation support
295 : */
296 : static void *
297 1865966 : DynaHashAlloc(Size size, void *alloc_arg)
298 : {
299 1865966 : MemoryContext hcxt = (MemoryContext) alloc_arg;
300 :
301 : Assert(MemoryContextIsValid(hcxt));
302 1865966 : return MemoryContextAllocExtended(hcxt, size, MCXT_ALLOC_NO_OOM);
303 : }
304 :
305 :
306 : /*
307 : * HashCompareFunc for string keys
308 : *
309 : * Because we copy keys with strlcpy(), they will be truncated at keysize-1
310 : * bytes, so we can only compare that many ... hence strncmp is almost but
311 : * not quite the right thing.
312 : */
313 : static int
314 611013 : string_compare(const char *key1, const char *key2, Size keysize)
315 : {
316 611013 : return strncmp(key1, key2, keysize - 1);
317 : }
318 :
319 :
320 : /************************** CREATE ROUTINES **********************/
321 :
322 : /*
323 : * hash_create -- create a new dynamic hash table
324 : *
325 : * tabname: a name for the table (for debugging purposes)
326 : * nelem: maximum number of elements expected
327 : * *info: additional table parameters, as indicated by flags
328 : * flags: bitmask indicating which parameters to take from *info
329 : *
330 : * The flags value *must* include HASH_ELEM. (Formerly, this was nominally
331 : * optional, but the default keysize and entrysize values were useless.)
332 : * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS,
333 : * or HASH_FUNCTION, to define the key hashing semantics (C strings,
334 : * binary blobs, or custom, respectively). Callers specifying a custom
335 : * hash function will likely also want to use HASH_COMPARE, and perhaps
336 : * also HASH_KEYCOPY, to control key comparison and copying.
337 : * Another often-used flag is HASH_CONTEXT, to allocate the hash table
338 : * under info->hcxt rather than under TopMemoryContext; the default
339 : * behavior is only suitable for session-lifespan hash tables.
340 : * Other flags bits are special-purpose and seldom used, except for those
341 : * associated with shared-memory hash tables, for which see
342 : * ShmemRequestHash().
343 : *
344 : * Fields in *info are read only when the associated flags bit is set.
345 : * It is not necessary to initialize other fields of *info.
346 : * Neither tabname nor *info need persist after the hash_create() call.
347 : *
348 : * Note: It is deprecated for callers of hash_create() to explicitly specify
349 : * string_hash, tag_hash, uint32_hash, or oid_hash. Just set HASH_STRINGS or
350 : * HASH_BLOBS. Use HASH_FUNCTION only when you want something other than
351 : * one of these.
352 : *
353 : * Note: for a shared-memory hashtable, nelem needs to be a pretty good
354 : * estimate, since we can't expand the table on the fly. But an unshared
355 : * hashtable can be expanded on-the-fly, so it's better for nelem to be
356 : * on the small side and let the table grow if it's exceeded. An overly
357 : * large nelem will penalize hash_seq_search speed without buying much.
358 : */
HTAB *
hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;
	MemoryContext hcxt;

	/*
	 * Hash tables now allocate space for key and data, but you have to say
	 * how much space to allocate.
	 */
	Assert(flags & HASH_ELEM);
	Assert(info->keysize > 0);
	Assert(info->entrysize >= info->keysize);

	/*
	 * For shared hash tables, we have a local hash header (HTAB struct) that
	 * we allocate in TopMemoryContext; all else is in shared memory.
	 *
	 * For non-shared hash tables, everything including the hash header is in
	 * a memory context created specially for the hash table --- this makes
	 * hash_destroy very simple.  The memory context is made a child of either
	 * a context specified by the caller, or TopMemoryContext if nothing is
	 * specified.
	 *
	 * Note that HASH_ALLOC had better be set as well.
	 */
	if (flags & HASH_SHARED_MEM)
	{
		/* Set up to allocate the hash header */
		hcxt = TopMemoryContext;
	}
	else
	{
		/* Create the hash table's private memory context */
		if (flags & HASH_CONTEXT)
			hcxt = info->hcxt;
		else
			hcxt = TopMemoryContext;
		hcxt = AllocSetContextCreate(hcxt, "dynahash",
									 ALLOCSET_DEFAULT_SIZES);
	}

	/*
	 * Initialize the hash header, plus a copy of the table name.  The name
	 * is stored in the same allocation, right after the HTAB struct.
	 */
	hashp = (HTAB *) MemoryContextAlloc(hcxt,
										sizeof(HTAB) + strlen(tabname) + 1);
	/* zero all fields, including function pointers and flags */
	MemSet(hashp, 0, sizeof(HTAB));

	hashp->tabname = (char *) (hashp + 1);
	strcpy(hashp->tabname, tabname);

	/* If we have a private context, label it with hashtable's name */
	if (!(flags & HASH_SHARED_MEM))
		MemoryContextSetIdentifier(hcxt, hashp->tabname);

	/*
	 * Select the appropriate hash function (see comments at head of file).
	 */
	if (flags & HASH_FUNCTION)
	{
		Assert(!(flags & (HASH_BLOBS | HASH_STRINGS)));
		hashp->hash = info->hash;
	}
	else if (flags & HASH_BLOBS)
	{
		Assert(!(flags & HASH_STRINGS));
		/* We can optimize hashing for common key sizes */
		if (info->keysize == sizeof(uint32))
			hashp->hash = uint32_hash;
		else
			hashp->hash = tag_hash;
	}
	else
	{
		/*
		 * string_hash used to be considered the default hash method, and in a
		 * non-assert build it effectively still is.  But we now consider it
		 * an assertion error to not say HASH_STRINGS explicitly.  To help
		 * catch mistaken usage of HASH_STRINGS, we also insist on a
		 * reasonably long string length: if the keysize is only 4 or 8 bytes,
		 * it's almost certainly an integer or pointer not a string.
		 */
		Assert(flags & HASH_STRINGS);
		Assert(info->keysize > 8);

		hashp->hash = string_hash;
	}

	/*
	 * If you don't specify a match function, it defaults to string_compare if
	 * you used string_hash, and to memcmp otherwise.
	 *
	 * Note: explicitly specifying string_hash is deprecated, because this
	 * might not work for callers in loadable modules on some platforms due to
	 * referencing a trampoline instead of the string_hash function proper.
	 * Specify HASH_STRINGS instead.
	 */
	if (flags & HASH_COMPARE)
		hashp->match = info->match;
	else if (hashp->hash == string_hash)
		hashp->match = (HashCompareFunc) string_compare;
	else
		hashp->match = memcmp;

	/*
	 * Similarly, the key-copying function defaults to strlcpy or memcpy.
	 */
	if (flags & HASH_KEYCOPY)
		hashp->keycopy = info->keycopy;
	else if (hashp->hash == string_hash)
	{
		/*
		 * The signature of keycopy is meant for memcpy(), which returns
		 * void*, but strlcpy() returns size_t.  Since we never use the return
		 * value of keycopy, and size_t is pretty much always the same size as
		 * void *, this should be safe.  The extra cast in the middle is to
		 * avoid warnings from -Wcast-function-type.
		 */
		hashp->keycopy = (HashCopyFunc) (pg_funcptr_t) strlcpy;
	}
	else
		hashp->keycopy = memcpy;

	/* And select the entry allocation function, too. */
	if (flags & HASH_ALLOC)
	{
		hashp->alloc = info->alloc;
		hashp->alloc_arg = info->alloc_arg;
	}
	else
	{
		hashp->alloc = DynaHashAlloc;
		hashp->alloc_arg = hcxt;
	}

	if (flags & HASH_SHARED_MEM)
	{
		hashp->hcxt = NULL;
		hashp->isshared = true;

		/* hash table already exists, we're just attaching to it */
		if (flags & HASH_ATTACH)
		{
			/* Caller must pass the pointer to the shared header */
			Assert(info->hctl);
			hashp->hctl = info->hctl;

			/* make local copies of some heavily-used values */
			hashp->dir = info->hctl->dir;
			hashp->keysize = info->hctl->keysize;

			/* attach case: no directory/segment/element setup needed */
			return hashp;
		}
	}
	else
	{
		/* setup hash table defaults */
		hashp->hctl = NULL;
		hashp->dir = NULL;
		hashp->hcxt = hcxt;
		hashp->isshared = false;
	}

	/*
	 * Allocate the header structure.
	 *
	 * XXX: In case of a shared memory hash table, other processes need the
	 * pointer to the header to re-find the hash table.  There is currently no
	 * explicit way to pass it back from here, the caller relies on the fact
	 * that this is the first allocation made with the alloc function.  That's
	 * a little ugly, but works for now.
	 */
	hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR), hashp->alloc_arg);
	if (!hashp->hctl)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));

	/* new tables start unfrozen; freezing only applies to non-shared tables */
	hashp->frozen = false;

	hdefault(hashp);

	hctl = hashp->hctl;

	if (flags & HASH_PARTITION)
	{
		/* Doesn't make sense to partition a local hash table */
		Assert(flags & HASH_SHARED_MEM);

		/*
		 * The number of partitions had better be a power of 2.  Also, it must
		 * be less than INT_MAX (see init_htab()), so call the int version of
		 * next_pow2.
		 */
		Assert(info->num_partitions == next_pow2_int(info->num_partitions));

		hctl->num_partitions = info->num_partitions;
	}

	/* remember the entry sizes, too */
	hctl->keysize = info->keysize;
	hctl->entrysize = info->entrysize;

	/* make local copies of heavily-used constant fields */
	hashp->keysize = hctl->keysize;

	/* Build the hash directory structure */
	if (!init_htab(hashp, nelem))
		elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);

	/*
	 * For a shared hash table, preallocate the requested number of elements.
	 * This reduces problems with run-time out-of-shared-memory conditions.
	 *
	 * For a non-shared hash table, preallocate the requested number of
	 * elements if it's less than our chosen nelem_alloc.  This avoids wasting
	 * space if the caller correctly estimates a small table size.
	 */
	if ((flags & HASH_SHARED_MEM) ||
		nelem < hctl->nelem_alloc)
	{
		int			i,
					freelist_partitions,
					nelem_alloc,
					nelem_alloc_first;

		/*
		 * If hash table is partitioned, give each freelist an equal share of
		 * the initial allocation.  Otherwise only freeList[0] is used.
		 */
		if (IS_PARTITIONED(hashp->hctl))
			freelist_partitions = NUM_FREELISTS;
		else
			freelist_partitions = 1;

		nelem_alloc = nelem / freelist_partitions;
		if (nelem_alloc <= 0)
			nelem_alloc = 1;	/* could happen if nelem < NUM_FREELISTS */

		/*
		 * Make sure we'll allocate all the requested elements; freeList[0]
		 * gets the excess if the request isn't divisible by NUM_FREELISTS.
		 */
		if (nelem_alloc * freelist_partitions < nelem)
			nelem_alloc_first =
				nelem - nelem_alloc * (freelist_partitions - 1);
		else
			nelem_alloc_first = nelem_alloc;

		for (i = 0; i < freelist_partitions; i++)
		{
			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;

			if (!element_alloc(hashp, temp, i))
				ereport(ERROR,
						(errcode(ERRCODE_OUT_OF_MEMORY),
						 errmsg("out of memory")));
		}
	}

	/* Set isfixed if requested, but not till after we build initial entries */
	if (flags & HASH_FIXED_SIZE)
		hctl->isfixed = true;

	return hashp;
}
625 :
626 : /*
627 : * Set default HASHHDR parameters.
628 : */
629 : static void
630 429960 : hdefault(HTAB *hashp)
631 : {
632 429960 : HASHHDR *hctl = hashp->hctl;
633 :
634 46005720 : MemSet(hctl, 0, sizeof(HASHHDR));
635 :
636 429960 : hctl->num_partitions = 0; /* not partitioned */
637 :
638 : /* table has no fixed maximum size */
639 429960 : hctl->max_dsize = NO_MAX_DSIZE;
640 :
641 429960 : hctl->isfixed = false; /* can be enlarged */
642 :
643 : #ifdef HASH_STATISTICS
644 : hctl->accesses = hctl->collisions = hctl->expansions = 0;
645 : #endif
646 429960 : }
647 :
648 : /*
649 : * Given the user-specified entry size, choose nelem_alloc, ie, how many
650 : * elements to add to the hash table when we need more.
651 : */
652 : static int
653 443365 : choose_nelem_alloc(Size entrysize)
654 : {
655 : int nelem_alloc;
656 : Size elementSize;
657 : Size allocSize;
658 :
659 : /* Each element has a HASHELEMENT header plus user data. */
660 : /* NB: this had better match element_alloc() */
661 443365 : elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
662 :
663 : /*
664 : * The idea here is to choose nelem_alloc at least 32, but round up so
665 : * that the allocation request will be a power of 2 or just less. This
666 : * makes little difference for hash tables in shared memory, but for hash
667 : * tables managed by palloc, the allocation request will be rounded up to
668 : * a power of 2 anyway. If we fail to take this into account, we'll waste
669 : * as much as half the allocated space.
670 : */
671 443365 : allocSize = 32 * 4; /* assume elementSize at least 8 */
672 : do
673 : {
674 1700029 : allocSize <<= 1;
675 1700029 : nelem_alloc = allocSize / elementSize;
676 1700029 : } while (nelem_alloc < 32);
677 :
678 443365 : return nelem_alloc;
679 : }
680 :
681 : /*
682 : * Compute derived fields of hctl and build the initial directory/segment
683 : * arrays
684 : */
static bool
init_htab(HTAB *hashp, int64 nelem)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT *segp;
	int			nbuckets;
	int			nsegs;
	int			i;

	/*
	 * initialize mutexes if it's a partitioned table
	 */
	if (IS_PARTITIONED(hctl))
		for (i = 0; i < NUM_FREELISTS; i++)
			SpinLockInit(&(hctl->freeList[i].mutex));

	/*
	 * Allocate space for the next greater power of two number of buckets,
	 * assuming a desired maximum load factor of 1.
	 */
	nbuckets = next_pow2_int(nelem);

	/*
	 * In a partitioned table, nbuckets must be at least equal to
	 * num_partitions; were it less, keys with apparently different partition
	 * numbers would map to the same bucket, breaking partition independence.
	 * (Normally nbuckets will be much bigger; this is just a safety check.)
	 */
	while (nbuckets < hctl->num_partitions)
		nbuckets <<= 1;

	/*
	 * Set up the masks that calc_bucket() uses to map hash values onto the
	 * current set of buckets (nbuckets is a power of 2, so these are all
	 * low-order bit masks).
	 */
	hctl->max_bucket = hctl->low_mask = nbuckets - 1;
	hctl->high_mask = (nbuckets << 1) - 1;

	/*
	 * Figure number of directory segments needed, round up to a power of 2
	 */
	nsegs = (nbuckets - 1) / HASH_SEGSIZE + 1;
	nsegs = next_pow2_int(nsegs);

	/*
	 * Make sure directory is big enough.
	 */
	hctl->dsize = Max(DEF_DIRSIZE, nsegs);

	/* SHM hash tables have a fixed directory. */
	if (hashp->isshared)
		hctl->max_dsize = hctl->dsize;

	/* Allocate a directory; NULL return means allocator ran out of memory */
	hctl->dir = (HASHSEGMENT *)
		hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT), hashp->alloc_arg);
	if (!hctl->dir)
		return false;
	hashp->dir = hctl->dir;

	/* Allocate initial segments */
	for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
	{
		*segp = seg_alloc(hashp);
		if (*segp == NULL)
			return false;
	}

	/* Choose number of entries to allocate at a time */
	hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);

	return true;
}
754 :
755 : /*
756 : * Estimate the space needed for a hashtable containing the given number
757 : * of entries of given size.
758 : * NOTE: this is used to estimate the footprint of hashtables in shared
759 : * memory; therefore it does not count HTAB which is in local memory.
760 : * NB: assumes that all hash structure parameters have default values!
761 : */
Size
hash_estimate_size(int64 num_entries, Size entrysize)
{
	Size		size;
	int64		nBuckets,
				nSegments,
				nDirEntries,
				nElementAllocs,
				elementSize,
				elementAllocCnt;

	/* estimate number of buckets wanted (must mirror init_htab's math) */
	nBuckets = next_pow2_int64(num_entries);
	/* # of segments needed for nBuckets */
	nSegments = next_pow2_int64((nBuckets - 1) / HASH_SEGSIZE + 1);
	/* directory entries */
	nDirEntries = Max(DEF_DIRSIZE, nSegments);

	/* fixed control info */
	size = MAXALIGN(sizeof(HASHHDR));	/* but not HTAB, per above */
	/* directory */
	size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
	/* segments */
	size = add_size(size, mul_size(nSegments,
								   MAXALIGN(HASH_SEGSIZE * sizeof(HASHBUCKET))));
	/* elements --- allocated in groups of choose_nelem_alloc() entries */
	elementAllocCnt = choose_nelem_alloc(entrysize);
	nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
	/*
	 * NOTE(review): add_size/mul_size appear to be overflow-checked shmem
	 * size helpers -- confirm their error behavior in storage/shmem.
	 */
	size = add_size(size,
					mul_size(nElementAllocs,
							 mul_size(elementAllocCnt, elementSize)));

	return size;
}
797 :
798 :
799 : /********************** DESTROY ROUTINES ************************/
800 :
801 : void
802 79886 : hash_destroy(HTAB *hashp)
803 : {
804 79886 : if (hashp != NULL)
805 : {
806 : /* allocation method must be one we know how to free, too */
807 : Assert(hashp->alloc == DynaHashAlloc);
808 : /* so this hashtable must have its own context */
809 : Assert(hashp->hcxt != NULL);
810 :
811 79886 : hash_stats(__func__, hashp);
812 :
813 : /*
814 : * Free everything by destroying the hash table's memory context.
815 : */
816 79886 : MemoryContextDelete(hashp->hcxt);
817 : }
818 79886 : }
819 :
/*
 * hash_stats -- emit usage statistics for a hash table at DEBUG4.
 *
 * "caller" identifies the call site in the log line (may be NULL).
 * This is a complete no-op unless built with HASH_STATISTICS defined.
 */
void
hash_stats(const char *caller, HTAB *hashp)
{
#ifdef HASH_STATISTICS
	HASHHDR    *hctl = hashp->hctl;

	elog(DEBUG4,
		 "hash_stats: Caller: %s Table Name: \"%s\" Accesses: " UINT64_FORMAT " Collisions: " UINT64_FORMAT " Expansions: " UINT64_FORMAT " Entries: " INT64_FORMAT " Key Size: %zu Max Bucket: %u Segment Count: " INT64_FORMAT,
		 caller != NULL ? caller : "(unknown)", hashp->tabname, hctl->accesses,
		 hctl->collisions, hctl->expansions, hash_get_num_entries(hashp),
		 hctl->keysize, hctl->max_bucket, hctl->nsegs);
#endif
}
833 :
834 : /*******************************SEARCH ROUTINES *****************************/
835 :
836 :
837 : /*
838 : * get_hash_value -- exported routine to calculate a key's hash value
839 : *
840 : * We export this because for partitioned tables, callers need to compute
841 : * the partition number (from the low-order bits of the hash value) before
842 : * searching.
843 : */
844 : uint32
845 110055978 : get_hash_value(HTAB *hashp, const void *keyPtr)
846 : {
847 110055978 : return hashp->hash(keyPtr, hashp->keysize);
848 : }
849 :
850 : /* Convert a hash value to a bucket number */
851 : static inline uint32
852 245594187 : calc_bucket(HASHHDR *hctl, uint32 hash_val)
853 : {
854 : uint32 bucket;
855 :
856 245594187 : bucket = hash_val & hctl->high_mask;
857 245594187 : if (bucket > hctl->max_bucket)
858 119662860 : bucket = bucket & hctl->low_mask;
859 :
860 245594187 : return bucket;
861 : }
862 :
863 : /*
864 : * hash_search -- look up key in table and perform action
865 : * hash_search_with_hash_value -- same, with key's hash value already computed
866 : *
867 : * action is one of:
868 : * HASH_FIND: look up key in table
869 : * HASH_ENTER: look up key in table, creating entry if not present
870 : * HASH_ENTER_NULL: same, but return NULL if out of memory
871 : * HASH_REMOVE: look up key in table, remove entry if present
872 : *
873 : * Return value is a pointer to the element found/entered/removed if any,
874 : * or NULL if no match was found. (NB: in the case of the REMOVE action,
875 : * the result is a dangling pointer that shouldn't be dereferenced!)
876 : *
877 : * HASH_ENTER will normally ereport a generic "out of memory" error if
878 : * it is unable to create a new entry. The HASH_ENTER_NULL operation is
879 : * the same except it will return NULL if out of memory.
880 : *
881 : * If foundPtr isn't NULL, then *foundPtr is set true if we found an
882 : * existing entry in the table, false otherwise. This is needed in the
883 : * HASH_ENTER case, but is redundant with the return value otherwise.
884 : *
885 : * For hash_search_with_hash_value, the hashvalue parameter must have been
886 : * calculated with get_hash_value().
887 : */
888 : void *
889 145360302 : hash_search(HTAB *hashp,
890 : const void *keyPtr,
891 : HASHACTION action,
892 : bool *foundPtr)
893 : {
894 145360302 : return hash_search_with_hash_value(hashp,
895 : keyPtr,
896 145360302 : hashp->hash(keyPtr, hashp->keysize),
897 : action,
898 : foundPtr);
899 : }
900 :
void *
hash_search_with_hash_value(HTAB *hashp,
							const void *keyPtr,
							uint32 hashvalue,
							HASHACTION action,
							bool *foundPtr)
{
	HASHHDR    *hctl = hashp->hctl;
	/* freelist to charge/credit for any insertion or removal of this key */
	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
	Size		keysize;
	HASHBUCKET	currBucket;
	HASHBUCKET *prevBucketPtr;
	HashCompareFunc match;

#ifdef HASH_STATISTICS
	hctl->accesses++;
#endif

	/*
	 * If inserting, check if it is time to split a bucket.
	 *
	 * NOTE: failure to expand table is not a fatal error, it just means we
	 * have to run at higher fill factor than we wanted.  However, if we're
	 * using the palloc allocator then it will throw error anyway on
	 * out-of-memory, so we must do this before modifying the table.
	 */
	if (action == HASH_ENTER || action == HASH_ENTER_NULL)
	{
		/*
		 * Can't split if running in partitioned mode, nor if frozen, nor if
		 * table is the subject of any active hash_seq_search scans.
		 */
		if (hctl->freeList[0].nentries > (int64) hctl->max_bucket &&
			!IS_PARTITIONED(hctl) && !hashp->frozen &&
			!has_seq_scans(hashp))
			(void) expand_table(hashp);
	}

	/*
	 * Do the initial lookup
	 */
	(void) hash_initial_lookup(hashp, hashvalue, &prevBucketPtr);
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		/* compare the stored hash first to avoid needless full-key compares */
		if (currBucket->hashvalue == hashvalue &&
			match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hctl->collisions++;
#endif
	}

	if (foundPtr)
		*foundPtr = (bool) (currBucket != NULL);

	/*
	 * OK, now what?
	 */
	switch (action)
	{
		case HASH_FIND:
			if (currBucket != NULL)
				return ELEMENTKEY(currBucket);
			return NULL;

		case HASH_REMOVE:
			if (currBucket != NULL)
			{
				/* if partitioned, must lock to touch nentries and freeList */
				if (IS_PARTITIONED(hctl))
					SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));

				/* delete the record from the appropriate nentries counter. */
				Assert(hctl->freeList[freelist_idx].nentries > 0);
				hctl->freeList[freelist_idx].nentries--;

				/* remove record from hash bucket's chain. */
				*prevBucketPtr = currBucket->link;

				/* add the record to the appropriate freelist. */
				currBucket->link = hctl->freeList[freelist_idx].freeList;
				hctl->freeList[freelist_idx].freeList = currBucket;

				if (IS_PARTITIONED(hctl))
					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

				/*
				 * better hope the caller is synchronizing access to this
				 * element, because someone else is going to reuse it the next
				 * time something is added to the table
				 */
				return ELEMENTKEY(currBucket);
			}
			return NULL;

		case HASH_ENTER:
		case HASH_ENTER_NULL:
			/* Return existing element if found, else create one */
			if (currBucket != NULL)
				return ELEMENTKEY(currBucket);

			/* disallow inserts if frozen */
			if (hashp->frozen)
				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
					 hashp->tabname);

			currBucket = get_hash_entry(hashp, freelist_idx);
			if (currBucket == NULL)
			{
				/* out of memory */
				if (action == HASH_ENTER_NULL)
					return NULL;
				/* report a generic message */
				if (hashp->isshared)
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of shared memory")));
				else
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of memory")));
			}

			/* link into hashbucket chain */
			*prevBucketPtr = currBucket;
			currBucket->link = NULL;

			/* copy key into record */
			currBucket->hashvalue = hashvalue;
			hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);

			/*
			 * Caller is expected to fill the data field on return.  DO NOT
			 * insert any code that could possibly throw error here, as doing
			 * so would leave the table entry incomplete and hence corrupt the
			 * caller's data structure.
			 */

			return ELEMENTKEY(currBucket);
	}

	elog(ERROR, "unrecognized hash action code: %d", (int) action);

	return NULL;				/* keep compiler quiet */
}
1056 :
1057 : /*
1058 : * hash_update_hash_key -- change the hash key of an existing table entry
1059 : *
1060 : * This is equivalent to removing the entry, making a new entry, and copying
1061 : * over its data, except that the entry never goes to the table's freelist.
1062 : * Therefore this cannot suffer an out-of-memory failure, even if there are
1063 : * other processes operating in other partitions of the hashtable.
1064 : *
1065 : * Returns true if successful, false if the requested new hash key is already
1066 : * present. Throws error if the specified entry pointer isn't actually a
1067 : * table member.
1068 : *
1069 : * NB: currently, there is no special case for old and new hash keys being
1070 : * identical, which means we'll report false for that situation. This is
1071 : * preferable for existing uses.
1072 : *
1073 : * NB: for a partitioned hashtable, caller must hold lock on both relevant
1074 : * partitions, if the new hash key would belong to a different partition.
1075 : */
bool
hash_update_hash_key(HTAB *hashp,
					 void *existingEntry,
					 const void *newKeyPtr)
{
	HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
	uint32		newhashvalue;
	Size		keysize;
	uint32		bucket;
	uint32		newbucket;
	HASHBUCKET	currBucket;
	HASHBUCKET *prevBucketPtr;
	HASHBUCKET *oldPrevPtr;
	HashCompareFunc match;

#ifdef HASH_STATISTICS
	HASHHDR    *hctl = hashp->hctl;

	hctl->accesses++;
#endif

	/* disallow updates if frozen */
	if (hashp->frozen)
		elog(ERROR, "cannot update in frozen hashtable \"%s\"",
			 hashp->tabname);

	/*
	 * Lookup the existing element using its saved hash value.  We need to do
	 * this to be able to unlink it from its hash chain, but as a side benefit
	 * we can verify the validity of the passed existingEntry pointer.
	 */
	bucket = hash_initial_lookup(hashp, existingElement->hashvalue,
								 &prevBucketPtr);
	currBucket = *prevBucketPtr;

	/* walk the old chain until we find the element itself (by address) */
	while (currBucket != NULL)
	{
		if (currBucket == existingElement)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
	}

	if (currBucket == NULL)
		elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
			 hashp->tabname);

	/* remember where the element is linked, so we can unlink it below */
	oldPrevPtr = prevBucketPtr;

	/*
	 * Now perform the equivalent of a HASH_ENTER operation to locate the hash
	 * chain we want to put the entry into.
	 */
	newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);
	newbucket = hash_initial_lookup(hashp, newhashvalue, &prevBucketPtr);
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		if (currBucket->hashvalue == newhashvalue &&
			match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hctl->collisions++;
#endif
	}

	if (currBucket != NULL)
		return false;			/* collision with an existing entry */

	/* no duplicate, so the entry being updated becomes the "new" element */
	currBucket = existingElement;

	/*
	 * If old and new hash values belong to the same bucket, we need not
	 * change any chain links, and indeed should not since this simplistic
	 * update will corrupt the list if currBucket is the last element.  (We
	 * cannot fall out earlier, however, since we need to scan the bucket to
	 * check for duplicate keys.)
	 */
	if (bucket != newbucket)
	{
		/* OK to remove record from old hash bucket's chain. */
		*oldPrevPtr = currBucket->link;

		/* link into new hashbucket chain */
		*prevBucketPtr = currBucket;
		currBucket->link = NULL;
	}

	/* copy new key into record */
	currBucket->hashvalue = newhashvalue;
	hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);

	/* rest of record is untouched */

	return true;
}
1181 :
1182 : /*
1183 : * Allocate a new hashtable entry if possible; return NULL if out of memory.
1184 : * (Or, if the underlying space allocator throws error for out-of-memory,
1185 : * we won't return at all.)
1186 : */
static HASHBUCKET
get_hash_entry(HTAB *hashp, int freelist_idx)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHBUCKET	newElement;

	for (;;)
	{
		/* if partitioned, must lock to touch nentries and freeList */
		if (IS_PARTITIONED(hctl))
			SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

		/* try to get an entry from the freelist */
		newElement = hctl->freeList[freelist_idx].freeList;

		/* got one: exit the loop still holding the spinlock (if any) */
		if (newElement != NULL)
			break;

		if (IS_PARTITIONED(hctl))
			SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

		/*
		 * No free elements in this freelist.  In a partitioned table, there
		 * might be entries in other freelists, but to reduce contention we
		 * prefer to first try to get another chunk of buckets from the main
		 * shmem allocator.  If that fails, though, we *MUST* root through all
		 * the other freelists before giving up.  There are multiple callers
		 * that assume that they can allocate every element in the initially
		 * requested table size, or that deleting an element guarantees they
		 * can insert a new element, even if shared memory is entirely full.
		 * Failing because the needed element is in a different freelist is
		 * not acceptable.
		 */
		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
		{
			int			borrow_from_idx;

			if (!IS_PARTITIONED(hctl))
				return NULL;	/* out of memory */

			/* try to borrow element from another freelist */
			borrow_from_idx = freelist_idx;
			for (;;)
			{
				/* cycle through the other freelists, wrapping around */
				borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
				if (borrow_from_idx == freelist_idx)
					break;		/* examined all freelists, fail */

				SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
				newElement = hctl->freeList[borrow_from_idx].freeList;

				if (newElement != NULL)
				{
					/* unlink the borrowed element from the donor freelist */
					hctl->freeList[borrow_from_idx].freeList = newElement->link;
					SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));

					/* careful: count the new element in its proper freelist */
					SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
					hctl->freeList[freelist_idx].nentries++;
					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

					return newElement;
				}

				SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
			}

			/* no elements available to borrow either, so out of memory */
			return NULL;
		}
		/* element_alloc succeeded: loop back and retry the freelist */
	}

	/* remove entry from freelist, bump nentries */
	hctl->freeList[freelist_idx].freeList = newElement->link;
	hctl->freeList[freelist_idx].nentries++;

	if (IS_PARTITIONED(hctl))
		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

	return newElement;
}
1268 :
1269 : /*
1270 : * hash_get_num_entries -- get the number of entries in a hashtable
1271 : */
1272 : int64
1273 67499 : hash_get_num_entries(HTAB *hashp)
1274 : {
1275 : int i;
1276 67499 : int64 sum = hashp->hctl->freeList[0].nentries;
1277 :
1278 : /*
1279 : * We currently don't bother with acquiring the mutexes; it's only
1280 : * sensible to call this function if you've got lock on all partitions of
1281 : * the table.
1282 : */
1283 67499 : if (IS_PARTITIONED(hashp->hctl))
1284 : {
1285 74848 : for (i = 1; i < NUM_FREELISTS; i++)
1286 72509 : sum += hashp->hctl->freeList[i].nentries;
1287 : }
1288 :
1289 67499 : return sum;
1290 : }
1291 :
1292 : /*
1293 : * hash_seq_init/_search/_term
1294 : * Sequentially search through hash table and return
1295 : * all the elements one by one, return NULL when no more.
1296 : *
1297 : * hash_seq_term should be called if and only if the scan is abandoned before
1298 : * completion; if hash_seq_search returns NULL then it has already done the
1299 : * end-of-scan cleanup.
1300 : *
1301 : * NOTE: caller may delete the returned element before continuing the scan.
1302 : * However, deleting any other element while the scan is in progress is
1303 : * UNDEFINED (it might be the one that curIndex is pointing at!). Also,
1304 : * if elements are added to the table while the scan is in progress, it is
1305 : * unspecified whether they will be visited by the scan or not.
1306 : *
1307 : * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
1308 : * worry about hash_seq_term cleanup, if the hashtable is first locked against
1309 : * further insertions by calling hash_freeze.
1310 : *
1311 : * NOTE: to use this with a partitioned hashtable, caller had better hold
1312 : * at least shared lock on all partitions of the table throughout the scan!
1313 : * We can cope with insertions or deletions by our own backend, but *not*
1314 : * with concurrent insertions or deletions by another.
1315 : */
1316 : void
1317 3432932 : hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
1318 : {
1319 3432932 : status->hashp = hashp;
1320 3432932 : status->curBucket = 0;
1321 3432932 : status->curEntry = NULL;
1322 3432932 : status->hasHashvalue = false;
1323 3432932 : if (!hashp->frozen)
1324 3432932 : register_seq_scan(hashp);
1325 3432932 : }
1326 :
1327 : /*
1328 : * Same as above but scan by the given hash value.
1329 : * See also hash_seq_search().
1330 : *
1331 : * NOTE: the default hash function doesn't match syscache hash function.
1332 : * Thus, if you're going to use this function in syscache callback, make sure
1333 : * you're using custom hash function. See relatt_cache_syshash()
1334 : * for example.
1335 : */
1336 : void
1337 1266616 : hash_seq_init_with_hash_value(HASH_SEQ_STATUS *status, HTAB *hashp,
1338 : uint32 hashvalue)
1339 : {
1340 : HASHBUCKET *bucketPtr;
1341 :
1342 1266616 : hash_seq_init(status, hashp);
1343 :
1344 1266616 : status->hasHashvalue = true;
1345 1266616 : status->hashvalue = hashvalue;
1346 :
1347 1266616 : status->curBucket = hash_initial_lookup(hashp, hashvalue, &bucketPtr);
1348 1266616 : status->curEntry = *bucketPtr;
1349 1266616 : }
1350 :
void *
hash_seq_search(HASH_SEQ_STATUS *status)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;
	uint32		max_bucket;
	int64		segment_num;
	int64		segment_ndx;
	HASHSEGMENT segp;
	uint32		curBucket;
	HASHELEMENT *curElem;

	if (status->hasHashvalue)
	{
		/*
		 * Scan entries only in the current bucket because only this bucket
		 * can contain entries with the given hash value.
		 */
		while ((curElem = status->curEntry) != NULL)
		{
			/* advance first, so the caller may delete the returned entry */
			status->curEntry = curElem->link;
			if (status->hashvalue != curElem->hashvalue)
				continue;
			return (void *) ELEMENTKEY(curElem);
		}

		/* bucket exhausted: single-bucket scans end here */
		hash_seq_term(status);
		return NULL;
	}

	if ((curElem = status->curEntry) != NULL)
	{
		/* Continuing scan of curBucket... */
		status->curEntry = curElem->link;
		if (status->curEntry == NULL)	/* end of this bucket */
			++status->curBucket;
		return ELEMENTKEY(curElem);
	}

	/*
	 * Search for next nonempty bucket starting at curBucket.
	 */
	curBucket = status->curBucket;
	hashp = status->hashp;
	hctl = hashp->hctl;
	max_bucket = hctl->max_bucket;

	if (curBucket > max_bucket)
	{
		hash_seq_term(status);
		return NULL;			/* search is done */
	}

	/*
	 * first find the right segment in the table directory.
	 */
	segment_num = curBucket >> HASH_SEGSIZE_SHIFT;
	segment_ndx = MOD(curBucket, HASH_SEGSIZE);

	segp = hashp->dir[segment_num];

	/*
	 * Pick up the first item in this bucket's chain.  If chain is not empty
	 * we can begin searching it.  Otherwise we have to advance to find the
	 * next nonempty bucket.  We try to optimize that case since searching a
	 * near-empty hashtable has to iterate this loop a lot.
	 */
	while ((curElem = segp[segment_ndx]) == NULL)
	{
		/* empty bucket, advance to next */
		if (++curBucket > max_bucket)
		{
			status->curBucket = curBucket;
			hash_seq_term(status);
			return NULL;		/* search is done */
		}
		/* step into the next segment when we run off the end of this one */
		if (++segment_ndx >= HASH_SEGSIZE)
		{
			segment_num++;
			segment_ndx = 0;
			segp = hashp->dir[segment_num];
		}
	}

	/* Begin scan of curBucket... */
	status->curEntry = curElem->link;
	if (status->curEntry == NULL)	/* end of this bucket */
		++curBucket;
	status->curBucket = curBucket;
	return ELEMENTKEY(curElem);
}
1442 :
1443 : void
1444 3432638 : hash_seq_term(HASH_SEQ_STATUS *status)
1445 : {
1446 3432638 : if (!status->hashp->frozen)
1447 3432638 : deregister_seq_scan(status->hashp);
1448 3432638 : }
1449 :
1450 : /*
1451 : * hash_freeze
1452 : * Freeze a hashtable against future insertions (deletions are
1453 : * still allowed)
1454 : *
1455 : * The reason for doing this is that by preventing any more bucket splits,
1456 : * we no longer need to worry about registering hash_seq_search scans,
1457 : * and thus caller need not be careful about ensuring hash_seq_term gets
1458 : * called at the right times.
1459 : *
1460 : * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
1461 : * with active scans (since hash_seq_term would then do the wrong thing).
1462 : */
1463 : void
1464 0 : hash_freeze(HTAB *hashp)
1465 : {
1466 0 : if (hashp->isshared)
1467 0 : elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
1468 0 : if (!hashp->frozen && has_seq_scans(hashp))
1469 0 : elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
1470 : hashp->tabname);
1471 0 : hashp->frozen = true;
1472 0 : }
1473 :
1474 :
1475 : /********************************* UTILITIES ************************/
1476 :
1477 : /*
1478 : * Expand the table by adding one more hash bucket.
1479 : */
static bool
expand_table(HTAB *hashp)
{
	HASHHDR    *hctl = hashp->hctl;
	HASHSEGMENT old_seg,
				new_seg;
	int64		old_bucket,
				new_bucket;
	int64		new_segnum,
				new_segndx;
	int64		old_segnum,
				old_segndx;
	HASHBUCKET *oldlink,
			   *newlink;
	HASHBUCKET	currElement,
				nextElement;

	/* partitioned tables never split buckets (see file header comment) */
	Assert(!IS_PARTITIONED(hctl));

#ifdef HASH_STATISTICS
	hctl->expansions++;
#endif

	/* locate where the new bucket will live in the segment directory */
	new_bucket = hctl->max_bucket + 1;
	new_segnum = new_bucket >> HASH_SEGSIZE_SHIFT;
	new_segndx = MOD(new_bucket, HASH_SEGSIZE);

	if (new_segnum >= hctl->nsegs)
	{
		/* Allocate new segment if necessary -- could fail if dir full */
		if (new_segnum >= hctl->dsize)
			if (!dir_realloc(hashp))
				return false;
		if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
			return false;
		hctl->nsegs++;
	}

	/* OK, we created a new bucket */
	hctl->max_bucket++;

	/*
	 * *Before* changing masks, find old bucket corresponding to same hash
	 * values; values in that bucket may need to be relocated to new bucket.
	 * Note that new_bucket is certainly larger than low_mask at this point,
	 * so we can skip the first step of the regular hash mask calc.
	 */
	old_bucket = (new_bucket & hctl->low_mask);

	/*
	 * If we crossed a power of 2, readjust masks.
	 */
	if ((uint32) new_bucket > hctl->high_mask)
	{
		hctl->low_mask = hctl->high_mask;
		hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
	}

	/*
	 * Relocate records to the new bucket.  NOTE: because of the way the hash
	 * masking is done in calc_bucket, only one old bucket can need to be
	 * split at this point.  With a different way of reducing the hash value,
	 * that might not be true!
	 */
	old_segnum = old_bucket >> HASH_SEGSIZE_SHIFT;
	old_segndx = MOD(old_bucket, HASH_SEGSIZE);

	old_seg = hashp->dir[old_segnum];
	new_seg = hashp->dir[new_segnum];

	/* oldlink/newlink track the tails of the two chains being rebuilt */
	oldlink = &old_seg[old_segndx];
	newlink = &new_seg[new_segndx];

	for (currElement = *oldlink;
		 currElement != NULL;
		 currElement = nextElement)
	{
		nextElement = currElement->link;
		/* re-derive each element's bucket under the (possibly new) masks */
		if ((int64) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
		{
			*oldlink = currElement;
			oldlink = &currElement->link;
		}
		else
		{
			*newlink = currElement;
			newlink = &currElement->link;
		}
	}
	/* don't forget to terminate the rebuilt hash chains... */
	*oldlink = NULL;
	*newlink = NULL;

	return true;
}
1575 :
1576 :
1577 : static bool
1578 0 : dir_realloc(HTAB *hashp)
1579 : {
1580 : HASHSEGMENT *p;
1581 : HASHSEGMENT *old_p;
1582 : int64 new_dsize;
1583 : int64 old_dirsize;
1584 : int64 new_dirsize;
1585 :
1586 0 : if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
1587 0 : return false;
1588 :
1589 : /* Reallocate directory */
1590 0 : new_dsize = hashp->hctl->dsize << 1;
1591 0 : old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
1592 0 : new_dirsize = new_dsize * sizeof(HASHSEGMENT);
1593 :
1594 0 : old_p = hashp->dir;
1595 0 : p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize, hashp->alloc_arg);
1596 :
1597 0 : if (p != NULL)
1598 : {
1599 0 : memcpy(p, old_p, old_dirsize);
1600 0 : MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
1601 0 : hashp->hctl->dir = p;
1602 0 : hashp->dir = p;
1603 0 : hashp->hctl->dsize = new_dsize;
1604 :
1605 : /* XXX assume the allocator is palloc, so we know how to free */
1606 : Assert(hashp->alloc == DynaHashAlloc);
1607 0 : pfree(old_p);
1608 :
1609 0 : return true;
1610 : }
1611 :
1612 0 : return false;
1613 : }
1614 :
1615 :
1616 : static HASHSEGMENT
1617 974215 : seg_alloc(HTAB *hashp)
1618 : {
1619 : HASHSEGMENT segp;
1620 :
1621 974215 : segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * HASH_SEGSIZE, hashp->alloc_arg);
1622 :
1623 974215 : if (!segp)
1624 0 : return NULL;
1625 :
1626 974215 : MemSet(segp, 0, sizeof(HASHBUCKET) * HASH_SEGSIZE);
1627 :
1628 974215 : return segp;
1629 : }
1630 :
1631 : /*
1632 : * allocate some new elements and link them into the indicated free list
1633 : */
/*
 * Returns false if the table is marked fixed-size or the allocator fails;
 * on success, "nelem" new entries have been pushed onto the indicated
 * freelist.
 */
static bool
element_alloc(HTAB *hashp, int nelem, int freelist_idx)
{
	HASHHDR    *hctl = hashp->hctl;
	Size		elementSize;
	Size		requestSize;
	char	   *allocedBlock;
	HASHELEMENT *firstElement;
	HASHELEMENT *tmpElement;
	HASHELEMENT *prevElement;
	int			i;

	/* fixed-size tables may never grow beyond their initial allocation */
	if (hctl->isfixed)
		return false;

	/* Each element has a HASHELEMENT header plus user data. */
	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);

	requestSize = nelem * elementSize;

	/* Add space for slist_node list link if we need one. */
#ifdef USE_VALGRIND
	if (!hashp->isshared)
		requestSize += MAXALIGN(sizeof(slist_node));
#endif

	/* Allocate the memory. */
	allocedBlock = hashp->alloc(requestSize, hashp->alloc_arg);

	if (!allocedBlock)
		return false;

	/*
	 * If USE_VALGRIND, each allocated block of elements of a non-shared
	 * hashtable is chained into a list, so that Valgrind won't think it's
	 * been leaked.
	 */
#ifdef USE_VALGRIND
	if (hashp->isshared)
		firstElement = (HASHELEMENT *) allocedBlock;
	else
	{
		slist_push_head(&hashp->element_blocks, (slist_node *) allocedBlock);
		firstElement = (HASHELEMENT *) (allocedBlock + MAXALIGN(sizeof(slist_node)));
	}
#else
	firstElement = (HASHELEMENT *) allocedBlock;
#endif

	/* prepare to link all the new entries into the freelist */
	prevElement = NULL;
	tmpElement = firstElement;
	for (i = 0; i < nelem; i++)
	{
		/* chain each element to its predecessor; prevElement ends as head */
		tmpElement->link = prevElement;
		prevElement = tmpElement;
		tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
	}

	/* if partitioned, must lock to touch freeList */
	if (IS_PARTITIONED(hctl))
		SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);

	/* freelist could be nonempty if two backends did this concurrently */
	firstElement->link = hctl->freeList[freelist_idx].freeList;
	hctl->freeList[freelist_idx].freeList = prevElement;

	if (IS_PARTITIONED(hctl))
		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);

	return true;
}
1706 :
1707 : /*
1708 : * Do initial lookup of a bucket for the given hash value, retrieving its
1709 : * bucket number and its hash bucket.
1710 : */
1711 : static inline uint32
1712 244947189 : hash_initial_lookup(HTAB *hashp, uint32 hashvalue, HASHBUCKET **bucketptr)
1713 : {
1714 244947189 : HASHHDR *hctl = hashp->hctl;
1715 : HASHSEGMENT segp;
1716 : int64 segment_num;
1717 : int64 segment_ndx;
1718 : uint32 bucket;
1719 :
1720 244947189 : bucket = calc_bucket(hctl, hashvalue);
1721 :
1722 244947189 : segment_num = bucket >> HASH_SEGSIZE_SHIFT;
1723 244947189 : segment_ndx = MOD(bucket, HASH_SEGSIZE);
1724 :
1725 244947189 : segp = hashp->dir[segment_num];
1726 :
1727 244947189 : if (segp == NULL)
1728 0 : hash_corrupted(hashp);
1729 :
1730 244947189 : *bucketptr = &segp[segment_ndx];
1731 244947189 : return bucket;
1732 : }
1733 :
1734 : /* complain when we have detected a corrupted hashtable */
1735 : static void
1736 0 : hash_corrupted(HTAB *hashp)
1737 : {
1738 : /*
1739 : * If the corruption is in a shared hashtable, we'd better force a
1740 : * systemwide restart. Otherwise, just shut down this one backend.
1741 : */
1742 0 : if (hashp->isshared)
1743 0 : elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
1744 : else
1745 0 : elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
1746 : }
1747 :
1748 : /* calculate ceil(log base 2) of num */
1749 : static int
1750 886730 : my_log2(int64 num)
1751 : {
1752 : /*
1753 : * guard against too-large input, which would be invalid for
1754 : * pg_ceil_log2_*()
1755 : */
1756 886730 : if (num > PG_INT64_MAX / 2)
1757 0 : num = PG_INT64_MAX / 2;
1758 :
1759 886730 : return pg_ceil_log2_64(num);
1760 : }
1761 :
1762 : /* calculate first power of 2 >= num, bounded to what will fit in a int64 */
1763 : static int64
1764 26810 : next_pow2_int64(int64 num)
1765 : {
1766 : /* my_log2's internal range check is sufficient */
1767 26810 : return 1L << my_log2(num);
1768 : }
1769 :
1770 : /* calculate first power of 2 >= num, bounded to what will fit in an int */
1771 : static int
1772 859920 : next_pow2_int(int64 num)
1773 : {
1774 859920 : if (num > INT_MAX / 2)
1775 0 : num = INT_MAX / 2;
1776 859920 : return 1 << my_log2(num);
1777 : }
1778 :
1779 :
1780 : /************************* SEQ SCAN TRACKING ************************/
1781 :
1782 : /*
1783 : * We track active hash_seq_search scans here. The need for this mechanism
1784 : * comes from the fact that a scan will get confused if a bucket split occurs
1785 : * while it's in progress: it might visit entries twice, or even miss some
1786 : * entirely (if it's partway through the same bucket that splits). Hence
1787 : * we want to inhibit bucket splits if there are any active scans on the
1788 : * table being inserted into. This is a fairly rare case in current usage,
1789 : * so just postponing the split until the next insertion seems sufficient.
1790 : *
1791 : * Given present usages of the function, only a few scans are likely to be
1792 : * open concurrently; so a finite-size stack of open scans seems sufficient,
1793 : * and we don't worry that linear search is too slow. Note that we do
1794 : * allow multiple scans of the same hashtable to be open concurrently.
1795 : *
1796 : * This mechanism can support concurrent scan and insertion in a shared
1797 : * hashtable if it's the same backend doing both. It would fail otherwise,
1798 : * but locking reasons seem to preclude any such scenario anyway, so we don't
1799 : * worry.
1800 : *
1801 : * This arrangement is reasonably robust if a transient hashtable is deleted
1802 : * without notifying us. The absolute worst case is we might inhibit splits
1803 : * in another table created later at exactly the same address. We will give
1804 : * a warning at transaction end for reference leaks, so any bugs leading to
1805 : * lack of notification should be easy to catch.
1806 : */
1807 :
1808 : #define MAX_SEQ_SCANS 100
1809 :
1810 : static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */
1811 : static int seq_scan_level[MAX_SEQ_SCANS]; /* subtransaction nest level */
1812 : static int num_seq_scans = 0;
1813 :
1814 :
1815 : /* Register a table as having an active hash_seq_search scan */
1816 : static void
1817 3432932 : register_seq_scan(HTAB *hashp)
1818 : {
1819 3432932 : if (num_seq_scans >= MAX_SEQ_SCANS)
1820 0 : elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
1821 : hashp->tabname);
1822 3432932 : seq_scan_tables[num_seq_scans] = hashp;
1823 3432932 : seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
1824 3432932 : num_seq_scans++;
1825 3432932 : }
1826 :
1827 : /* Deregister an active scan */
1828 : static void
1829 3432638 : deregister_seq_scan(HTAB *hashp)
1830 : {
1831 : int i;
1832 :
1833 : /* Search backward since it's most likely at the stack top */
1834 3432638 : for (i = num_seq_scans - 1; i >= 0; i--)
1835 : {
1836 3432638 : if (seq_scan_tables[i] == hashp)
1837 : {
1838 3432638 : seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1839 3432638 : seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1840 3432638 : num_seq_scans--;
1841 3432638 : return;
1842 : }
1843 : }
1844 0 : elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
1845 : hashp->tabname);
1846 : }
1847 :
1848 : /* Check if a table has any active scan */
1849 : static bool
1850 457364 : has_seq_scans(HTAB *hashp)
1851 : {
1852 : int i;
1853 :
1854 457576 : for (i = 0; i < num_seq_scans; i++)
1855 : {
1856 212 : if (seq_scan_tables[i] == hashp)
1857 0 : return true;
1858 : }
1859 457364 : return false;
1860 : }
1861 :
1862 : /* Clean up any open scans at end of transaction */
1863 : void
1864 630754 : AtEOXact_HashTables(bool isCommit)
1865 : {
1866 : /*
1867 : * During abort cleanup, open scans are expected; just silently clean 'em
1868 : * out. An open scan at commit means someone forgot a hash_seq_term()
1869 : * call, so complain.
1870 : *
1871 : * Note: it's tempting to try to print the tabname here, but refrain for
1872 : * fear of touching deallocated memory. This isn't a user-facing message
1873 : * anyway, so it needn't be pretty.
1874 : */
1875 630754 : if (isCommit)
1876 : {
1877 : int i;
1878 :
1879 595424 : for (i = 0; i < num_seq_scans; i++)
1880 : {
1881 0 : elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1882 : seq_scan_tables[i]);
1883 : }
1884 : }
1885 630754 : num_seq_scans = 0;
1886 630754 : }
1887 :
1888 : /* Clean up any open scans at end of subtransaction */
1889 : void
1890 11910 : AtEOSubXact_HashTables(bool isCommit, int nestDepth)
1891 : {
1892 : int i;
1893 :
1894 : /*
1895 : * Search backward to make cleanup easy. Note we must check all entries,
1896 : * not only those at the end of the array, because deletion technique
1897 : * doesn't keep them in order.
1898 : */
1899 11913 : for (i = num_seq_scans - 1; i >= 0; i--)
1900 : {
1901 3 : if (seq_scan_level[i] >= nestDepth)
1902 : {
1903 3 : if (isCommit)
1904 0 : elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1905 : seq_scan_tables[i]);
1906 3 : seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1907 3 : seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1908 3 : num_seq_scans--;
1909 : }
1910 : }
1911 11910 : }
|