Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * dshash.c
4 : * Concurrent hash tables backed by dynamic shared memory areas.
5 : *
6 : * This is an open hashing hash table, with a linked list at each table
7 : * entry. It supports dynamic resizing, as required to prevent the linked
8 : * lists from growing too long on average. Currently, only growing is
9 : * supported: the hash table never becomes smaller.
10 : *
11 : * To deal with concurrency, it has a fixed size set of partitions, each of
12 : * which is independently locked. Each bucket maps to a partition; so insert,
13 : * find and iterate operations normally only acquire one lock. Therefore,
14 : * good concurrency is achieved whenever such operations don't collide at the
15 : * lock partition level. However, when a resize operation begins, all
16 : * partition locks must be acquired simultaneously for a brief period. This
17 : * is only expected to happen a small number of times until a stable size is
18 : * found, since growth is geometric.
19 : *
20 : * Future versions may support iterators and incremental resizing; for now
21 : * the implementation is minimalist.
22 : *
23 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
24 : * Portions Copyright (c) 1994, Regents of the University of California
25 : *
26 : * IDENTIFICATION
27 : * src/backend/lib/dshash.c
28 : *
29 : *-------------------------------------------------------------------------
30 : */
31 :
32 : #include "postgres.h"
33 :
34 : #include "common/hashfn.h"
35 : #include "lib/dshash.h"
36 : #include "storage/lwlock.h"
37 : #include "utils/dsa.h"
38 :
39 : /*
40 : * An item in the hash table. This wraps the user's entry object in an
41 : * envelop that holds a pointer back to the bucket and a pointer to the next
42 : * item in the bucket.
43 : */
44 : struct dshash_table_item
45 : {
46 : /* The next item in the same bucket. */
47 : dsa_pointer next;
48 : /* The hashed key, to avoid having to recompute it. */
49 : dshash_hash hash;
50 : /* The user's entry object follows here. See ENTRY_FROM_ITEM(item). */
51 : };
52 :
53 : /*
54 : * The number of partitions for locking purposes. This is set to match
55 : * NUM_BUFFER_PARTITIONS for now, on the basis that whatever's good enough for
56 : * the buffer pool must be good enough for any other purpose. This could
57 : * become a runtime parameter in future.
58 : */
59 : #define DSHASH_NUM_PARTITIONS_LOG2 7
60 : #define DSHASH_NUM_PARTITIONS (1 << DSHASH_NUM_PARTITIONS_LOG2)
61 :
62 : /* A magic value used to identify our hash tables. */
63 : #define DSHASH_MAGIC 0x75ff6a20
64 :
65 : /*
66 : * Tracking information for each lock partition. Initially, each partition
67 : * corresponds to one bucket, but each time the hash table grows, the buckets
68 : * covered by each partition split so the number of buckets covered doubles.
69 : *
70 : * We might want to add padding here so that each partition is on a different
71 : * cache line, but doing so would bloat this structure considerably.
72 : */
73 : typedef struct dshash_partition
74 : {
75 : LWLock lock; /* Protects all buckets in this partition. */
76 : size_t count; /* # of items in this partition's buckets */
77 : } dshash_partition;
78 :
79 : /*
80 : * The head object for a hash table. This will be stored in dynamic shared
81 : * memory.
82 : */
83 : typedef struct dshash_table_control
84 : {
85 : dshash_table_handle handle;
86 : uint32 magic;
87 : dshash_partition partitions[DSHASH_NUM_PARTITIONS];
88 : int lwlock_tranche_id;
89 :
90 : /*
91 : * The following members are written to only when ALL partitions locks are
92 : * held. They can be read when any one partition lock is held.
93 : */
94 :
95 : /* Number of buckets expressed as power of 2 (8 = 256 buckets). */
96 : size_t size_log2; /* log2(number of buckets) */
97 : dsa_pointer buckets; /* current bucket array */
98 : } dshash_table_control;
99 :
100 : /*
101 : * Per-backend state for a dynamic hash table.
102 : */
103 : struct dshash_table
104 : {
105 : dsa_area *area; /* Backing dynamic shared memory area. */
106 : dshash_parameters params; /* Parameters. */
107 : void *arg; /* User-supplied data pointer. */
108 : dshash_table_control *control; /* Control object in DSM. */
109 : dsa_pointer *buckets; /* Current bucket pointers in DSM. */
110 : size_t size_log2; /* log2(number of buckets) */
111 : };
112 :
113 : /* Given a pointer to an item, find the entry (user data) it holds. */
114 : #define ENTRY_FROM_ITEM(item) \
115 : ((char *)(item) + MAXALIGN(sizeof(dshash_table_item)))
116 :
117 : /* Given a pointer to an entry, find the item that holds it. */
118 : #define ITEM_FROM_ENTRY(entry) \
119 : ((dshash_table_item *)((char *)(entry) - \
120 : MAXALIGN(sizeof(dshash_table_item))))
121 :
122 : /* How many resize operations (bucket splits) have there been? */
123 : #define NUM_SPLITS(size_log2) \
124 : (size_log2 - DSHASH_NUM_PARTITIONS_LOG2)
125 :
126 : /* How many buckets are there in a given size? */
127 : #define NUM_BUCKETS(size_log2) \
128 : (((size_t) 1) << (size_log2))
129 :
130 : /* How many buckets are there in each partition at a given size? */
131 : #define BUCKETS_PER_PARTITION(size_log2) \
132 : (((size_t) 1) << NUM_SPLITS(size_log2))
133 :
134 : /* Max entries before we need to grow. Half + quarter = 75% load factor. */
135 : #define MAX_COUNT_PER_PARTITION(hash_table) \
136 : (BUCKETS_PER_PARTITION(hash_table->size_log2) / 2 + \
137 : BUCKETS_PER_PARTITION(hash_table->size_log2) / 4)
138 :
139 : /* Choose partition based on the highest order bits of the hash. */
140 : #define PARTITION_FOR_HASH(hash) \
141 : (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - DSHASH_NUM_PARTITIONS_LOG2))
142 :
143 : /*
144 : * Find the bucket index for a given hash and table size. Each time the table
145 : * doubles in size, the appropriate bucket for a given hash value doubles and
146 : * possibly adds one, depending on the newly revealed bit, so that all buckets
147 : * are split.
148 : */
149 : #define BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, size_log2) \
150 : (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - (size_log2)))
151 :
152 : /* The index of the first bucket in a given partition. */
153 : #define BUCKET_INDEX_FOR_PARTITION(partition, size_log2) \
154 : ((partition) << NUM_SPLITS(size_log2))
155 :
156 : /* Choose partition based on bucket index. */
157 : #define PARTITION_FOR_BUCKET_INDEX(bucket_idx, size_log2) \
158 : ((bucket_idx) >> NUM_SPLITS(size_log2))
159 :
160 : /* The head of the active bucket for a given hash value (lvalue). */
161 : #define BUCKET_FOR_HASH(hash_table, hash) \
162 : (hash_table->buckets[ \
163 : BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, \
164 : hash_table->size_log2)])
165 :
166 : static void delete_item(dshash_table *hash_table,
167 : dshash_table_item *item);
168 : static void resize(dshash_table *hash_table, size_t new_size_log2);
169 : static inline void ensure_valid_bucket_pointers(dshash_table *hash_table);
170 : static inline dshash_table_item *find_in_bucket(dshash_table *hash_table,
171 : const void *key,
172 : dsa_pointer item_pointer);
173 : static void insert_item_into_bucket(dshash_table *hash_table,
174 : dsa_pointer item_pointer,
175 : dshash_table_item *item,
176 : dsa_pointer *bucket);
177 : static dshash_table_item *insert_into_bucket(dshash_table *hash_table,
178 : const void *key,
179 : dsa_pointer *bucket);
180 : static bool delete_key_from_bucket(dshash_table *hash_table,
181 : const void *key,
182 : dsa_pointer *bucket_head);
183 : static bool delete_item_from_bucket(dshash_table *hash_table,
184 : dshash_table_item *item,
185 : dsa_pointer *bucket_head);
186 : static inline dshash_hash hash_key(dshash_table *hash_table, const void *key);
187 : static inline bool equal_keys(dshash_table *hash_table,
188 : const void *a, const void *b);
189 : static inline void copy_key(dshash_table *hash_table, void *dest,
190 : const void *src);
191 :
192 : #define PARTITION_LOCK(hash_table, i) \
193 : (&(hash_table)->control->partitions[(i)].lock)
194 :
195 : #define ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table) \
196 : Assert(!LWLockAnyHeldByMe(&(hash_table)->control->partitions[0].lock, \
197 : DSHASH_NUM_PARTITIONS, sizeof(dshash_partition)))
198 :
199 : /*
200 : * Create a new hash table backed by the given dynamic shared area, with the
201 : * given parameters. The returned object is allocated in backend-local memory
202 : * using the current MemoryContext. 'arg' will be passed through to the
203 : * compare, hash, and copy functions.
204 : */
205 : dshash_table *
206 2310 : dshash_create(dsa_area *area, const dshash_parameters *params, void *arg)
207 : {
208 : dshash_table *hash_table;
209 : dsa_pointer control;
210 :
211 : /* Allocate the backend-local object representing the hash table. */
212 2310 : hash_table = palloc(sizeof(dshash_table));
213 :
214 : /* Allocate the control object in shared memory. */
215 2310 : control = dsa_allocate(area, sizeof(dshash_table_control));
216 :
217 : /* Set up the local and shared hash table structs. */
218 2310 : hash_table->area = area;
219 2310 : hash_table->params = *params;
220 2310 : hash_table->arg = arg;
221 2310 : hash_table->control = dsa_get_address(area, control);
222 2310 : hash_table->control->handle = control;
223 2310 : hash_table->control->magic = DSHASH_MAGIC;
224 2310 : hash_table->control->lwlock_tranche_id = params->tranche_id;
225 :
226 : /* Set up the array of lock partitions. */
227 : {
228 2310 : dshash_partition *partitions = hash_table->control->partitions;
229 2310 : int tranche_id = hash_table->control->lwlock_tranche_id;
230 : int i;
231 :
232 297990 : for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
233 : {
234 295680 : LWLockInitialize(&partitions[i].lock, tranche_id);
235 295680 : partitions[i].count = 0;
236 : }
237 : }
238 :
239 : /*
240 : * Set up the initial array of buckets. Our initial size is the same as
241 : * the number of partitions.
242 : */
243 2310 : hash_table->control->size_log2 = DSHASH_NUM_PARTITIONS_LOG2;
244 4620 : hash_table->control->buckets =
245 2310 : dsa_allocate_extended(area,
246 : sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS,
247 : DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);
248 2310 : if (!DsaPointerIsValid(hash_table->control->buckets))
249 : {
250 0 : dsa_free(area, control);
251 0 : ereport(ERROR,
252 : (errcode(ERRCODE_OUT_OF_MEMORY),
253 : errmsg("out of memory"),
254 : errdetail("Failed on DSA request of size %zu.",
255 : sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS)));
256 : }
257 4620 : hash_table->buckets = dsa_get_address(area,
258 2310 : hash_table->control->buckets);
259 2310 : hash_table->size_log2 = hash_table->control->size_log2;
260 :
261 2310 : return hash_table;
262 : }
263 :
264 : /*
265 : * Attach to an existing hash table using a handle. The returned object is
266 : * allocated in backend-local memory using the current MemoryContext. 'arg'
267 : * will be passed through to the compare and hash functions.
268 : */
269 : dshash_table *
270 40326 : dshash_attach(dsa_area *area, const dshash_parameters *params,
271 : dshash_table_handle handle, void *arg)
272 : {
273 : dshash_table *hash_table;
274 : dsa_pointer control;
275 :
276 : /* Allocate the backend-local object representing the hash table. */
277 40326 : hash_table = palloc(sizeof(dshash_table));
278 :
279 : /* Find the control object in shared memory. */
280 40326 : control = handle;
281 :
282 : /* Set up the local hash table struct. */
283 40326 : hash_table->area = area;
284 40326 : hash_table->params = *params;
285 40326 : hash_table->arg = arg;
286 40326 : hash_table->control = dsa_get_address(area, control);
287 : Assert(hash_table->control->magic == DSHASH_MAGIC);
288 :
289 : /*
290 : * These will later be set to the correct values by
291 : * ensure_valid_bucket_pointers(), at which time we'll be holding a
292 : * partition lock for interlocking against concurrent resizing.
293 : */
294 40326 : hash_table->buckets = NULL;
295 40326 : hash_table->size_log2 = 0;
296 :
297 40326 : return hash_table;
298 : }
299 :
300 : /*
301 : * Detach from a hash table. This frees backend-local resources associated
302 : * with the hash table, but the hash table will continue to exist until it is
303 : * either explicitly destroyed (by a backend that is still attached to it), or
304 : * the area that backs it is returned to the operating system.
305 : */
306 : void
307 42276 : dshash_detach(dshash_table *hash_table)
308 : {
309 : ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
310 :
311 : /* The hash table may have been destroyed. Just free local memory. */
312 42276 : pfree(hash_table);
313 42276 : }
314 :
315 : /*
316 : * Destroy a hash table, returning all memory to the area. The caller must be
317 : * certain that no other backend will attempt to access the hash table before
318 : * calling this function. Other backend must explicitly call dshash_detach to
319 : * free up backend-local memory associated with the hash table. The backend
320 : * that calls dshash_destroy must not call dshash_detach.
321 : */
322 : void
323 0 : dshash_destroy(dshash_table *hash_table)
324 : {
325 : size_t size;
326 : size_t i;
327 :
328 : Assert(hash_table->control->magic == DSHASH_MAGIC);
329 0 : ensure_valid_bucket_pointers(hash_table);
330 :
331 : /* Free all the entries. */
332 0 : size = NUM_BUCKETS(hash_table->size_log2);
333 0 : for (i = 0; i < size; ++i)
334 : {
335 0 : dsa_pointer item_pointer = hash_table->buckets[i];
336 :
337 0 : while (DsaPointerIsValid(item_pointer))
338 : {
339 : dshash_table_item *item;
340 : dsa_pointer next_item_pointer;
341 :
342 0 : item = dsa_get_address(hash_table->area, item_pointer);
343 0 : next_item_pointer = item->next;
344 0 : dsa_free(hash_table->area, item_pointer);
345 0 : item_pointer = next_item_pointer;
346 : }
347 : }
348 :
349 : /*
350 : * Vandalize the control block to help catch programming errors where
351 : * other backends access the memory formerly occupied by this hash table.
352 : */
353 0 : hash_table->control->magic = 0;
354 :
355 : /* Free the active table and control object. */
356 0 : dsa_free(hash_table->area, hash_table->control->buckets);
357 0 : dsa_free(hash_table->area, hash_table->control->handle);
358 :
359 0 : pfree(hash_table);
360 0 : }
361 :
362 : /*
363 : * Get a handle that can be used by other processes to attach to this hash
364 : * table.
365 : */
366 : dshash_table_handle
367 2310 : dshash_get_hash_table_handle(dshash_table *hash_table)
368 : {
369 : Assert(hash_table->control->magic == DSHASH_MAGIC);
370 :
371 2310 : return hash_table->control->handle;
372 : }
373 :
374 : /*
375 : * Look up an entry, given a key. Returns a pointer to an entry if one can be
376 : * found with the given key. Returns NULL if the key is not found. If a
377 : * non-NULL value is returned, the entry is locked and must be released by
378 : * calling dshash_release_lock. If an error is raised before
379 : * dshash_release_lock is called, the lock will be released automatically, but
380 : * the caller must take care to ensure that the entry is not left corrupted.
381 : * The lock mode is either shared or exclusive depending on 'exclusive'.
382 : *
383 : * The caller must not hold a lock already.
384 : *
385 : * Note that the lock held is in fact an LWLock, so interrupts will be held on
386 : * return from this function, and not resumed until dshash_release_lock is
387 : * called. It is a very good idea for the caller to release the lock quickly.
388 : */
389 : void *
390 1900342 : dshash_find(dshash_table *hash_table, const void *key, bool exclusive)
391 : {
392 : dshash_hash hash;
393 : size_t partition;
394 : dshash_table_item *item;
395 :
396 1900342 : hash = hash_key(hash_table, key);
397 1900342 : partition = PARTITION_FOR_HASH(hash);
398 :
399 : Assert(hash_table->control->magic == DSHASH_MAGIC);
400 : ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
401 :
402 1900342 : LWLockAcquire(PARTITION_LOCK(hash_table, partition),
403 1900342 : exclusive ? LW_EXCLUSIVE : LW_SHARED);
404 1900342 : ensure_valid_bucket_pointers(hash_table);
405 :
406 : /* Search the active bucket. */
407 1900342 : item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));
408 :
409 1900342 : if (!item)
410 : {
411 : /* Not found. */
412 400658 : LWLockRelease(PARTITION_LOCK(hash_table, partition));
413 400658 : return NULL;
414 : }
415 : else
416 : {
417 : /* The caller will free the lock by calling dshash_release_lock. */
418 1499684 : return ENTRY_FROM_ITEM(item);
419 : }
420 : }
421 :
422 : /*
423 : * Returns a pointer to an exclusively locked item which must be released with
424 : * dshash_release_lock. If the key is found in the hash table, 'found' is set
425 : * to true and a pointer to the existing entry is returned. If the key is not
426 : * found, 'found' is set to false, and a pointer to a newly created entry is
427 : * returned.
428 : *
429 : * Notes above dshash_find() regarding locking and error handling equally
430 : * apply here.
431 : */
432 : void *
433 535084 : dshash_find_or_insert(dshash_table *hash_table,
434 : const void *key,
435 : bool *found)
436 : {
437 : dshash_hash hash;
438 : size_t partition_index;
439 : dshash_partition *partition;
440 : dshash_table_item *item;
441 :
442 535084 : hash = hash_key(hash_table, key);
443 535084 : partition_index = PARTITION_FOR_HASH(hash);
444 535084 : partition = &hash_table->control->partitions[partition_index];
445 :
446 : Assert(hash_table->control->magic == DSHASH_MAGIC);
447 : ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
448 :
449 540322 : restart:
450 540322 : LWLockAcquire(PARTITION_LOCK(hash_table, partition_index),
451 : LW_EXCLUSIVE);
452 540322 : ensure_valid_bucket_pointers(hash_table);
453 :
454 : /* Search the active bucket. */
455 540322 : item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));
456 :
457 540322 : if (item)
458 60 : *found = true;
459 : else
460 : {
461 540262 : *found = false;
462 :
463 : /* Check if we are getting too full. */
464 540262 : if (partition->count > MAX_COUNT_PER_PARTITION(hash_table))
465 : {
466 : /*
467 : * The load factor (= keys / buckets) for all buckets protected by
468 : * this partition is > 0.75. Presumably the same applies
469 : * generally across the whole hash table (though we don't attempt
470 : * to track that directly to avoid contention on some kind of
471 : * central counter; we just assume that this partition is
472 : * representative). This is a good time to resize.
473 : *
474 : * Give up our existing lock first, because resizing needs to
475 : * reacquire all the locks in the right order to avoid deadlocks.
476 : */
477 5238 : LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
478 5238 : resize(hash_table, hash_table->size_log2 + 1);
479 :
480 5238 : goto restart;
481 : }
482 :
483 : /* Finally we can try to insert the new item. */
484 535024 : item = insert_into_bucket(hash_table, key,
485 535024 : &BUCKET_FOR_HASH(hash_table, hash));
486 535024 : item->hash = hash;
487 : /* Adjust per-lock-partition counter for load factor knowledge. */
488 535024 : ++partition->count;
489 : }
490 :
491 : /* The caller must release the lock with dshash_release_lock. */
492 535084 : return ENTRY_FROM_ITEM(item);
493 : }
494 :
495 : /*
496 : * Remove an entry by key. Returns true if the key was found and the
497 : * corresponding entry was removed.
498 : *
499 : * To delete an entry that you already have a pointer to, see
500 : * dshash_delete_entry.
501 : */
502 : bool
503 282 : dshash_delete_key(dshash_table *hash_table, const void *key)
504 : {
505 : dshash_hash hash;
506 : size_t partition;
507 : bool found;
508 :
509 : Assert(hash_table->control->magic == DSHASH_MAGIC);
510 : ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
511 :
512 282 : hash = hash_key(hash_table, key);
513 282 : partition = PARTITION_FOR_HASH(hash);
514 :
515 282 : LWLockAcquire(PARTITION_LOCK(hash_table, partition), LW_EXCLUSIVE);
516 282 : ensure_valid_bucket_pointers(hash_table);
517 :
518 282 : if (delete_key_from_bucket(hash_table, key,
519 282 : &BUCKET_FOR_HASH(hash_table, hash)))
520 : {
521 : Assert(hash_table->control->partitions[partition].count > 0);
522 212 : found = true;
523 212 : --hash_table->control->partitions[partition].count;
524 : }
525 : else
526 70 : found = false;
527 :
528 282 : LWLockRelease(PARTITION_LOCK(hash_table, partition));
529 :
530 282 : return found;
531 : }
532 :
533 : /*
534 : * Remove an entry. The entry must already be exclusively locked, and must
535 : * have been obtained by dshash_find or dshash_find_or_insert. Note that this
536 : * function releases the lock just like dshash_release_lock.
537 : *
538 : * To delete an entry by key, see dshash_delete_key.
539 : */
540 : void
541 92756 : dshash_delete_entry(dshash_table *hash_table, void *entry)
542 : {
543 92756 : dshash_table_item *item = ITEM_FROM_ENTRY(entry);
544 92756 : size_t partition = PARTITION_FOR_HASH(item->hash);
545 :
546 : Assert(hash_table->control->magic == DSHASH_MAGIC);
547 : Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition),
548 : LW_EXCLUSIVE));
549 :
550 92756 : delete_item(hash_table, item);
551 92756 : LWLockRelease(PARTITION_LOCK(hash_table, partition));
552 92756 : }
553 :
554 : /*
555 : * Unlock an entry which was locked by dshash_find or dshash_find_or_insert.
556 : */
557 : void
558 1942012 : dshash_release_lock(dshash_table *hash_table, void *entry)
559 : {
560 1942012 : dshash_table_item *item = ITEM_FROM_ENTRY(entry);
561 1942012 : size_t partition_index = PARTITION_FOR_HASH(item->hash);
562 :
563 : Assert(hash_table->control->magic == DSHASH_MAGIC);
564 :
565 1942012 : LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
566 1942012 : }
567 :
568 : /*
569 : * A compare function that forwards to memcmp.
570 : */
571 : int
572 372 : dshash_memcmp(const void *a, const void *b, size_t size, void *arg)
573 : {
574 372 : return memcmp(a, b, size);
575 : }
576 :
577 : /*
578 : * A hash function that forwards to tag_hash.
579 : */
580 : dshash_hash
581 1058 : dshash_memhash(const void *v, size_t size, void *arg)
582 : {
583 1058 : return tag_hash(v, size);
584 : }
585 :
586 : /*
587 : * A copy function that forwards to memcpy.
588 : */
589 : void
590 535006 : dshash_memcpy(void *dest, const void *src, size_t size, void *arg)
591 : {
592 535006 : (void) memcpy(dest, src, size);
593 535006 : }
594 :
595 : /*
596 : * A compare function that forwards to strcmp.
597 : */
598 : int
599 22 : dshash_strcmp(const void *a, const void *b, size_t size, void *arg)
600 : {
601 : Assert(strlen((const char *) a) < size);
602 : Assert(strlen((const char *) b) < size);
603 :
604 22 : return strcmp((const char *) a, (const char *) b);
605 : }
606 :
607 : /*
608 : * A hash function that forwards to string_hash.
609 : */
610 : dshash_hash
611 40 : dshash_strhash(const void *v, size_t size, void *arg)
612 : {
613 : Assert(strlen((const char *) v) < size);
614 :
615 40 : return string_hash((const char *) v, size);
616 : }
617 :
618 : /*
619 : * A copy function that forwards to strcpy.
620 : */
621 : void
622 18 : dshash_strcpy(void *dest, const void *src, size_t size, void *arg)
623 : {
624 : Assert(strlen((const char *) src) < size);
625 :
626 18 : (void) strcpy((char *) dest, (const char *) src);
627 18 : }
628 :
629 : /*
630 : * Sequentially scan through dshash table and return all the elements one by
631 : * one, return NULL when all elements have been returned.
632 : *
633 : * dshash_seq_term needs to be called when a scan finished. The caller may
634 : * delete returned elements midst of a scan by using dshash_delete_current()
635 : * if exclusive = true.
636 : */
637 : void
638 1678 : dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
639 : bool exclusive)
640 : {
641 1678 : status->hash_table = hash_table;
642 1678 : status->curbucket = 0;
643 1678 : status->nbuckets = 0;
644 1678 : status->curitem = NULL;
645 1678 : status->pnextitem = InvalidDsaPointer;
646 1678 : status->curpartition = -1;
647 1678 : status->exclusive = exclusive;
648 1678 : }
649 :
650 : /*
651 : * Returns the next element.
652 : *
653 : * Returned elements are locked and the caller may not release the lock. It is
654 : * released by future calls to dshash_seq_next() or dshash_seq_term().
655 : */
656 : void *
657 403744 : dshash_seq_next(dshash_seq_status *status)
658 : {
659 : dsa_pointer next_item_pointer;
660 :
661 : /*
662 : * Not yet holding any partition locks. Need to determine the size of the
663 : * hash table, it could have been resized since we were looking last.
664 : * Since we iterate in partition order, we can start by unconditionally
665 : * lock partition 0.
666 : *
667 : * Once we hold the lock, no resizing can happen until the scan ends. So
668 : * we don't need to repeatedly call ensure_valid_bucket_pointers().
669 : */
670 403744 : if (status->curpartition == -1)
671 : {
672 : Assert(status->curbucket == 0);
673 : ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(status->hash_table);
674 :
675 1678 : status->curpartition = 0;
676 :
677 1678 : LWLockAcquire(PARTITION_LOCK(status->hash_table,
678 : status->curpartition),
679 1678 : status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
680 :
681 1678 : ensure_valid_bucket_pointers(status->hash_table);
682 :
683 1678 : status->nbuckets =
684 1678 : NUM_BUCKETS(status->hash_table->control->size_log2);
685 1678 : next_item_pointer = status->hash_table->buckets[status->curbucket];
686 : }
687 : else
688 402066 : next_item_pointer = status->pnextitem;
689 :
690 : Assert(LWLockHeldByMeInMode(PARTITION_LOCK(status->hash_table,
691 : status->curpartition),
692 : status->exclusive ? LW_EXCLUSIVE : LW_SHARED));
693 :
694 : /* Move to the next bucket if we finished the current bucket */
695 2599570 : while (!DsaPointerIsValid(next_item_pointer))
696 : {
697 : int next_partition;
698 :
699 2197504 : if (++status->curbucket >= status->nbuckets)
700 : {
701 : /* all buckets have been scanned. finish. */
702 1678 : return NULL;
703 : }
704 :
705 : /* Check if move to the next partition */
706 2195826 : next_partition =
707 2195826 : PARTITION_FOR_BUCKET_INDEX(status->curbucket,
708 : status->hash_table->size_log2);
709 :
710 2195826 : if (status->curpartition != next_partition)
711 : {
712 : /*
713 : * Move to the next partition. Lock the next partition then
714 : * release the current, not in the reverse order to avoid
715 : * concurrent resizing. Avoid dead lock by taking lock in the
716 : * same order with resize().
717 : */
718 213106 : LWLockAcquire(PARTITION_LOCK(status->hash_table,
719 : next_partition),
720 213106 : status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
721 213106 : LWLockRelease(PARTITION_LOCK(status->hash_table,
722 : status->curpartition));
723 213106 : status->curpartition = next_partition;
724 : }
725 :
726 2195826 : next_item_pointer = status->hash_table->buckets[status->curbucket];
727 : }
728 :
729 402066 : status->curitem =
730 402066 : dsa_get_address(status->hash_table->area, next_item_pointer);
731 :
732 : /*
733 : * The caller may delete the item. Store the next item in case of
734 : * deletion.
735 : */
736 402066 : status->pnextitem = status->curitem->next;
737 :
738 402066 : return ENTRY_FROM_ITEM(status->curitem);
739 : }
740 :
741 : /*
742 : * Terminates the seqscan and release all locks.
743 : *
744 : * Needs to be called after finishing or when exiting a seqscan.
745 : */
746 : void
747 1678 : dshash_seq_term(dshash_seq_status *status)
748 : {
749 1678 : if (status->curpartition >= 0)
750 1678 : LWLockRelease(PARTITION_LOCK(status->hash_table, status->curpartition));
751 1678 : }
752 :
753 : /*
754 : * Remove the current entry of the seq scan.
755 : */
756 : void
757 1934 : dshash_delete_current(dshash_seq_status *status)
758 : {
759 1934 : dshash_table *hash_table = status->hash_table;
760 1934 : dshash_table_item *item = status->curitem;
761 : size_t partition PG_USED_FOR_ASSERTS_ONLY;
762 :
763 1934 : partition = PARTITION_FOR_HASH(item->hash);
764 :
765 : Assert(status->exclusive);
766 : Assert(hash_table->control->magic == DSHASH_MAGIC);
767 : Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition),
768 : LW_EXCLUSIVE));
769 :
770 1934 : delete_item(hash_table, item);
771 1934 : }
772 :
773 : /*
774 : * Print debugging information about the internal state of the hash table to
775 : * stderr. The caller must hold no partition locks.
776 : */
777 : void
778 0 : dshash_dump(dshash_table *hash_table)
779 : {
780 : size_t i;
781 : size_t j;
782 :
783 : Assert(hash_table->control->magic == DSHASH_MAGIC);
784 : ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
785 :
786 0 : for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
787 : {
788 : Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
789 0 : LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_SHARED);
790 : }
791 :
792 0 : ensure_valid_bucket_pointers(hash_table);
793 :
794 0 : fprintf(stderr,
795 0 : "hash table size = %zu\n", (size_t) 1 << hash_table->size_log2);
796 0 : for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
797 : {
798 0 : dshash_partition *partition = &hash_table->control->partitions[i];
799 0 : size_t begin = BUCKET_INDEX_FOR_PARTITION(i, hash_table->size_log2);
800 0 : size_t end = BUCKET_INDEX_FOR_PARTITION(i + 1, hash_table->size_log2);
801 :
802 0 : fprintf(stderr, " partition %zu\n", i);
803 0 : fprintf(stderr,
804 : " active buckets (key count = %zu)\n", partition->count);
805 :
806 0 : for (j = begin; j < end; ++j)
807 : {
808 0 : size_t count = 0;
809 0 : dsa_pointer bucket = hash_table->buckets[j];
810 :
811 0 : while (DsaPointerIsValid(bucket))
812 : {
813 : dshash_table_item *item;
814 :
815 0 : item = dsa_get_address(hash_table->area, bucket);
816 :
817 0 : bucket = item->next;
818 0 : ++count;
819 : }
820 0 : fprintf(stderr, " bucket %zu (key count = %zu)\n", j, count);
821 : }
822 : }
823 :
824 0 : for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
825 0 : LWLockRelease(PARTITION_LOCK(hash_table, i));
826 0 : }
827 :
828 : /*
829 : * Delete a locked item to which we have a pointer.
830 : */
831 : static void
832 94690 : delete_item(dshash_table *hash_table, dshash_table_item *item)
833 : {
834 94690 : size_t hash = item->hash;
835 94690 : size_t partition = PARTITION_FOR_HASH(hash);
836 :
837 : Assert(LWLockHeldByMe(PARTITION_LOCK(hash_table, partition)));
838 :
839 94690 : if (delete_item_from_bucket(hash_table, item,
840 94690 : &BUCKET_FOR_HASH(hash_table, hash)))
841 : {
842 : Assert(hash_table->control->partitions[partition].count > 0);
843 94690 : --hash_table->control->partitions[partition].count;
844 : }
845 : else
846 : {
847 : Assert(false);
848 : }
849 94690 : }
850 :
851 : /*
852 : * Grow the hash table if necessary to the requested number of buckets. The
853 : * requested size must be double some previously observed size.
854 : *
855 : * Must be called without any partition lock held.
856 : */
857 : static void
858 5238 : resize(dshash_table *hash_table, size_t new_size_log2)
859 : {
860 : dsa_pointer old_buckets;
861 : dsa_pointer new_buckets_shared;
862 : dsa_pointer *new_buckets;
863 : size_t size;
864 5238 : size_t new_size = ((size_t) 1) << new_size_log2;
865 : size_t i;
866 :
867 : /*
868 : * Acquire the locks for all lock partitions. This is expensive, but we
869 : * shouldn't have to do it many times.
870 : */
871 675702 : for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
872 : {
873 : Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
874 :
875 670464 : LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_EXCLUSIVE);
876 670464 : if (i == 0 && hash_table->control->size_log2 >= new_size_log2)
877 : {
878 : /*
879 : * Another backend has already increased the size; we can avoid
880 : * obtaining all the locks and return early.
881 : */
882 0 : LWLockRelease(PARTITION_LOCK(hash_table, 0));
883 0 : return;
884 : }
885 : }
886 :
887 : Assert(new_size_log2 == hash_table->control->size_log2 + 1);
888 :
889 : /* Allocate the space for the new table. */
890 : new_buckets_shared =
891 5238 : dsa_allocate_extended(hash_table->area,
892 : sizeof(dsa_pointer) * new_size,
893 : DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
894 5238 : new_buckets = dsa_get_address(hash_table->area, new_buckets_shared);
895 :
896 : /*
897 : * We've allocated the new bucket array; all that remains to do now is to
898 : * reinsert all items, which amounts to adjusting all the pointers.
899 : */
900 5238 : size = ((size_t) 1) << hash_table->control->size_log2;
901 2380406 : for (i = 0; i < size; ++i)
902 : {
903 2375168 : dsa_pointer item_pointer = hash_table->buckets[i];
904 :
905 2563284 : while (DsaPointerIsValid(item_pointer))
906 : {
907 : dshash_table_item *item;
908 : dsa_pointer next_item_pointer;
909 :
910 188116 : item = dsa_get_address(hash_table->area, item_pointer);
911 188116 : next_item_pointer = item->next;
912 188116 : insert_item_into_bucket(hash_table, item_pointer, item,
913 188116 : &new_buckets[BUCKET_INDEX_FOR_HASH_AND_SIZE(item->hash,
914 : new_size_log2)]);
915 188116 : item_pointer = next_item_pointer;
916 : }
917 : }
918 :
919 : /* Swap the hash table into place and free the old one. */
920 5238 : old_buckets = hash_table->control->buckets;
921 5238 : hash_table->control->buckets = new_buckets_shared;
922 5238 : hash_table->control->size_log2 = new_size_log2;
923 5238 : hash_table->buckets = new_buckets;
924 5238 : dsa_free(hash_table->area, old_buckets);
925 :
926 : /* Release all the locks. */
927 675702 : for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
928 670464 : LWLockRelease(PARTITION_LOCK(hash_table, i));
929 : }
930 :
931 : /*
932 : * Make sure that our backend-local bucket pointers are up to date. The
933 : * caller must have locked one lock partition, which prevents resize() from
934 : * running concurrently.
935 : */
936 : static inline void
937 2442624 : ensure_valid_bucket_pointers(dshash_table *hash_table)
938 : {
939 2442624 : if (hash_table->size_log2 != hash_table->control->size_log2)
940 : {
941 80968 : hash_table->buckets = dsa_get_address(hash_table->area,
942 40484 : hash_table->control->buckets);
943 40484 : hash_table->size_log2 = hash_table->control->size_log2;
944 : }
945 2442624 : }
946 :
947 : /*
948 : * Scan a locked bucket for a match, using the provided compare function.
949 : */
950 : static inline dshash_table_item *
951 2440664 : find_in_bucket(dshash_table *hash_table, const void *key,
952 : dsa_pointer item_pointer)
953 : {
954 2766478 : while (DsaPointerIsValid(item_pointer))
955 : {
956 : dshash_table_item *item;
957 :
958 1825558 : item = dsa_get_address(hash_table->area, item_pointer);
959 1825558 : if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
960 1499744 : return item;
961 325814 : item_pointer = item->next;
962 : }
963 940920 : return NULL;
964 : }
965 :
966 : /*
967 : * Insert an already-allocated item into a bucket.
968 : */
969 : static void
970 723140 : insert_item_into_bucket(dshash_table *hash_table,
971 : dsa_pointer item_pointer,
972 : dshash_table_item *item,
973 : dsa_pointer *bucket)
974 : {
975 : Assert(item == dsa_get_address(hash_table->area, item_pointer));
976 :
977 723140 : item->next = *bucket;
978 723140 : *bucket = item_pointer;
979 723140 : }
980 :
981 : /*
982 : * Allocate space for an entry with the given key and insert it into the
983 : * provided bucket.
984 : */
985 : static dshash_table_item *
986 535024 : insert_into_bucket(dshash_table *hash_table,
987 : const void *key,
988 : dsa_pointer *bucket)
989 : {
990 : dsa_pointer item_pointer;
991 : dshash_table_item *item;
992 :
993 535024 : item_pointer = dsa_allocate(hash_table->area,
994 : hash_table->params.entry_size +
995 : MAXALIGN(sizeof(dshash_table_item)));
996 535024 : item = dsa_get_address(hash_table->area, item_pointer);
997 535024 : copy_key(hash_table, ENTRY_FROM_ITEM(item), key);
998 535024 : insert_item_into_bucket(hash_table, item_pointer, item, bucket);
999 535024 : return item;
1000 : }
1001 :
1002 : /*
1003 : * Search a bucket for a matching key and delete it.
1004 : */
1005 : static bool
1006 282 : delete_key_from_bucket(dshash_table *hash_table,
1007 : const void *key,
1008 : dsa_pointer *bucket_head)
1009 : {
1010 282 : while (DsaPointerIsValid(*bucket_head))
1011 : {
1012 : dshash_table_item *item;
1013 :
1014 212 : item = dsa_get_address(hash_table->area, *bucket_head);
1015 :
1016 212 : if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
1017 : {
1018 : dsa_pointer next;
1019 :
1020 212 : next = item->next;
1021 212 : dsa_free(hash_table->area, *bucket_head);
1022 212 : *bucket_head = next;
1023 :
1024 212 : return true;
1025 : }
1026 0 : bucket_head = &item->next;
1027 : }
1028 70 : return false;
1029 : }
1030 :
1031 : /*
1032 : * Delete the specified item from the bucket.
1033 : */
1034 : static bool
1035 94690 : delete_item_from_bucket(dshash_table *hash_table,
1036 : dshash_table_item *item,
1037 : dsa_pointer *bucket_head)
1038 : {
1039 95200 : while (DsaPointerIsValid(*bucket_head))
1040 : {
1041 : dshash_table_item *bucket_item;
1042 :
1043 95200 : bucket_item = dsa_get_address(hash_table->area, *bucket_head);
1044 :
1045 95200 : if (bucket_item == item)
1046 : {
1047 : dsa_pointer next;
1048 :
1049 94690 : next = item->next;
1050 94690 : dsa_free(hash_table->area, *bucket_head);
1051 94690 : *bucket_head = next;
1052 94690 : return true;
1053 : }
1054 510 : bucket_head = &bucket_item->next;
1055 : }
1056 0 : return false;
1057 : }
1058 :
1059 : /*
1060 : * Compute the hash value for a key.
1061 : */
1062 : static inline dshash_hash
1063 2435708 : hash_key(dshash_table *hash_table, const void *key)
1064 : {
1065 2435708 : return hash_table->params.hash_function(key,
1066 : hash_table->params.key_size,
1067 : hash_table->arg);
1068 : }
1069 :
1070 : /*
1071 : * Check whether two keys compare equal.
1072 : */
1073 : static inline bool
1074 1825770 : equal_keys(dshash_table *hash_table, const void *a, const void *b)
1075 : {
1076 1825770 : return hash_table->params.compare_function(a, b,
1077 : hash_table->params.key_size,
1078 1825770 : hash_table->arg) == 0;
1079 : }
1080 :
1081 : /*
1082 : * Copy a key.
1083 : */
1084 : static inline void
1085 535024 : copy_key(dshash_table *hash_table, void *dest, const void *src)
1086 : {
1087 535024 : hash_table->params.copy_function(dest, src,
1088 : hash_table->params.key_size,
1089 : hash_table->arg);
1090 535024 : }
|