Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * dshash.c
4 : * Concurrent hash tables backed by dynamic shared memory areas.
5 : *
6 : * This is an open hashing hash table, with a linked list at each table
7 : * entry. It supports dynamic resizing, as required to prevent the linked
8 : * lists from growing too long on average. Currently, only growing is
9 : * supported: the hash table never becomes smaller.
10 : *
11 : * To deal with concurrency, it has a fixed-size set of partitions, each of
12 : * which is independently locked. Each bucket maps to a partition; so insert,
13 : * find and iterate operations normally only acquire one lock. Therefore,
14 : * good concurrency is achieved whenever such operations don't collide at the
15 : * lock partition level. However, when a resize operation begins, all
16 : * partition locks must be acquired simultaneously for a brief period. This
17 : * is only expected to happen a small number of times until a stable size is
18 : * found, since growth is geometric.
19 : *
20 : * Sequential scans are supported (see dshash_seq_init() and friends), but
21 : * resizing is not incremental; for now the implementation is minimalist.
22 : *
23 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
24 : * Portions Copyright (c) 1994, Regents of the University of California
25 : *
26 : * IDENTIFICATION
27 : * src/backend/lib/dshash.c
28 : *
29 : *-------------------------------------------------------------------------
30 : */
31 :
32 : #include "postgres.h"
33 :
34 : #include <limits.h>
35 :
36 : #include "common/hashfn.h"
37 : #include "lib/dshash.h"
38 : #include "storage/lwlock.h"
39 : #include "utils/dsa.h"
40 :
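/*
 * Editor's illustrative sketch, not part of dshash.c: one plausible way for a
 * caller to set up a table with the API declared in dshash.h.  The entry
 * struct, parameter struct, tranche ID value, and helper functions below are
 * hypothetical; only the dshash_* names and dshash_parameters field names are
 * taken from the real API.
 */
typedef struct ExampleEntry
{
	Oid			key;			/* key must occupy the first key_size bytes */
	int64		counter;		/* arbitrary caller payload */
} ExampleEntry;

static const dshash_parameters example_params = {
	.key_size = sizeof(Oid),
	.entry_size = sizeof(ExampleEntry),
	.compare_function = dshash_memcmp,
	.hash_function = dshash_memhash,
	.copy_function = dshash_memcpy,
	.tranche_id = 0				/* substitute a real LWLock tranche ID */
};

static dshash_table_handle
example_create(dsa_area *area)
{
	dshash_table *table;

	/* Create the table and publish its handle for other backends to use. */
	table = dshash_create(area, &example_params, NULL);
	return dshash_get_hash_table_handle(table);
}

static dshash_table *
example_attach(dsa_area *area, dshash_table_handle handle)
{
	/* Other backends attach using the same parameters and the handle. */
	return dshash_attach(area, &example_params, handle, NULL);
}
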
41 : /*
42 : * An item in the hash table. This wraps the user's entry object in an
43 : * envelope that holds the precomputed hash of the key and a pointer to the
44 : * next item in the same bucket.
45 : */
46 : struct dshash_table_item
47 : {
48 : /* The next item in the same bucket. */
49 : dsa_pointer next;
50 : /* The hashed key, to avoid having to recompute it. */
51 : dshash_hash hash;
52 : /* The user's entry object follows here. See ENTRY_FROM_ITEM(item). */
53 : };
54 :
55 : /*
56 : * The number of partitions for locking purposes. This is set to match
57 : * NUM_BUFFER_PARTITIONS for now, on the basis that whatever's good enough for
58 : * the buffer pool must be good enough for any other purpose. This could
59 : * become a runtime parameter in future.
60 : */
61 : #define DSHASH_NUM_PARTITIONS_LOG2 7
62 : #define DSHASH_NUM_PARTITIONS (1 << DSHASH_NUM_PARTITIONS_LOG2)
63 :
64 : /* A magic value used to identify our hash tables. */
65 : #define DSHASH_MAGIC 0x75ff6a20
66 :
67 : /*
68 : * Tracking information for each lock partition. Initially, each partition
69 : * corresponds to one bucket, but each time the hash table grows, the buckets
70 : * covered by each partition split so the number of buckets covered doubles.
71 : *
72 : * We might want to add padding here so that each partition is on a different
73 : * cache line, but doing so would bloat this structure considerably.
74 : */
75 : typedef struct dshash_partition
76 : {
77 : LWLock lock; /* Protects all buckets in this partition. */
78 : size_t count; /* # of items in this partition's buckets */
79 : } dshash_partition;
80 :
81 : /*
82 : * The head object for a hash table. This will be stored in dynamic shared
83 : * memory.
84 : */
85 : typedef struct dshash_table_control
86 : {
87 : dshash_table_handle handle;
88 : uint32 magic;
89 : dshash_partition partitions[DSHASH_NUM_PARTITIONS];
90 : int lwlock_tranche_id;
91 :
92 : /*
93 : * The following members are written to only when ALL partition locks are
94 : * held. They can be read when any one partition lock is held.
95 : */
96 :
97 : /* Number of buckets expressed as its base-2 logarithm (8 = 256 buckets). */
98 : size_t size_log2; /* log2(number of buckets) */
99 : dsa_pointer buckets; /* current bucket array */
100 : } dshash_table_control;
101 :
102 : /*
103 : * Per-backend state for a dynamic hash table.
104 : */
105 : struct dshash_table
106 : {
107 : dsa_area *area; /* Backing dynamic shared memory area. */
108 : dshash_parameters params; /* Parameters. */
109 : void *arg; /* User-supplied data pointer. */
110 : dshash_table_control *control; /* Control object in DSM. */
111 : dsa_pointer *buckets; /* Current bucket pointers in DSM. */
112 : size_t size_log2; /* log2(number of buckets) */
113 : };
114 :
115 : /* Given a pointer to an item, find the entry (user data) it holds. */
116 : #define ENTRY_FROM_ITEM(item) \
117 : ((char *)(item) + MAXALIGN(sizeof(dshash_table_item)))
118 :
119 : /* Given a pointer to an entry, find the item that holds it. */
120 : #define ITEM_FROM_ENTRY(entry) \
121 : ((dshash_table_item *)((char *)(entry) - \
122 : MAXALIGN(sizeof(dshash_table_item))))
123 :
124 : /* How many resize operations (bucket splits) have there been? */
125 : #define NUM_SPLITS(size_log2) \
126 : (size_log2 - DSHASH_NUM_PARTITIONS_LOG2)
127 :
128 : /* How many buckets are there in a given size? */
129 : #define NUM_BUCKETS(size_log2) \
130 : (((size_t) 1) << (size_log2))
131 :
132 : /* How many buckets are there in each partition at a given size? */
133 : #define BUCKETS_PER_PARTITION(size_log2) \
134 : (((size_t) 1) << NUM_SPLITS(size_log2))
135 :
136 : /* Max entries before we need to grow. Half + quarter = 75% load factor. */
137 : #define MAX_COUNT_PER_PARTITION(hash_table) \
138 : (BUCKETS_PER_PARTITION(hash_table->size_log2) / 2 + \
139 : BUCKETS_PER_PARTITION(hash_table->size_log2) / 4)
140 :
141 : /* Choose partition based on the highest order bits of the hash. */
142 : #define PARTITION_FOR_HASH(hash) \
143 : (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - DSHASH_NUM_PARTITIONS_LOG2))
144 :
145 : /*
146 : * Find the bucket index for a given hash and table size. Each time the table
147 : * doubles in size, the appropriate bucket for a given hash value doubles and
148 : * possibly adds one, depending on the newly revealed bit, so that all buckets
149 : * are split.
150 : */
151 : #define BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, size_log2) \
152 : (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - (size_log2)))
153 :
154 : /* The index of the first bucket in a given partition. */
155 : #define BUCKET_INDEX_FOR_PARTITION(partition, size_log2) \
156 : ((partition) << NUM_SPLITS(size_log2))
157 :
158 : /* Choose partition based on bucket index. */
159 : #define PARTITION_FOR_BUCKET_INDEX(bucket_idx, size_log2) \
160 : ((bucket_idx) >> NUM_SPLITS(size_log2))
161 :
162 : /* The head of the active bucket for a given hash value (lvalue). */
163 : #define BUCKET_FOR_HASH(hash_table, hash) \
164 : (hash_table->buckets[ \
165 : BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, \
166 : hash_table->size_log2)])
167 :
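/*
 * Editor's note (illustrative, not part of dshash.c): assuming dshash_hash is
 * 32 bits wide, and with DSHASH_NUM_PARTITIONS_LOG2 = 7, the macros above
 * behave as follows for an arbitrary example value hash = 0xABCD1234:
 *
 *   PARTITION_FOR_HASH(hash)                  = hash >> 25 = 85
 *   BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, 7)   = hash >> 25 = 85
 *   BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, 9)   = hash >> 23 = 343
 *   PARTITION_FOR_BUCKET_INDEX(343, 9)        = 343 >> 2   = 85
 *
 * The top 7 bits of the hash always pick the partition; growing the table only
 * reveals additional low-order bits of the bucket index.  When size_log2 goes
 * from 9 to 10, bucket 343 splits into buckets 686 and 687, chosen by the
 * newly revealed bit.
 */
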
168 : static void delete_item(dshash_table *hash_table,
169 : dshash_table_item *item);
170 : static void resize(dshash_table *hash_table, size_t new_size_log2);
171 : static inline void ensure_valid_bucket_pointers(dshash_table *hash_table);
172 : static inline dshash_table_item *find_in_bucket(dshash_table *hash_table,
173 : const void *key,
174 : dsa_pointer item_pointer);
175 : static void insert_item_into_bucket(dshash_table *hash_table,
176 : dsa_pointer item_pointer,
177 : dshash_table_item *item,
178 : dsa_pointer *bucket);
179 : static dshash_table_item *insert_into_bucket(dshash_table *hash_table,
180 : const void *key,
181 : dsa_pointer *bucket);
182 : static bool delete_key_from_bucket(dshash_table *hash_table,
183 : const void *key,
184 : dsa_pointer *bucket_head);
185 : static bool delete_item_from_bucket(dshash_table *hash_table,
186 : dshash_table_item *item,
187 : dsa_pointer *bucket_head);
188 : static inline dshash_hash hash_key(dshash_table *hash_table, const void *key);
189 : static inline bool equal_keys(dshash_table *hash_table,
190 : const void *a, const void *b);
191 : static inline void copy_key(dshash_table *hash_table, void *dest,
192 : const void *src);
193 :
194 : #define PARTITION_LOCK(hash_table, i) \
195 : (&(hash_table)->control->partitions[(i)].lock)
196 :
197 : #define ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table) \
198 : Assert(!LWLockAnyHeldByMe(&(hash_table)->control->partitions[0].lock, \
199 : DSHASH_NUM_PARTITIONS, sizeof(dshash_partition)))
200 :
201 : /*
202 : * Create a new hash table backed by the given dynamic shared area, with the
203 : * given parameters. The returned object is allocated in backend-local memory
204 : * using the current MemoryContext. 'arg' will be passed through to the
205 : * compare, hash, and copy functions.
206 : */
207 : dshash_table *
208 2622 : dshash_create(dsa_area *area, const dshash_parameters *params, void *arg)
209 : {
210 : dshash_table *hash_table;
211 : dsa_pointer control;
212 :
213 : /* Allocate the backend-local object representing the hash table. */
214 2622 : hash_table = palloc(sizeof(dshash_table));
215 :
216 : /* Allocate the control object in shared memory. */
217 2622 : control = dsa_allocate(area, sizeof(dshash_table_control));
218 :
219 : /* Set up the local and shared hash table structs. */
220 2622 : hash_table->area = area;
221 2622 : hash_table->params = *params;
222 2622 : hash_table->arg = arg;
223 2622 : hash_table->control = dsa_get_address(area, control);
224 2622 : hash_table->control->handle = control;
225 2622 : hash_table->control->magic = DSHASH_MAGIC;
226 2622 : hash_table->control->lwlock_tranche_id = params->tranche_id;
227 :
228 : /* Set up the array of lock partitions. */
229 : {
230 2622 : dshash_partition *partitions = hash_table->control->partitions;
231 2622 : int tranche_id = hash_table->control->lwlock_tranche_id;
232 : int i;
233 :
234 338238 : for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
235 : {
236 335616 : LWLockInitialize(&partitions[i].lock, tranche_id);
237 335616 : partitions[i].count = 0;
238 : }
239 : }
240 :
241 : /*
242 : * Set up the initial array of buckets. Our initial size is the same as
243 : * the number of partitions.
244 : */
245 2622 : hash_table->control->size_log2 = DSHASH_NUM_PARTITIONS_LOG2;
246 5244 : hash_table->control->buckets =
247 2622 : dsa_allocate_extended(area,
248 : sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS,
249 : DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);
250 2622 : if (!DsaPointerIsValid(hash_table->control->buckets))
251 : {
252 0 : dsa_free(area, control);
253 0 : ereport(ERROR,
254 : (errcode(ERRCODE_OUT_OF_MEMORY),
255 : errmsg("out of memory"),
256 : errdetail("Failed on DSA request of size %zu.",
257 : sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS)));
258 : }
259 5244 : hash_table->buckets = dsa_get_address(area,
260 2622 : hash_table->control->buckets);
261 2622 : hash_table->size_log2 = hash_table->control->size_log2;
262 :
263 2622 : return hash_table;
264 : }
265 :
266 : /*
267 : * Attach to an existing hash table using a handle. The returned object is
268 : * allocated in backend-local memory using the current MemoryContext. 'arg'
269 : * will be passed through to the compare, hash, and copy functions.
270 : */
271 : dshash_table *
272 50600 : dshash_attach(dsa_area *area, const dshash_parameters *params,
273 : dshash_table_handle handle, void *arg)
274 : {
275 : dshash_table *hash_table;
276 : dsa_pointer control;
277 :
278 : /* Allocate the backend-local object representing the hash table. */
279 50600 : hash_table = palloc(sizeof(dshash_table));
280 :
281 : /* Find the control object in shared memory. */
282 50600 : control = handle;
283 :
284 : /* Set up the local hash table struct. */
285 50600 : hash_table->area = area;
286 50600 : hash_table->params = *params;
287 50600 : hash_table->arg = arg;
288 50600 : hash_table->control = dsa_get_address(area, control);
289 : Assert(hash_table->control->magic == DSHASH_MAGIC);
290 :
291 : /*
292 : * These will later be set to the correct values by
293 : * ensure_valid_bucket_pointers(), at which time we'll be holding a
294 : * partition lock for interlocking against concurrent resizing.
295 : */
296 50600 : hash_table->buckets = NULL;
297 50600 : hash_table->size_log2 = 0;
298 :
299 50600 : return hash_table;
300 : }
301 :
302 : /*
303 : * Detach from a hash table. This frees backend-local resources associated
304 : * with the hash table, but the hash table will continue to exist until it is
305 : * either explicitly destroyed (by a backend that is still attached to it), or
306 : * the area that backs it is returned to the operating system.
307 : */
308 : void
309 52656 : dshash_detach(dshash_table *hash_table)
310 : {
311 : ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
312 :
313 : /* The hash table may have been destroyed. Just free local memory. */
314 52656 : pfree(hash_table);
315 52656 : }
316 :
317 : /*
318 : * Destroy a hash table, returning all memory to the area. The caller must be
319 : * certain that no other backend will attempt to access the hash table before
320 : * calling this function. Other backends must explicitly call dshash_detach to
321 : * free up backend-local memory associated with the hash table. The backend
322 : * that calls dshash_destroy must not call dshash_detach.
323 : */
324 : void
325 0 : dshash_destroy(dshash_table *hash_table)
326 : {
327 : size_t size;
328 : size_t i;
329 :
330 : Assert(hash_table->control->magic == DSHASH_MAGIC);
331 0 : ensure_valid_bucket_pointers(hash_table);
332 :
333 : /* Free all the entries. */
334 0 : size = NUM_BUCKETS(hash_table->size_log2);
335 0 : for (i = 0; i < size; ++i)
336 : {
337 0 : dsa_pointer item_pointer = hash_table->buckets[i];
338 :
339 0 : while (DsaPointerIsValid(item_pointer))
340 : {
341 : dshash_table_item *item;
342 : dsa_pointer next_item_pointer;
343 :
344 0 : item = dsa_get_address(hash_table->area, item_pointer);
345 0 : next_item_pointer = item->next;
346 0 : dsa_free(hash_table->area, item_pointer);
347 0 : item_pointer = next_item_pointer;
348 : }
349 : }
350 :
351 : /*
352 : * Vandalize the control block to help catch programming errors where
353 : * other backends access the memory formerly occupied by this hash table.
354 : */
355 0 : hash_table->control->magic = 0;
356 :
357 : /* Free the active table and control object. */
358 0 : dsa_free(hash_table->area, hash_table->control->buckets);
359 0 : dsa_free(hash_table->area, hash_table->control->handle);
360 :
361 0 : pfree(hash_table);
362 0 : }
363 :
364 : /*
365 : * Get a handle that can be used by other processes to attach to this hash
366 : * table.
367 : */
368 : dshash_table_handle
369 2622 : dshash_get_hash_table_handle(dshash_table *hash_table)
370 : {
371 : Assert(hash_table->control->magic == DSHASH_MAGIC);
372 :
373 2622 : return hash_table->control->handle;
374 : }
375 :
376 : /*
377 : * Look up an entry, given a key. Returns a pointer to an entry if one can be
378 : * found with the given key. Returns NULL if the key is not found. If a
379 : * non-NULL value is returned, the entry is locked and must be released by
380 : * calling dshash_release_lock. If an error is raised before
381 : * dshash_release_lock is called, the lock will be released automatically, but
382 : * the caller must take care to ensure that the entry is not left corrupted.
383 : * The lock mode is either shared or exclusive depending on 'exclusive'.
384 : *
385 : * The caller must not hold a lock already.
386 : *
387 : * Note that the lock held is in fact an LWLock, so interrupts will be held on
388 : * return from this function, and not resumed until dshash_release_lock is
389 : * called. It is a very good idea for the caller to release the lock quickly.
390 : */
391 : void *
392 2431186 : dshash_find(dshash_table *hash_table, const void *key, bool exclusive)
393 : {
394 : dshash_hash hash;
395 : size_t partition;
396 : dshash_table_item *item;
397 :
398 2431186 : hash = hash_key(hash_table, key);
399 2431186 : partition = PARTITION_FOR_HASH(hash);
400 :
401 : Assert(hash_table->control->magic == DSHASH_MAGIC);
402 : ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
403 :
404 2431186 : LWLockAcquire(PARTITION_LOCK(hash_table, partition),
405 2431186 : exclusive ? LW_EXCLUSIVE : LW_SHARED);
406 2431186 : ensure_valid_bucket_pointers(hash_table);
407 :
408 : /* Search the active bucket. */
409 2431186 : item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));
410 :
411 2431186 : if (!item)
412 : {
413 : /* Not found. */
414 453098 : LWLockRelease(PARTITION_LOCK(hash_table, partition));
415 453098 : return NULL;
416 : }
417 : else
418 : {
419 : /* The caller will free the lock by calling dshash_release_lock. */
420 1978088 : return ENTRY_FROM_ITEM(item);
421 : }
422 : }
423 :
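/*
 * Editor's illustrative sketch, not part of dshash.c: the find-modify-release
 * pattern described above.  "ExampleEntry" and its "counter" field are the
 * hypothetical types from the sketch near the top of this listing.
 */
static void
example_bump_counter(dshash_table *table, Oid key)
{
	ExampleEntry *entry;

	/* Exclusive lock, since we intend to modify the entry. */
	entry = dshash_find(table, &key, true);
	if (entry != NULL)
	{
		entry->counter++;		/* protected by the partition lock */
		dshash_release_lock(table, entry);	/* release promptly */
	}
}
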
424 : /*
425 : * Returns a pointer to an exclusively locked item which must be released with
426 : * dshash_release_lock. If the key is found in the hash table, 'found' is set
427 : * to true and a pointer to the existing entry is returned. If the key is not
428 : * found, 'found' is set to false, and a pointer to a newly created entry is
429 : * returned.
430 : *
431 : * The notes above dshash_find() regarding locking and error handling apply
432 : * here as well.
433 : */
434 : void *
435 647194 : dshash_find_or_insert(dshash_table *hash_table,
436 : const void *key,
437 : bool *found)
438 : {
439 : dshash_hash hash;
440 : size_t partition_index;
441 : dshash_partition *partition;
442 : dshash_table_item *item;
443 :
444 647194 : hash = hash_key(hash_table, key);
445 647194 : partition_index = PARTITION_FOR_HASH(hash);
446 647194 : partition = &hash_table->control->partitions[partition_index];
447 :
448 : Assert(hash_table->control->magic == DSHASH_MAGIC);
449 : ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
450 :
451 653462 : restart:
452 653462 : LWLockAcquire(PARTITION_LOCK(hash_table, partition_index),
453 : LW_EXCLUSIVE);
454 653462 : ensure_valid_bucket_pointers(hash_table);
455 :
456 : /* Search the active bucket. */
457 653462 : item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));
458 :
459 653462 : if (item)
460 204 : *found = true;
461 : else
462 : {
463 653258 : *found = false;
464 :
465 : /* Check if we are getting too full. */
466 653258 : if (partition->count > MAX_COUNT_PER_PARTITION(hash_table))
467 : {
468 : /*
469 : * The load factor (= keys / buckets) for all buckets protected by
470 : * this partition is > 0.75. Presumably the same applies
471 : * generally across the whole hash table (though we don't attempt
472 : * to track that directly to avoid contention on some kind of
473 : * central counter; we just assume that this partition is
474 : * representative). This is a good time to resize.
475 : *
476 : * Give up our existing lock first, because resizing needs to
477 : * reacquire all the locks in the right order to avoid deadlocks.
478 : */
479 6268 : LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
480 6268 : resize(hash_table, hash_table->size_log2 + 1);
481 :
482 6268 : goto restart;
483 : }
484 :
485 : /* Finally we can try to insert the new item. */
486 646990 : item = insert_into_bucket(hash_table, key,
487 646990 : &BUCKET_FOR_HASH(hash_table, hash));
488 646990 : item->hash = hash;
489 : /* Adjust per-lock-partition counter for load factor knowledge. */
490 646990 : ++partition->count;
491 : }
492 :
493 : /* The caller must release the lock with dshash_release_lock. */
494 647194 : return ENTRY_FROM_ITEM(item);
495 : }
496 :
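/*
 * Editor's illustrative sketch, not part of dshash.c: using
 * dshash_find_or_insert() as an upsert.  For a newly created entry only the
 * key has been copied in; the rest of the entry is uninitialized and must be
 * filled in by the caller before the lock is released.  "ExampleEntry" is the
 * hypothetical type from the sketch near the top of this listing.
 */
static void
example_upsert(dshash_table *table, Oid key)
{
	ExampleEntry *entry;
	bool		found;

	entry = dshash_find_or_insert(table, &key, &found);
	if (!found)
		entry->counter = 0;		/* initialize everything except the key */
	entry->counter++;
	dshash_release_lock(table, entry);
}
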
497 : /*
498 : * Remove an entry by key. Returns true if the key was found and the
499 : * corresponding entry was removed.
500 : *
501 : * To delete an entry that you already have a pointer to, see
502 : * dshash_delete_entry.
503 : */
504 : bool
505 446 : dshash_delete_key(dshash_table *hash_table, const void *key)
506 : {
507 : dshash_hash hash;
508 : size_t partition;
509 : bool found;
510 :
511 : Assert(hash_table->control->magic == DSHASH_MAGIC);
512 : ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
513 :
514 446 : hash = hash_key(hash_table, key);
515 446 : partition = PARTITION_FOR_HASH(hash);
516 :
517 446 : LWLockAcquire(PARTITION_LOCK(hash_table, partition), LW_EXCLUSIVE);
518 446 : ensure_valid_bucket_pointers(hash_table);
519 :
520 446 : if (delete_key_from_bucket(hash_table, key,
521 446 : &BUCKET_FOR_HASH(hash_table, hash)))
522 : {
523 : Assert(hash_table->control->partitions[partition].count > 0);
524 276 : found = true;
525 276 : --hash_table->control->partitions[partition].count;
526 : }
527 : else
528 170 : found = false;
529 :
530 446 : LWLockRelease(PARTITION_LOCK(hash_table, partition));
531 :
532 446 : return found;
533 : }
534 :
535 : /*
536 : * Remove an entry. The entry must already be exclusively locked, and must
537 : * have been obtained by dshash_find or dshash_find_or_insert. Note that this
538 : * function releases the lock just like dshash_release_lock.
539 : *
540 : * To delete an entry by key, see dshash_delete_key.
541 : */
542 : void
543 104060 : dshash_delete_entry(dshash_table *hash_table, void *entry)
544 : {
545 104060 : dshash_table_item *item = ITEM_FROM_ENTRY(entry);
546 104060 : size_t partition = PARTITION_FOR_HASH(item->hash);
547 :
548 : Assert(hash_table->control->magic == DSHASH_MAGIC);
549 : Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition),
550 : LW_EXCLUSIVE));
551 :
552 104060 : delete_item(hash_table, item);
553 104060 : LWLockRelease(PARTITION_LOCK(hash_table, partition));
554 104060 : }
555 :
556 : /*
557 : * Unlock an entry which was locked by dshash_find or dshash_find_or_insert.
558 : */
559 : void
560 2521222 : dshash_release_lock(dshash_table *hash_table, void *entry)
561 : {
562 2521222 : dshash_table_item *item = ITEM_FROM_ENTRY(entry);
563 2521222 : size_t partition_index = PARTITION_FOR_HASH(item->hash);
564 :
565 : Assert(hash_table->control->magic == DSHASH_MAGIC);
566 :
567 2521222 : LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
568 2521222 : }
569 :
570 : /*
571 : * A compare function that forwards to memcmp.
572 : */
573 : int
574 808 : dshash_memcmp(const void *a, const void *b, size_t size, void *arg)
575 : {
576 808 : return memcmp(a, b, size);
577 : }
578 :
579 : /*
580 : * A hash function that forwards to tag_hash.
581 : */
582 : dshash_hash
583 1684 : dshash_memhash(const void *v, size_t size, void *arg)
584 : {
585 1684 : return tag_hash(v, size);
586 : }
587 :
588 : /*
589 : * A copy function that forwards to memcpy.
590 : */
591 : void
592 646964 : dshash_memcpy(void *dest, const void *src, size_t size, void *arg)
593 : {
594 646964 : (void) memcpy(dest, src, size);
595 646964 : }
596 :
597 : /*
598 : * A compare function that forwards to strcmp.
599 : */
600 : int
601 74 : dshash_strcmp(const void *a, const void *b, size_t size, void *arg)
602 : {
603 : Assert(strlen((const char *) a) < size);
604 : Assert(strlen((const char *) b) < size);
605 :
606 74 : return strcmp((const char *) a, (const char *) b);
607 : }
608 :
609 : /*
610 : * A hash function that forwards to string_hash.
611 : */
612 : dshash_hash
613 100 : dshash_strhash(const void *v, size_t size, void *arg)
614 : {
615 : Assert(strlen((const char *) v) < size);
616 :
617 100 : return string_hash((const char *) v, size);
618 : }
619 :
620 : /*
621 : * A copy function that forwards to strcpy.
622 : */
623 : void
624 26 : dshash_strcpy(void *dest, const void *src, size_t size, void *arg)
625 : {
626 : Assert(strlen((const char *) src) < size);
627 :
628 26 : (void) strcpy((char *) dest, (const char *) src);
629 26 : }
630 :
631 : /*
632 : * Sequentially scan through dshash table and return all the elements one by
633 : * one, return NULL when all elements have been returned.
634 : *
635 : * dshash_seq_term needs to be called when a scan finished. The caller may
636 : * delete returned elements midst of a scan by using dshash_delete_current()
637 : * if exclusive = true.
638 : */
639 : void
640 1940 : dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
641 : bool exclusive)
642 : {
643 1940 : status->hash_table = hash_table;
644 1940 : status->curbucket = 0;
645 1940 : status->nbuckets = 0;
646 1940 : status->curitem = NULL;
647 1940 : status->pnextitem = InvalidDsaPointer;
648 1940 : status->curpartition = -1;
649 1940 : status->exclusive = exclusive;
650 1940 : }
651 :
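/*
 * Editor's illustrative sketch, not part of dshash.c: a complete sequential
 * scan.  With exclusive = true, dshash_delete_current() could additionally be
 * used to delete the current entry inside the loop.  "ExampleEntry" is the
 * hypothetical type from the sketch near the top of this listing.
 */
static int64
example_sum_counters(dshash_table *table)
{
	dshash_seq_status status;
	ExampleEntry *entry;
	int64		total = 0;

	dshash_seq_init(&status, table, false);	/* shared locks; read only */
	while ((entry = dshash_seq_next(&status)) != NULL)
		total += entry->counter;	/* entry stays locked until the next call */
	dshash_seq_term(&status);

	return total;
}
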
652 : /*
653 : * Returns the next element.
654 : *
655 : * Returned elements are locked and the caller must not release the lock. It is
656 : * released by future calls to dshash_seq_next() or dshash_seq_term().
657 : */
658 : void *
659 501830 : dshash_seq_next(dshash_seq_status *status)
660 : {
661 : dsa_pointer next_item_pointer;
662 :
663 : /*
664 : * Not yet holding any partition locks. Need to determine the size of the
665 : * hash table, as it could have been resized since we last looked. Since we
666 : * iterate in partition order, we can start by unconditionally locking
667 : * partition 0.
668 : *
669 : * Once we hold the lock, no resizing can happen until the scan ends. So
670 : * we don't need to repeatedly call ensure_valid_bucket_pointers().
671 : */
672 501830 : if (status->curpartition == -1)
673 : {
674 : Assert(status->curbucket == 0);
675 : ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(status->hash_table);
676 :
677 1940 : status->curpartition = 0;
678 :
679 1940 : LWLockAcquire(PARTITION_LOCK(status->hash_table,
680 : status->curpartition),
681 1940 : status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
682 :
683 1940 : ensure_valid_bucket_pointers(status->hash_table);
684 :
685 1940 : status->nbuckets =
686 1940 : NUM_BUCKETS(status->hash_table->control->size_log2);
687 1940 : next_item_pointer = status->hash_table->buckets[status->curbucket];
688 : }
689 : else
690 499890 : next_item_pointer = status->pnextitem;
691 :
692 : Assert(LWLockHeldByMeInMode(PARTITION_LOCK(status->hash_table,
693 : status->curpartition),
694 : status->exclusive ? LW_EXCLUSIVE : LW_SHARED));
695 :
696 : /* Move to the next bucket if we finished the current bucket */
697 3190194 : while (!DsaPointerIsValid(next_item_pointer))
698 : {
699 : int next_partition;
700 :
701 2690304 : if (++status->curbucket >= status->nbuckets)
702 : {
703 : /* all buckets have been scanned. finish. */
704 1940 : return NULL;
705 : }
706 :
707 : /* Check if we need to move to the next partition */
708 2688364 : next_partition =
709 2688364 : PARTITION_FOR_BUCKET_INDEX(status->curbucket,
710 : status->hash_table->size_log2);
711 :
712 2688364 : if (status->curpartition != next_partition)
713 : {
714 : /*
715 : * Move to the next partition. Lock the next partition, then
716 : * release the current one (not the other way around), so that
717 : * a concurrent resize cannot slip in. Deadlock is avoided by
718 : * taking the locks in the same order as resize() does.
719 : */
720 246380 : LWLockAcquire(PARTITION_LOCK(status->hash_table,
721 : next_partition),
722 246380 : status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
723 246380 : LWLockRelease(PARTITION_LOCK(status->hash_table,
724 : status->curpartition));
725 246380 : status->curpartition = next_partition;
726 : }
727 :
728 2688364 : next_item_pointer = status->hash_table->buckets[status->curbucket];
729 : }
730 :
731 499890 : status->curitem =
732 499890 : dsa_get_address(status->hash_table->area, next_item_pointer);
733 :
734 : /*
735 : * The caller may delete the item. Store the next item in case of
736 : * deletion.
737 : */
738 499890 : status->pnextitem = status->curitem->next;
739 :
740 499890 : return ENTRY_FROM_ITEM(status->curitem);
741 : }
742 :
743 : /*
744 : * Terminates the seqscan and releases all locks.
745 : *
746 : * Needs to be called after finishing a scan, or when abandoning one early.
747 : */
748 : void
749 1940 : dshash_seq_term(dshash_seq_status *status)
750 : {
751 1940 : if (status->curpartition >= 0)
752 1940 : LWLockRelease(PARTITION_LOCK(status->hash_table, status->curpartition));
753 1940 : }
754 :
755 : /*
756 : * Remove the current entry of the seq scan.
757 : */
758 : void
759 8794 : dshash_delete_current(dshash_seq_status *status)
760 : {
761 8794 : dshash_table *hash_table = status->hash_table;
762 8794 : dshash_table_item *item = status->curitem;
763 : size_t partition PG_USED_FOR_ASSERTS_ONLY;
764 :
765 8794 : partition = PARTITION_FOR_HASH(item->hash);
766 :
767 : Assert(status->exclusive);
768 : Assert(hash_table->control->magic == DSHASH_MAGIC);
769 : Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition),
770 : LW_EXCLUSIVE));
771 :
772 8794 : delete_item(hash_table, item);
773 8794 : }
774 :
775 : /*
776 : * Print debugging information about the internal state of the hash table to
777 : * stderr. The caller must hold no partition locks.
778 : */
779 : void
780 0 : dshash_dump(dshash_table *hash_table)
781 : {
782 : size_t i;
783 : size_t j;
784 :
785 : Assert(hash_table->control->magic == DSHASH_MAGIC);
786 : ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
787 :
788 0 : for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
789 : {
790 : Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
791 0 : LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_SHARED);
792 : }
793 :
794 0 : ensure_valid_bucket_pointers(hash_table);
795 :
796 0 : fprintf(stderr,
797 0 : "hash table size = %zu\n", (size_t) 1 << hash_table->size_log2);
798 0 : for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
799 : {
800 0 : dshash_partition *partition = &hash_table->control->partitions[i];
801 0 : size_t begin = BUCKET_INDEX_FOR_PARTITION(i, hash_table->size_log2);
802 0 : size_t end = BUCKET_INDEX_FOR_PARTITION(i + 1, hash_table->size_log2);
803 :
804 0 : fprintf(stderr, " partition %zu\n", i);
805 0 : fprintf(stderr,
806 : " active buckets (key count = %zu)\n", partition->count);
807 :
808 0 : for (j = begin; j < end; ++j)
809 : {
810 0 : size_t count = 0;
811 0 : dsa_pointer bucket = hash_table->buckets[j];
812 :
813 0 : while (DsaPointerIsValid(bucket))
814 : {
815 : dshash_table_item *item;
816 :
817 0 : item = dsa_get_address(hash_table->area, bucket);
818 :
819 0 : bucket = item->next;
820 0 : ++count;
821 : }
822 0 : fprintf(stderr, " bucket %zu (key count = %zu)\n", j, count);
823 : }
824 : }
825 :
826 0 : for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
827 0 : LWLockRelease(PARTITION_LOCK(hash_table, i));
828 0 : }
829 :
830 : /*
831 : * Delete a locked item to which we have a pointer.
832 : */
833 : static void
834 112854 : delete_item(dshash_table *hash_table, dshash_table_item *item)
835 : {
836 112854 : size_t hash = item->hash;
837 112854 : size_t partition = PARTITION_FOR_HASH(hash);
838 :
839 : Assert(LWLockHeldByMe(PARTITION_LOCK(hash_table, partition)));
840 :
841 112854 : if (delete_item_from_bucket(hash_table, item,
842 112854 : &BUCKET_FOR_HASH(hash_table, hash)))
843 : {
844 : Assert(hash_table->control->partitions[partition].count > 0);
845 112854 : --hash_table->control->partitions[partition].count;
846 : }
847 : else
848 : {
849 : Assert(false);
850 : }
851 112854 : }
852 :
853 : /*
854 : * Grow the hash table if necessary to the requested number of buckets. The
855 : * requested size must be double some previously observed size.
856 : *
857 : * Must be called without any partition lock held.
858 : */
859 : static void
860 6268 : resize(dshash_table *hash_table, size_t new_size_log2)
861 : {
862 : dsa_pointer old_buckets;
863 : dsa_pointer new_buckets_shared;
864 : dsa_pointer *new_buckets;
865 : size_t size;
866 6268 : size_t new_size = ((size_t) 1) << new_size_log2;
867 : size_t i;
868 :
869 : /*
870 : * Acquire the locks for all lock partitions. This is expensive, but we
871 : * shouldn't have to do it many times.
872 : */
873 808572 : for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
874 : {
875 : Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
876 :
877 802304 : LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_EXCLUSIVE);
878 802304 : if (i == 0 && hash_table->control->size_log2 >= new_size_log2)
879 : {
880 : /*
881 : * Another backend has already increased the size; we can avoid
882 : * obtaining all the locks and return early.
883 : */
884 0 : LWLockRelease(PARTITION_LOCK(hash_table, 0));
885 0 : return;
886 : }
887 : }
888 :
889 : Assert(new_size_log2 == hash_table->control->size_log2 + 1);
890 :
891 : /* Allocate the space for the new table. */
892 : new_buckets_shared =
893 6268 : dsa_allocate_extended(hash_table->area,
894 : sizeof(dsa_pointer) * new_size,
895 : DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
896 6268 : new_buckets = dsa_get_address(hash_table->area, new_buckets_shared);
897 :
898 : /*
899 : * We've allocated the new bucket array; all that remains to do now is to
900 : * reinsert all items, which amounts to adjusting all the pointers.
901 : */
902 6268 : size = ((size_t) 1) << hash_table->control->size_log2;
903 2885756 : for (i = 0; i < size; ++i)
904 : {
905 2879488 : dsa_pointer item_pointer = hash_table->buckets[i];
906 :
907 3100214 : while (DsaPointerIsValid(item_pointer))
908 : {
909 : dshash_table_item *item;
910 : dsa_pointer next_item_pointer;
911 :
912 220726 : item = dsa_get_address(hash_table->area, item_pointer);
913 220726 : next_item_pointer = item->next;
914 220726 : insert_item_into_bucket(hash_table, item_pointer, item,
915 220726 : &new_buckets[BUCKET_INDEX_FOR_HASH_AND_SIZE(item->hash,
916 : new_size_log2)]);
917 220726 : item_pointer = next_item_pointer;
918 : }
919 : }
920 :
921 : /* Swap the hash table into place and free the old one. */
922 6268 : old_buckets = hash_table->control->buckets;
923 6268 : hash_table->control->buckets = new_buckets_shared;
924 6268 : hash_table->control->size_log2 = new_size_log2;
925 6268 : hash_table->buckets = new_buckets;
926 6268 : dsa_free(hash_table->area, old_buckets);
927 :
928 : /* Release all the locks. */
929 808572 : for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
930 802304 : LWLockRelease(PARTITION_LOCK(hash_table, i));
931 : }
932 :
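/*
 * Editor's note (illustrative, not part of dshash.c): because a bucket index
 * is simply the top size_log2 bits of the hash, doubling the table reveals one
 * more hash bit, so every item in old bucket i lands in new bucket 2*i or
 * 2*i + 1.  The loop above therefore redistributes items without rehashing:
 * for example, items in old bucket 5 move to new bucket 10 or 11, depending on
 * the newly revealed bit.
 */
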
933 : /*
934 : * Make sure that our backend-local bucket pointers are up to date. The
935 : * caller must have locked one lock partition, which prevents resize() from
936 : * running concurrently.
937 : */
938 : static inline void
939 3087034 : ensure_valid_bucket_pointers(dshash_table *hash_table)
940 : {
941 3087034 : if (hash_table->size_log2 != hash_table->control->size_log2)
942 : {
943 103900 : hash_table->buckets = dsa_get_address(hash_table->area,
944 51950 : hash_table->control->buckets);
945 51950 : hash_table->size_log2 = hash_table->control->size_log2;
946 : }
947 3087034 : }
948 :
949 : /*
950 : * Scan a locked bucket for a match, using the provided compare function.
951 : */
952 : static inline dshash_table_item *
953 3084648 : find_in_bucket(dshash_table *hash_table, const void *key,
954 : dsa_pointer item_pointer)
955 : {
956 3515956 : while (DsaPointerIsValid(item_pointer))
957 : {
958 : dshash_table_item *item;
959 :
960 2409600 : item = dsa_get_address(hash_table->area, item_pointer);
961 2409600 : if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
962 1978292 : return item;
963 431308 : item_pointer = item->next;
964 : }
965 1106356 : return NULL;
966 : }
967 :
968 : /*
969 : * Insert an already-allocated item into a bucket.
970 : */
971 : static void
972 867716 : insert_item_into_bucket(dshash_table *hash_table,
973 : dsa_pointer item_pointer,
974 : dshash_table_item *item,
975 : dsa_pointer *bucket)
976 : {
977 : Assert(item == dsa_get_address(hash_table->area, item_pointer));
978 :
979 867716 : item->next = *bucket;
980 867716 : *bucket = item_pointer;
981 867716 : }
982 :
983 : /*
984 : * Allocate space for an entry with the given key and insert it into the
985 : * provided bucket.
986 : */
987 : static dshash_table_item *
988 646990 : insert_into_bucket(dshash_table *hash_table,
989 : const void *key,
990 : dsa_pointer *bucket)
991 : {
992 : dsa_pointer item_pointer;
993 : dshash_table_item *item;
994 :
995 646990 : item_pointer = dsa_allocate(hash_table->area,
996 : hash_table->params.entry_size +
997 : MAXALIGN(sizeof(dshash_table_item)));
998 646990 : item = dsa_get_address(hash_table->area, item_pointer);
999 646990 : copy_key(hash_table, ENTRY_FROM_ITEM(item), key);
1000 646990 : insert_item_into_bucket(hash_table, item_pointer, item, bucket);
1001 646990 : return item;
1002 : }
1003 :
1004 : /*
1005 : * Search a bucket for a matching key and delete it.
1006 : */
1007 : static bool
1008 446 : delete_key_from_bucket(dshash_table *hash_table,
1009 : const void *key,
1010 : dsa_pointer *bucket_head)
1011 : {
1012 446 : while (DsaPointerIsValid(*bucket_head))
1013 : {
1014 : dshash_table_item *item;
1015 :
1016 276 : item = dsa_get_address(hash_table->area, *bucket_head);
1017 :
1018 276 : if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
1019 : {
1020 : dsa_pointer next;
1021 :
1022 276 : next = item->next;
1023 276 : dsa_free(hash_table->area, *bucket_head);
1024 276 : *bucket_head = next;
1025 :
1026 276 : return true;
1027 : }
1028 0 : bucket_head = &item->next;
1029 : }
1030 170 : return false;
1031 : }
1032 :
1033 : /*
1034 : * Delete the specified item from the bucket.
1035 : */
1036 : static bool
1037 112854 : delete_item_from_bucket(dshash_table *hash_table,
1038 : dshash_table_item *item,
1039 : dsa_pointer *bucket_head)
1040 : {
1041 113762 : while (DsaPointerIsValid(*bucket_head))
1042 : {
1043 : dshash_table_item *bucket_item;
1044 :
1045 113762 : bucket_item = dsa_get_address(hash_table->area, *bucket_head);
1046 :
1047 113762 : if (bucket_item == item)
1048 : {
1049 : dsa_pointer next;
1050 :
1051 112854 : next = item->next;
1052 112854 : dsa_free(hash_table->area, *bucket_head);
1053 112854 : *bucket_head = next;
1054 112854 : return true;
1055 : }
1056 908 : bucket_head = &bucket_item->next;
1057 : }
1058 0 : return false;
1059 : }
1060 :
1061 : /*
1062 : * Compute the hash value for a key.
1063 : */
1064 : static inline dshash_hash
1065 3078826 : hash_key(dshash_table *hash_table, const void *key)
1066 : {
1067 3078826 : return hash_table->params.hash_function(key,
1068 : hash_table->params.key_size,
1069 : hash_table->arg);
1070 : }
1071 :
1072 : /*
1073 : * Check whether two keys compare equal.
1074 : */
1075 : static inline bool
1076 2409876 : equal_keys(dshash_table *hash_table, const void *a, const void *b)
1077 : {
1078 2409876 : return hash_table->params.compare_function(a, b,
1079 : hash_table->params.key_size,
1080 2409876 : hash_table->arg) == 0;
1081 : }
1082 :
1083 : /*
1084 : * Copy a key.
1085 : */
1086 : static inline void
1087 646990 : copy_key(dshash_table *hash_table, void *dest, const void *src)
1088 : {
1089 646990 : hash_table->params.copy_function(dest, src,
1090 : hash_table->params.key_size,
1091 : hash_table->arg);
1092 646990 : }
|