Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * dshash.c
4 : * Concurrent hash tables backed by dynamic shared memory areas.
5 : *
6 : * This is an open hashing hash table, with a linked list at each table
7 : * entry. It supports dynamic resizing, as required to prevent the linked
8 : * lists from growing too long on average. Currently, only growing is
9 : * supported: the hash table never becomes smaller.
10 : *
11 : * To deal with concurrency, it has a fixed size set of partitions, each of
12 : * which is independently locked. Each bucket maps to a partition; so insert,
13 : * find and iterate operations normally only acquire one lock. Therefore,
14 : * good concurrency is achieved whenever such operations don't collide at the
15 : * lock partition level. However, when a resize operation begins, all
16 : * partition locks must be acquired simultaneously for a brief period. This
17 : * is only expected to happen a small number of times until a stable size is
18 : * found, since growth is geometric.
19 : *
 * Sequential scans are supported (see dshash_seq_init and friends); future
 * versions may support incremental resizing.  For now the implementation is
 * minimalist.
22 : *
23 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
24 : * Portions Copyright (c) 1994, Regents of the University of California
25 : *
26 : * IDENTIFICATION
27 : * src/backend/lib/dshash.c
28 : *
29 : *-------------------------------------------------------------------------
30 : */
31 :
32 : #include "postgres.h"
33 :
34 : #include <limits.h>
35 :
36 : #include "common/hashfn.h"
37 : #include "lib/dshash.h"
38 : #include "storage/lwlock.h"
39 : #include "utils/dsa.h"
40 :
/*
 * An item in the hash table.  This wraps the user's entry object in an
 * envelope that holds the linkage for the bucket's chain and the entry's
 * precomputed hash value.
 */
struct dshash_table_item
{
	/* The next item in the same bucket, invalid at the end of the chain. */
	dsa_pointer next;
	/* The hashed key, to avoid having to recompute it. */
	dshash_hash hash;
	/* The user's entry object follows here.  See ENTRY_FROM_ITEM(item). */
};
54 :
/*
 * The number of partitions for locking purposes.  This is set to match
 * NUM_BUFFER_PARTITIONS for now, on the basis that whatever's good enough for
 * the buffer pool must be good enough for any other purpose.  This could
 * become a runtime parameter in future.
 */
#define DSHASH_NUM_PARTITIONS_LOG2 7
#define DSHASH_NUM_PARTITIONS (1 << DSHASH_NUM_PARTITIONS_LOG2)

/* A magic value used to identify our hash tables. */
#define DSHASH_MAGIC 0x75ff6a20

/*
 * Tracking information for each lock partition.  Initially, each partition
 * corresponds to one bucket, but each time the hash table grows, the buckets
 * covered by each partition split so the number of buckets covered doubles.
 *
 * We might want to add padding here so that each partition is on a different
 * cache line, but doing so would bloat this structure considerably.
 */
typedef struct dshash_partition
{
	LWLock		lock;			/* Protects all buckets in this partition. */
	size_t		count;			/* # of items in this partition's buckets;
								 * consulted to decide when to grow */
} dshash_partition;
80 :
/*
 * The head object for a hash table.  This will be stored in dynamic shared
 * memory.
 */
typedef struct dshash_table_control
{
	/* The handle is the dsa_pointer to this control object itself. */
	dshash_table_handle handle;
	uint32		magic;			/* Holds DSHASH_MAGIC while the table lives. */
	dshash_partition partitions[DSHASH_NUM_PARTITIONS];
	int			lwlock_tranche_id;

	/*
	 * The following members are written to only when ALL partition locks are
	 * held.  They can be read when any one partition lock is held.
	 */

	/* Number of buckets expressed as power of 2 (8 = 256 buckets). */
	size_t		size_log2;		/* log2(number of buckets) */
	dsa_pointer buckets;		/* current bucket array */
} dshash_table_control;
101 :
/*
 * Per-backend state for a dynamic hash table.
 */
struct dshash_table
{
	dsa_area   *area;			/* Backing dynamic shared memory area. */
	dshash_parameters params;	/* Parameters. */
	void	   *arg;			/* User-supplied data pointer. */
	dshash_table_control *control;	/* Control object in DSM. */

	/*
	 * Backend-local cache of the bucket array address and its size.  May be
	 * stale after another backend resizes; refreshed while holding a
	 * partition lock by ensure_valid_bucket_pointers().
	 */
	dsa_pointer *buckets;		/* Current bucket pointers in DSM. */
	size_t		size_log2;		/* log2(number of buckets) */
};
114 :
/* Given a pointer to an item, find the entry (user data) it holds. */
#define ENTRY_FROM_ITEM(item) \
	((char *)(item) + MAXALIGN(sizeof(dshash_table_item)))

/* Given a pointer to an entry, find the item that holds it. */
#define ITEM_FROM_ENTRY(entry)											\
	((dshash_table_item *)((char *)(entry) -							\
						   MAXALIGN(sizeof(dshash_table_item))))

/*
 * How many resize operations (bucket splits) have there been?  Assumes
 * size_log2 >= DSHASH_NUM_PARTITIONS_LOG2, the initial size.
 */
#define NUM_SPLITS(size_log2)					\
	(size_log2 - DSHASH_NUM_PARTITIONS_LOG2)

/* How many buckets are there in a given size? */
#define NUM_BUCKETS(size_log2)		\
	(((size_t) 1) << (size_log2))

/* How many buckets are there in each partition at a given size? */
#define BUCKETS_PER_PARTITION(size_log2)		\
	(((size_t) 1) << NUM_SPLITS(size_log2))

/* Max entries before we need to grow.  Half + quarter = 75% load factor. */
#define MAX_COUNT_PER_PARTITION(hash_table)				\
	(BUCKETS_PER_PARTITION(hash_table->size_log2) / 2 + \
	 BUCKETS_PER_PARTITION(hash_table->size_log2) / 4)

/* Choose partition based on the highest order bits of the hash. */
#define PARTITION_FOR_HASH(hash)										\
	(hash >> ((sizeof(dshash_hash) * CHAR_BIT) - DSHASH_NUM_PARTITIONS_LOG2))

/*
 * Find the bucket index for a given hash and table size.  Each time the table
 * doubles in size, the appropriate bucket for a given hash value doubles and
 * possibly adds one, depending on the newly revealed bit, so that all buckets
 * are split.
 */
#define BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, size_log2)		\
	(hash >> ((sizeof(dshash_hash) * CHAR_BIT) - (size_log2)))

/* The index of the first bucket in a given partition. */
#define BUCKET_INDEX_FOR_PARTITION(partition, size_log2)	\
	((partition) << NUM_SPLITS(size_log2))

/* Choose partition based on bucket index. */
#define PARTITION_FOR_BUCKET_INDEX(bucket_idx, size_log2)	\
	((bucket_idx) >> NUM_SPLITS(size_log2))

/*
 * The head of the active bucket for a given hash value (lvalue).  Uses the
 * backend-local cached bucket array, so ensure_valid_bucket_pointers() must
 * have been called under the relevant partition lock.
 */
#define BUCKET_FOR_HASH(hash_table, hash)				\
	(hash_table->buckets[								\
		BUCKET_INDEX_FOR_HASH_AND_SIZE(hash,			\
									   hash_table->size_log2)])
167 :
168 : static void delete_item(dshash_table *hash_table,
169 : dshash_table_item *item);
170 : static bool resize(dshash_table *hash_table, size_t new_size_log2,
171 : int flags);
172 : static inline void ensure_valid_bucket_pointers(dshash_table *hash_table);
173 : static inline dshash_table_item *find_in_bucket(dshash_table *hash_table,
174 : const void *key,
175 : dsa_pointer item_pointer);
176 : static void insert_item_into_bucket(dshash_table *hash_table,
177 : dsa_pointer item_pointer,
178 : dshash_table_item *item,
179 : dsa_pointer *bucket);
180 : static dshash_table_item *insert_into_bucket(dshash_table *hash_table,
181 : const void *key,
182 : dsa_pointer *bucket,
183 : int flags);
184 : static bool delete_key_from_bucket(dshash_table *hash_table,
185 : const void *key,
186 : dsa_pointer *bucket_head);
187 : static bool delete_item_from_bucket(dshash_table *hash_table,
188 : dshash_table_item *item,
189 : dsa_pointer *bucket_head);
190 : static inline dshash_hash hash_key(dshash_table *hash_table, const void *key);
191 : static inline bool equal_keys(dshash_table *hash_table,
192 : const void *a, const void *b);
193 : static inline void copy_key(dshash_table *hash_table, void *dest,
194 : const void *src);
195 :
/* Address of a given partition's LWLock. */
#define PARTITION_LOCK(hash_table, i)			\
	(&(hash_table)->control->partitions[(i)].lock)

/* Assert that the calling backend holds none of the partition locks. */
#define ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table) \
	Assert(!LWLockAnyHeldByMe(&(hash_table)->control->partitions[0].lock, \
							  DSHASH_NUM_PARTITIONS, sizeof(dshash_partition)))
202 :
/*
 * Create a new hash table backed by the given dynamic shared area, with the
 * given parameters.  The returned object is allocated in backend-local memory
 * using the current MemoryContext.  'arg' will be passed through to the
 * compare, hash, and copy functions.
 *
 * Raises ERROR if shared memory for the initial bucket array cannot be
 * allocated (after freeing the control object so it isn't leaked).
 */
dshash_table *
dshash_create(dsa_area *area, const dshash_parameters *params, void *arg)
{
	dshash_table *hash_table;
	dsa_pointer control;

	/* Allocate the backend-local object representing the hash table. */
	hash_table = palloc_object(dshash_table);

	/* Allocate the control object in shared memory. */
	control = dsa_allocate(area, sizeof(dshash_table_control));

	/* Set up the local and shared hash table structs. */
	hash_table->area = area;
	hash_table->params = *params;
	hash_table->arg = arg;
	hash_table->control = dsa_get_address(area, control);
	/* The handle handed to other backends is simply this dsa_pointer. */
	hash_table->control->handle = control;
	hash_table->control->magic = DSHASH_MAGIC;
	hash_table->control->lwlock_tranche_id = params->tranche_id;

	/* Set up the array of lock partitions. */
	{
		dshash_partition *partitions = hash_table->control->partitions;
		int			tranche_id = hash_table->control->lwlock_tranche_id;
		int			i;

		for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
		{
			LWLockInitialize(&partitions[i].lock, tranche_id);
			partitions[i].count = 0;
		}
	}

	/*
	 * Set up the initial array of buckets.  Our initial size is the same as
	 * the number of partitions.  DSA_ALLOC_ZERO leaves every bucket head as
	 * an invalid dsa_pointer, i.e. an empty chain.
	 */
	hash_table->control->size_log2 = DSHASH_NUM_PARTITIONS_LOG2;
	hash_table->control->buckets =
		dsa_allocate_extended(area,
							  sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS,
							  DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);
	if (!DsaPointerIsValid(hash_table->control->buckets))
	{
		/* Don't leak the control object before reporting the failure. */
		dsa_free(area, control);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory"),
				 errdetail("Failed on DSA request of size %zu.",
						   sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS)));
	}
	hash_table->buckets = dsa_get_address(area,
										  hash_table->control->buckets);
	hash_table->size_log2 = hash_table->control->size_log2;

	return hash_table;
}
267 :
/*
 * Attach to an existing hash table using a handle.  The returned object is
 * allocated in backend-local memory using the current MemoryContext.  'arg'
 * will be passed through to the compare and hash functions.
 */
dshash_table *
dshash_attach(dsa_area *area, const dshash_parameters *params,
			  dshash_table_handle handle, void *arg)
{
	dshash_table *hash_table;
	dsa_pointer control;

	/* Allocate the backend-local object representing the hash table. */
	hash_table = palloc_object(dshash_table);

	/* Find the control object in shared memory; the handle is its address. */
	control = handle;

	/* Set up the local hash table struct. */
	hash_table->area = area;
	hash_table->params = *params;
	hash_table->arg = arg;
	hash_table->control = dsa_get_address(area, control);
	Assert(hash_table->control->magic == DSHASH_MAGIC);

	/*
	 * These will later be set to the correct values by
	 * ensure_valid_bucket_pointers(), at which time we'll be holding a
	 * partition lock for interlocking against concurrent resizing.
	 */
	hash_table->buckets = NULL;
	hash_table->size_log2 = 0;

	return hash_table;
}
303 :
/*
 * Detach from a hash table.  This frees backend-local resources associated
 * with the hash table, but the hash table will continue to exist until it is
 * either explicitly destroyed (by a backend that is still attached to it), or
 * the area that backs it is returned to the operating system.
 */
void
dshash_detach(dshash_table *hash_table)
{
	ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);

	/*
	 * The hash table may already have been destroyed by another backend, so
	 * deliberately don't check the magic number.  Just free local memory.
	 */
	pfree(hash_table);
}
318 :
/*
 * Destroy a hash table, returning all memory to the area.  The caller must be
 * certain that no other backend will attempt to access the hash table before
 * calling this function.  Other backends must explicitly call dshash_detach
 * to free up backend-local memory associated with the hash table.  The
 * backend that calls dshash_destroy must not call dshash_detach.
 */
void
dshash_destroy(dshash_table *hash_table)
{
	size_t		size;
	size_t		i;

	Assert(hash_table->control->magic == DSHASH_MAGIC);
	ensure_valid_bucket_pointers(hash_table);

	/* Free all the entries by walking every bucket chain. */
	size = NUM_BUCKETS(hash_table->size_log2);
	for (i = 0; i < size; ++i)
	{
		dsa_pointer item_pointer = hash_table->buckets[i];

		while (DsaPointerIsValid(item_pointer))
		{
			dshash_table_item *item;
			dsa_pointer next_item_pointer;

			item = dsa_get_address(hash_table->area, item_pointer);
			/* Capture the next pointer before freeing this item. */
			next_item_pointer = item->next;
			dsa_free(hash_table->area, item_pointer);
			item_pointer = next_item_pointer;
		}
	}

	/*
	 * Vandalize the control block to help catch programming errors where
	 * other backends access the memory formerly occupied by this hash table.
	 */
	hash_table->control->magic = 0;

	/* Free the active table and control object. */
	dsa_free(hash_table->area, hash_table->control->buckets);
	dsa_free(hash_table->area, hash_table->control->handle);

	pfree(hash_table);
}
365 :
/*
 * Get a handle that can be used by other processes to attach to this hash
 * table.  The handle is simply the dsa_pointer of the control object.
 */
dshash_table_handle
dshash_get_hash_table_handle(dshash_table *hash_table)
{
	Assert(hash_table->control->magic == DSHASH_MAGIC);

	return hash_table->control->handle;
}
377 :
/*
 * Look up an entry, given a key.  Returns a pointer to an entry if one can be
 * found with the given key.  Returns NULL if the key is not found.  If a
 * non-NULL value is returned, the entry is locked and must be released by
 * calling dshash_release_lock.  If an error is raised before
 * dshash_release_lock is called, the lock will be released automatically, but
 * the caller must take care to ensure that the entry is not left corrupted.
 * The lock mode is either shared or exclusive depending on 'exclusive'.
 *
 * The caller must not hold a lock already.
 *
 * Note that the lock held is in fact an LWLock, so interrupts will be held on
 * return from this function, and not resumed until dshash_release_lock is
 * called.  It is a very good idea for the caller to release the lock quickly.
 */
void *
dshash_find(dshash_table *hash_table, const void *key, bool exclusive)
{
	dshash_hash hash;
	size_t		partition;
	dshash_table_item *item;

	hash = hash_key(hash_table, key);
	/* The high-order hash bits select the lock partition. */
	partition = PARTITION_FOR_HASH(hash);

	Assert(hash_table->control->magic == DSHASH_MAGIC);
	ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);

	LWLockAcquire(PARTITION_LOCK(hash_table, partition),
				  exclusive ? LW_EXCLUSIVE : LW_SHARED);
	/* Refresh our cached bucket array pointer, now that we hold the lock. */
	ensure_valid_bucket_pointers(hash_table);

	/* Search the active bucket. */
	item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));

	if (!item)
	{
		/* Not found. */
		LWLockRelease(PARTITION_LOCK(hash_table, partition));
		return NULL;
	}
	else
	{
		/* The caller will free the lock by calling dshash_release_lock. */
		return ENTRY_FROM_ITEM(item);
	}
}
425 :
/*
 * Find an existing entry in a dshash_table, or insert a new one.
 *
 * DSHASH_INSERT_NO_OOM causes this function to return NULL when no memory is
 * available for the new entry.  Otherwise, such allocations will result in
 * an ERROR.
 *
 * Any entry returned by this function is exclusively locked, and the caller
 * must release that lock using dshash_release_lock.  Notes above dshash_find()
 * regarding locking and error handling equally apply here.
 *
 * On return, *found is set to true if an existing entry was found in the
 * hash table, and otherwise false.
 */
void *
dshash_find_or_insert_extended(dshash_table *hash_table,
							   const void *key,
							   bool *found,
							   int flags)
{
	dshash_hash hash;
	size_t		partition_index;
	dshash_partition *partition;
	dshash_table_item *item;

	hash = hash_key(hash_table, key);
	partition_index = PARTITION_FOR_HASH(hash);
	partition = &hash_table->control->partitions[partition_index];

	Assert(hash_table->control->magic == DSHASH_MAGIC);
	ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);

restart:
	LWLockAcquire(PARTITION_LOCK(hash_table, partition_index),
				  LW_EXCLUSIVE);
	/* Must re-validate cached bucket pointers after (re)acquiring the lock. */
	ensure_valid_bucket_pointers(hash_table);

	/* Search the active bucket. */
	item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));

	if (item)
		*found = true;
	else
	{
		*found = false;

		/* Check if we are getting too full. */
		if (partition->count > MAX_COUNT_PER_PARTITION(hash_table))
		{
			/*
			 * The load factor (= keys / buckets) for all buckets protected by
			 * this partition is > 0.75.  Presumably the same applies
			 * generally across the whole hash table (though we don't attempt
			 * to track that directly to avoid contention on some kind of
			 * central counter; we just assume that this partition is
			 * representative).  This is a good time to resize.
			 *
			 * Give up our existing lock first, because resizing needs to
			 * reacquire all the locks in the right order to avoid deadlocks.
			 */
			LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
			if (!resize(hash_table, hash_table->size_log2 + 1, flags))
			{
				/* Resize can only fail when NO_OOM allocation fails. */
				Assert((flags & DSHASH_INSERT_NO_OOM) != 0);
				return NULL;
			}

			goto restart;
		}

		/* Finally we can try to insert the new item. */
		item = insert_into_bucket(hash_table, key,
								  &BUCKET_FOR_HASH(hash_table, hash),
								  flags);
		if (item == NULL)
		{
			Assert((flags & DSHASH_INSERT_NO_OOM) != 0);
			LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
			return NULL;
		}
		item->hash = hash;
		/* Adjust per-lock-partition counter for load factor knowledge. */
		++partition->count;
	}

	/* The caller must release the lock with dshash_release_lock. */
	return ENTRY_FROM_ITEM(item);
}
515 :
/*
 * Remove an entry by key.  Returns true if the key was found and the
 * corresponding entry was removed.
 *
 * To delete an entry that you already have a pointer to, see
 * dshash_delete_entry.
 */
bool
dshash_delete_key(dshash_table *hash_table, const void *key)
{
	dshash_hash hash;
	size_t		partition;
	bool		found;

	Assert(hash_table->control->magic == DSHASH_MAGIC);
	ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);

	hash = hash_key(hash_table, key);
	partition = PARTITION_FOR_HASH(hash);

	LWLockAcquire(PARTITION_LOCK(hash_table, partition), LW_EXCLUSIVE);
	ensure_valid_bucket_pointers(hash_table);

	if (delete_key_from_bucket(hash_table, key,
							   &BUCKET_FOR_HASH(hash_table, hash)))
	{
		/* Keep the partition's load-tracking counter in sync. */
		Assert(hash_table->control->partitions[partition].count > 0);
		found = true;
		--hash_table->control->partitions[partition].count;
	}
	else
		found = false;

	LWLockRelease(PARTITION_LOCK(hash_table, partition));

	return found;
}
553 :
/*
 * Remove an entry.  The entry must already be exclusively locked, and must
 * have been obtained by dshash_find or dshash_find_or_insert.  Note that this
 * function releases the lock just like dshash_release_lock.
 *
 * To delete an entry by key, see dshash_delete_key.
 */
void
dshash_delete_entry(dshash_table *hash_table, void *entry)
{
	dshash_table_item *item = ITEM_FROM_ENTRY(entry);
	/* The item's stored hash identifies the partition whose lock we hold. */
	size_t		partition = PARTITION_FOR_HASH(item->hash);

	Assert(hash_table->control->magic == DSHASH_MAGIC);
	Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition),
								LW_EXCLUSIVE));

	delete_item(hash_table, item);
	LWLockRelease(PARTITION_LOCK(hash_table, partition));
}
574 :
/*
 * Unlock an entry which was locked by dshash_find or dshash_find_or_insert.
 */
void
dshash_release_lock(dshash_table *hash_table, void *entry)
{
	dshash_table_item *item = ITEM_FROM_ENTRY(entry);
	/* The stored hash tells us which partition's lock we are holding. */
	size_t		partition_index = PARTITION_FOR_HASH(item->hash);

	Assert(hash_table->control->magic == DSHASH_MAGIC);

	LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
}
588 :
/*
 * A compare function that forwards to memcmp.  'arg' is ignored.
 */
int
dshash_memcmp(const void *a, const void *b, size_t size, void *arg)
{
	int			result;

	result = memcmp(a, b, size);
	return result;
}
597 :
/*
 * A hash function that forwards to tag_hash.  'v' points to a fixed-size
 * binary key of 'size' bytes; 'arg' is ignored.
 */
dshash_hash
dshash_memhash(const void *v, size_t size, void *arg)
{
	return tag_hash(v, size);
}
606 :
/*
 * A copy function that forwards to memcpy.  Copies 'size' bytes from 'src'
 * to 'dest'; 'arg' is ignored.
 */
void
dshash_memcpy(void *dest, const void *src, size_t size, void *arg)
{
	memcpy(dest, src, size);
}
615 :
/*
 * A compare function that forwards to strcmp.  Both keys must be
 * NUL-terminated strings shorter than 'size' bytes; 'arg' is ignored.
 */
int
dshash_strcmp(const void *a, const void *b, size_t size, void *arg)
{
	const char *sa = (const char *) a;
	const char *sb = (const char *) b;

	Assert(strlen(sa) < size);
	Assert(strlen(sb) < size);

	return strcmp(sa, sb);
}
627 :
628 : /*
629 : * A hash function that forwards to string_hash.
630 : */
631 : dshash_hash
632 204 : dshash_strhash(const void *v, size_t size, void *arg)
633 : {
634 : Assert(strlen((const char *) v) < size);
635 :
636 204 : return string_hash((const char *) v, size);
637 : }
638 :
/*
 * A copy function that forwards to strcpy.  'src' must be a NUL-terminated
 * string shorter than 'size' bytes, so the copy cannot overflow; 'arg' is
 * ignored.
 */
void
dshash_strcpy(void *dest, const void *src, size_t size, void *arg)
{
	const char *s = (const char *) src;

	Assert(strlen(s) < size);

	strcpy((char *) dest, s);
}
649 :
650 : /*
651 : * Sequentially scan through dshash table and return all the elements one by
652 : * one, return NULL when all elements have been returned.
653 : *
654 : * dshash_seq_term needs to be called when a scan finished. The caller may
655 : * delete returned elements midst of a scan by using dshash_delete_current()
656 : * if exclusive = true.
657 : */
658 : void
659 1125 : dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
660 : bool exclusive)
661 : {
662 1125 : status->hash_table = hash_table;
663 1125 : status->curbucket = 0;
664 1125 : status->nbuckets = 0;
665 1125 : status->curitem = NULL;
666 1125 : status->pnextitem = InvalidDsaPointer;
667 1125 : status->curpartition = -1;
668 1125 : status->exclusive = exclusive;
669 1125 : }
670 :
/*
 * Returns the next element.
 *
 * Returned elements are locked and the caller may not release the lock.  It
 * is released by future calls to dshash_seq_next() or dshash_seq_term().
 */
void *
dshash_seq_next(dshash_seq_status *status)
{
	dsa_pointer next_item_pointer;

	/*
	 * Not yet holding any partition locks.  Need to determine the size of the
	 * hash table, it could have been resized since we were looking last.
	 * Since we iterate in partition order, we can start by unconditionally
	 * locking partition 0.
	 *
	 * Once we hold the lock, no resizing can happen until the scan ends.  So
	 * we don't need to repeatedly call ensure_valid_bucket_pointers().
	 */
	if (status->curpartition == -1)
	{
		Assert(status->curbucket == 0);
		ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(status->hash_table);

		status->curpartition = 0;

		LWLockAcquire(PARTITION_LOCK(status->hash_table,
									 status->curpartition),
					  status->exclusive ? LW_EXCLUSIVE : LW_SHARED);

		ensure_valid_bucket_pointers(status->hash_table);

		status->nbuckets =
			NUM_BUCKETS(status->hash_table->control->size_log2);
		next_item_pointer = status->hash_table->buckets[status->curbucket];
	}
	else
		next_item_pointer = status->pnextitem;

	Assert(LWLockHeldByMeInMode(PARTITION_LOCK(status->hash_table,
											   status->curpartition),
								status->exclusive ? LW_EXCLUSIVE : LW_SHARED));

	/* Move to the next bucket if we finished the current bucket */
	while (!DsaPointerIsValid(next_item_pointer))
	{
		int			next_partition;

		if (++status->curbucket >= status->nbuckets)
		{
			/* all buckets have been scanned.  finish. */
			return NULL;
		}

		/* Check whether we need to move to the next partition. */
		next_partition =
			PARTITION_FOR_BUCKET_INDEX(status->curbucket,
									   status->hash_table->size_log2);

		if (status->curpartition != next_partition)
		{
			/*
			 * Move to the next partition.  Lock the next partition then
			 * release the current, not in the reverse order, to avoid
			 * concurrent resizing.  Avoid deadlock by taking the locks in
			 * the same order as resize().
			 */
			LWLockAcquire(PARTITION_LOCK(status->hash_table,
										 next_partition),
						  status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
			LWLockRelease(PARTITION_LOCK(status->hash_table,
										 status->curpartition));
			status->curpartition = next_partition;
		}

		next_item_pointer = status->hash_table->buckets[status->curbucket];
	}

	status->curitem =
		dsa_get_address(status->hash_table->area, next_item_pointer);

	/*
	 * The caller may delete the item.  Store the next item in case of
	 * deletion.
	 */
	status->pnextitem = status->curitem->next;

	return ENTRY_FROM_ITEM(status->curitem);
}
761 :
762 : /*
763 : * Terminates the seqscan and release all locks.
764 : *
765 : * Needs to be called after finishing or when exiting a seqscan.
766 : */
767 : void
768 1125 : dshash_seq_term(dshash_seq_status *status)
769 : {
770 1125 : if (status->curpartition >= 0)
771 1125 : LWLockRelease(PARTITION_LOCK(status->hash_table, status->curpartition));
772 1125 : }
773 :
/*
 * Remove the current entry of the seq scan.  Requires an exclusive scan.
 *
 * Unlike dshash_delete_entry(), the partition lock is NOT released here; the
 * scan keeps it until dshash_seq_next()/dshash_seq_term().
 */
void
dshash_delete_current(dshash_seq_status *status)
{
	dshash_table *hash_table = status->hash_table;
	dshash_table_item *item = status->curitem;
	size_t		partition PG_USED_FOR_ASSERTS_ONLY;

	partition = PARTITION_FOR_HASH(item->hash);

	Assert(status->exclusive);
	Assert(hash_table->control->magic == DSHASH_MAGIC);
	Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition),
								LW_EXCLUSIVE));

	delete_item(hash_table, item);
}
793 :
/*
 * Print debugging information about the internal state of the hash table to
 * stderr.  The caller must hold no partition locks; all partitions are
 * share-locked here to get a consistent snapshot.
 */
void
dshash_dump(dshash_table *hash_table)
{
	size_t		i;
	size_t		j;

	Assert(hash_table->control->magic == DSHASH_MAGIC);
	ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);

	for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
	{
		Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
		LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_SHARED);
	}

	ensure_valid_bucket_pointers(hash_table);

	fprintf(stderr,
			"hash table size = %zu\n", (size_t) 1 << hash_table->size_log2);
	for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
	{
		dshash_partition *partition = &hash_table->control->partitions[i];
		size_t		begin = BUCKET_INDEX_FOR_PARTITION(i, hash_table->size_log2);
		size_t		end = BUCKET_INDEX_FOR_PARTITION(i + 1, hash_table->size_log2);

		fprintf(stderr, "  partition %zu\n", i);
		fprintf(stderr,
				"    active buckets (key count = %zu)\n", partition->count);

		/* Walk each of this partition's bucket chains, counting items. */
		for (j = begin; j < end; ++j)
		{
			size_t		count = 0;
			dsa_pointer bucket = hash_table->buckets[j];

			while (DsaPointerIsValid(bucket))
			{
				dshash_table_item *item;

				item = dsa_get_address(hash_table->area, bucket);

				bucket = item->next;
				++count;
			}
			fprintf(stderr, "      bucket %zu (key count = %zu)\n", j, count);
		}
	}

	for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
		LWLockRelease(PARTITION_LOCK(hash_table, i));
}
848 :
/*
 * Delete a locked item to which we have a pointer.  The caller must hold the
 * item's partition lock; the lock is not released here.
 */
static void
delete_item(dshash_table *hash_table, dshash_table_item *item)
{
	size_t		hash = item->hash;
	size_t		partition = PARTITION_FOR_HASH(hash);

	Assert(LWLockHeldByMe(PARTITION_LOCK(hash_table, partition)));

	if (delete_item_from_bucket(hash_table, item,
								&BUCKET_FOR_HASH(hash_table, hash)))
	{
		Assert(hash_table->control->partitions[partition].count > 0);
		--hash_table->control->partitions[partition].count;
	}
	else
	{
		/* The item must be in its bucket; otherwise the pointer was bogus. */
		Assert(false);
	}
}
871 :
/*
 * Grow the hash table if necessary to the requested number of buckets.  The
 * requested size must be double some previously observed size.
 *
 * If an out-of-memory condition is observed, this function returns false if
 * flags includes DSHASH_INSERT_NO_OOM, and otherwise throws an ERROR.  In all
 * other cases, it returns true.
 *
 * Must be called without any partition lock held.
 */
static bool
resize(dshash_table *hash_table, size_t new_size_log2, int flags)
{
	dsa_pointer old_buckets;
	dsa_pointer new_buckets_shared;
	dsa_pointer *new_buckets;
	size_t		size;
	size_t		new_size = ((size_t) 1) << new_size_log2;
	size_t		i;
	int			dsa_flags = DSA_ALLOC_HUGE | DSA_ALLOC_ZERO;

	/*
	 * Acquire the locks for all lock partitions.  This is expensive, but we
	 * shouldn't have to do it many times.
	 *
	 * Locks are always taken in ascending partition order (here and by other
	 * resizers), so checking size_log2 right after acquiring lock 0 is
	 * sufficient: any concurrent resize must have held lock 0 to complete,
	 * and once we hold it the table size cannot change under us.
	 */
	for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
	{
		Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));

		LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_EXCLUSIVE);
		if (i == 0 && hash_table->control->size_log2 >= new_size_log2)
		{
			/*
			 * Another backend has already increased the size; we can avoid
			 * obtaining all the locks and return early.
			 */
			LWLockRelease(PARTITION_LOCK(hash_table, 0));
			return true;
		}
	}

	/* Growth is strictly by doubling, so we must be going up exactly one. */
	Assert(new_size_log2 == hash_table->control->size_log2 + 1);

	/* Allocate the space for the new table. */
	if (flags & DSHASH_INSERT_NO_OOM)
		dsa_flags |= DSA_ALLOC_NO_OOM;
	new_buckets_shared =
		dsa_allocate_extended(hash_table->area,
							  sizeof(dsa_pointer) * new_size,
							  dsa_flags);

	/* If DSHASH_INSERT_NO_OOM was specified, allocation may have failed. */
	if (!DsaPointerIsValid(new_buckets_shared))
	{
		/* Release all the locks and return without resizing. */
		for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
			LWLockRelease(PARTITION_LOCK(hash_table, i));
		return false;
	}

	/*
	 * We've allocated the new bucket array; all that remains to do now is to
	 * reinsert all items, which amounts to adjusting all the pointers.
	 * No items are copied or freed: each item's envelope is simply relinked
	 * into its new bucket, computed from its stored hash at the new size.
	 */
	new_buckets = dsa_get_address(hash_table->area, new_buckets_shared);
	size = ((size_t) 1) << hash_table->control->size_log2;
	for (i = 0; i < size; ++i)
	{
		dsa_pointer item_pointer = hash_table->buckets[i];

		while (DsaPointerIsValid(item_pointer))
		{
			dshash_table_item *item;
			dsa_pointer next_item_pointer;

			item = dsa_get_address(hash_table->area, item_pointer);
			/* Save the successor before relinking clobbers item->next. */
			next_item_pointer = item->next;
			insert_item_into_bucket(hash_table, item_pointer, item,
									&new_buckets[BUCKET_INDEX_FOR_HASH_AND_SIZE(item->hash,
																				new_size_log2)]);
			item_pointer = next_item_pointer;
		}
	}

	/*
	 * Swap the hash table into place and free the old one.  Freeing the old
	 * bucket array immediately is safe because we hold every partition lock,
	 * so no other backend can be traversing it concurrently; backends with a
	 * stale cached pointer refresh it via ensure_valid_bucket_pointers().
	 */
	old_buckets = hash_table->control->buckets;
	hash_table->control->buckets = new_buckets_shared;
	hash_table->control->size_log2 = new_size_log2;
	hash_table->buckets = new_buckets;
	dsa_free(hash_table->area, old_buckets);

	/* Release all the locks. */
	for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
		LWLockRelease(PARTITION_LOCK(hash_table, i));

	return true;
}
969 :
970 : /*
971 : * Make sure that our backend-local bucket pointers are up to date. The
972 : * caller must have locked one lock partition, which prevents resize() from
973 : * running concurrently.
974 : */
975 : static inline void
976 1733385 : ensure_valid_bucket_pointers(dshash_table *hash_table)
977 : {
978 1733385 : if (hash_table->size_log2 != hash_table->control->size_log2)
979 : {
980 58124 : hash_table->buckets = dsa_get_address(hash_table->area,
981 29062 : hash_table->control->buckets);
982 29062 : hash_table->size_log2 = hash_table->control->size_log2;
983 : }
984 1733385 : }
985 :
986 : /*
987 : * Scan a locked bucket for a match, using the provided compare function.
988 : */
989 : static inline dshash_table_item *
990 1731993 : find_in_bucket(dshash_table *hash_table, const void *key,
991 : dsa_pointer item_pointer)
992 : {
993 1962825 : while (DsaPointerIsValid(item_pointer))
994 : {
995 : dshash_table_item *item;
996 :
997 1285886 : item = dsa_get_address(hash_table->area, item_pointer);
998 1285886 : if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
999 1055054 : return item;
1000 230832 : item_pointer = item->next;
1001 : }
1002 676939 : return NULL;
1003 : }
1004 :
1005 : /*
1006 : * Insert an already-allocated item into a bucket.
1007 : */
1008 : static void
1009 521568 : insert_item_into_bucket(dshash_table *hash_table,
1010 : dsa_pointer item_pointer,
1011 : dshash_table_item *item,
1012 : dsa_pointer *bucket)
1013 : {
1014 : Assert(item == dsa_get_address(hash_table->area, item_pointer));
1015 :
1016 521568 : item->next = *bucket;
1017 521568 : *bucket = item_pointer;
1018 521568 : }
1019 :
1020 : /*
1021 : * Allocate space for an entry with the given key and insert it into the
1022 : * provided bucket. Returns NULL if out of memory and DSHASH_INSERT_NO_OOM
1023 : * was specified in flags.
1024 : */
1025 : static dshash_table_item *
1026 392796 : insert_into_bucket(dshash_table *hash_table,
1027 : const void *key,
1028 : dsa_pointer *bucket,
1029 : int flags)
1030 : {
1031 : dsa_pointer item_pointer;
1032 : dshash_table_item *item;
1033 : int dsa_flags;
1034 :
1035 392796 : dsa_flags = (flags & DSHASH_INSERT_NO_OOM) ? DSA_ALLOC_NO_OOM : 0;
1036 392796 : item_pointer = dsa_allocate_extended(hash_table->area,
1037 392796 : hash_table->params.entry_size +
1038 : MAXALIGN(sizeof(dshash_table_item)),
1039 : dsa_flags);
1040 392796 : if (!DsaPointerIsValid(item_pointer))
1041 0 : return NULL;
1042 392796 : item = dsa_get_address(hash_table->area, item_pointer);
1043 392796 : copy_key(hash_table, ENTRY_FROM_ITEM(item), key);
1044 392796 : insert_item_into_bucket(hash_table, item_pointer, item, bucket);
1045 392796 : return item;
1046 : }
1047 :
1048 : /*
1049 : * Search a bucket for a matching key and delete it.
1050 : */
1051 : static bool
1052 267 : delete_key_from_bucket(dshash_table *hash_table,
1053 : const void *key,
1054 : dsa_pointer *bucket_head)
1055 : {
1056 267 : while (DsaPointerIsValid(*bucket_head))
1057 : {
1058 : dshash_table_item *item;
1059 :
1060 140 : item = dsa_get_address(hash_table->area, *bucket_head);
1061 :
1062 140 : if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
1063 : {
1064 : dsa_pointer next;
1065 :
1066 140 : next = item->next;
1067 140 : dsa_free(hash_table->area, *bucket_head);
1068 140 : *bucket_head = next;
1069 :
1070 140 : return true;
1071 : }
1072 0 : bucket_head = &item->next;
1073 : }
1074 127 : return false;
1075 : }
1076 :
1077 : /*
1078 : * Delete the specified item from the bucket.
1079 : */
1080 : static bool
1081 71315 : delete_item_from_bucket(dshash_table *hash_table,
1082 : dshash_table_item *item,
1083 : dsa_pointer *bucket_head)
1084 : {
1085 71868 : while (DsaPointerIsValid(*bucket_head))
1086 : {
1087 : dshash_table_item *bucket_item;
1088 :
1089 71868 : bucket_item = dsa_get_address(hash_table->area, *bucket_head);
1090 :
1091 71868 : if (bucket_item == item)
1092 : {
1093 : dsa_pointer next;
1094 :
1095 71315 : next = item->next;
1096 71315 : dsa_free(hash_table->area, *bucket_head);
1097 71315 : *bucket_head = next;
1098 71315 : return true;
1099 : }
1100 553 : bucket_head = &bucket_item->next;
1101 : }
1102 0 : return false;
1103 : }
1104 :
1105 : /*
1106 : * Compute the hash value for a key.
1107 : */
1108 : static inline dshash_hash
1109 1728655 : hash_key(dshash_table *hash_table, const void *key)
1110 : {
1111 1728655 : return hash_table->params.hash_function(key,
1112 : hash_table->params.key_size,
1113 : hash_table->arg);
1114 : }
1115 :
1116 : /*
1117 : * Check whether two keys compare equal.
1118 : */
1119 : static inline bool
1120 1286026 : equal_keys(dshash_table *hash_table, const void *a, const void *b)
1121 : {
1122 1286026 : return hash_table->params.compare_function(a, b,
1123 : hash_table->params.key_size,
1124 1286026 : hash_table->arg) == 0;
1125 : }
1126 :
1127 : /*
1128 : * Copy a key.
1129 : */
1130 : static inline void
1131 392796 : copy_key(dshash_table *hash_table, void *dest, const void *src)
1132 : {
1133 392796 : hash_table->params.copy_function(dest, src,
1134 : hash_table->params.key_size,
1135 : hash_table->arg);
1136 392796 : }
|