Line data Source code
1 : /*
2 : * hashfn_unstable.h
3 : *
4 : * Building blocks for creating fast inlineable hash functions. The
5 : * functions in this file are not guaranteed to be stable between versions,
6 : * and may differ by hardware platform. Hence they must not be used in
7 : * indexes or other on-disk structures. See hashfn.h if you need stability.
8 : *
9 : *
10 : * Portions Copyright (c) 2024-2026, PostgreSQL Global Development Group
11 : *
12 : * src/include/common/hashfn_unstable.h
13 : */
14 : #ifndef HASHFN_UNSTABLE_H
15 : #define HASHFN_UNSTABLE_H
16 :
17 :
18 : /*
19 : * fasthash is a modification of code taken from
20 : * https://code.google.com/archive/p/fast-hash/source/default/source
21 : * under the terms of the MIT license. The original copyright
22 : * notice follows:
23 : */
24 :
25 : /*
26 : * The MIT License
27 : *
28 : * Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com)
29 : *
30 : * Permission is hereby granted, free of charge, to any person
31 : * obtaining a copy of this software and associated documentation
32 : * files (the "Software"), to deal in the Software without
33 : * restriction, including without limitation the rights to use, copy,
34 : * modify, merge, publish, distribute, sublicense, and/or sell copies
35 : * of the Software, and to permit persons to whom the Software is
36 : * furnished to do so, subject to the following conditions:
37 : *
38 : * The above copyright notice and this permission notice shall be
39 : * included in all copies or substantial portions of the Software.
40 : *
41 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
42 : * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
43 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
44 : * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
45 : * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
46 : * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
47 : * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
48 : * SOFTWARE.
49 : */
50 :
51 : /*
52 : * fasthash as implemented here has two interfaces:
53 : *
54 : * 1) Standalone functions that take a single input.
55 : *
56 : * 2) Incremental interface. This can used for incorporating multiple
57 : * inputs. First, initialize the hash state (here with a zero seed):
58 : *
59 : * fasthash_state hs;
60 : * fasthash_init(&hs, 0);
61 : *
62 : * Next, accumulate input into the hash state.
63 : * If the inputs are of types that can be trivially cast to uint64, it's
64 : * sufficient to do:
65 : *
66 : * hs.accum = value1;
67 : * fasthash_combine(&hs);
68 : * hs.accum = value2;
69 : * fasthash_combine(&hs);
70 : * ...
71 : *
72 : * For longer or variable-length input, fasthash_accum() is a more
73 : * flexible, but more verbose method. The standalone functions use this
74 : * internally, so see fasthash64() for an example of this.
75 : *
76 : * After all inputs have been mixed in, finalize the hash and optionally
77 : * reduce to 32 bits. If all inputs are fixed-length, it's sufficient
78 : * to pass zero for the tweak:
79 : *
80 : * hashcode = fasthash_final32(&hs, 0);
81 : *
82 : * For variable length input, experimentation has found that SMHasher
83 : * fails unless we pass the length for the tweak. When accumulating
84 : * multiple varlen values, it's probably safest to calculate a tweak
85 : * such that the bits of all individual lengths are present, for example:
86 : *
87 : * lengths = len1 + (len2 << 10) + (len3 << 20);
88 : * hashcode = fasthash_final32(&hs, lengths);
89 : *
90 : * The incremental interface allows an optimization for NUL-terminated
91 : * C strings:
92 : *
93 : * len = fasthash_accum_cstring(&hs, str);
94 : * hashcode = fasthash_final32(&hs, len);
95 : *
96 : * By computing the length on-the-fly, we can avoid needing a strlen()
97 : * call to tell us how many bytes to hash.
98 : */
99 :
100 :
101 : typedef struct fasthash_state
102 : {
103 : /* staging area for chunks of input */
104 : uint64 accum;
105 :
106 : uint64 hash;
107 : } fasthash_state;
108 :
109 : #define FH_SIZEOF_ACCUM sizeof(uint64)
110 :
111 :
112 : /*
113 : * Initialize the hash state.
114 : *
115 : * 'seed' can be zero.
116 : */
117 : static inline void
118 10804536 : fasthash_init(fasthash_state *hs, uint64 seed)
119 : {
120 10804536 : memset(hs, 0, sizeof(fasthash_state));
121 10804536 : hs->hash = seed ^ 0x880355f21e6d1965;
122 10804536 : }
123 :
124 : /* both the finalizer and part of the combining step */
125 : static inline uint64
126 32443689 : fasthash_mix(uint64 h, uint64 tweak)
127 : {
128 32443689 : h ^= (h >> 23) + tweak;
129 32443689 : h *= 0x2127599bf4325c37;
130 32443689 : h ^= h >> 47;
131 32443689 : return h;
132 : }
133 :
134 : /* combine one chunk of input into the hash */
135 : static inline void
136 21639153 : fasthash_combine(fasthash_state *hs)
137 : {
138 21639153 : hs->hash ^= fasthash_mix(hs->accum, 0);
139 21639153 : hs->hash *= 0x880355f21e6d1965;
140 21639153 : }
141 :
142 : /* accumulate up to 8 bytes of input and combine it into the hash */
143 : static inline void
144 30540210 : fasthash_accum(fasthash_state *hs, const char *k, size_t len)
145 : {
146 : uint32 lower_four;
147 :
148 : Assert(len <= FH_SIZEOF_ACCUM);
149 30540210 : hs->accum = 0;
150 :
151 : /*
152 : * For consistency, bytewise loads must match the platform's endianness.
153 : */
154 : #ifdef WORDS_BIGENDIAN
155 : switch (len)
156 : {
157 : case 8:
158 : memcpy(&hs->accum, k, 8);
159 : break;
160 : case 7:
161 : hs->accum |= (uint64) k[6] << 8;
162 : pg_fallthrough;
163 : case 6:
164 : hs->accum |= (uint64) k[5] << 16;
165 : pg_fallthrough;
166 : case 5:
167 : hs->accum |= (uint64) k[4] << 24;
168 : pg_fallthrough;
169 : case 4:
170 : memcpy(&lower_four, k, sizeof(lower_four));
171 : hs->accum |= (uint64) lower_four << 32;
172 : break;
173 : case 3:
174 : hs->accum |= (uint64) k[2] << 40;
175 : pg_fallthrough;
176 : case 2:
177 : hs->accum |= (uint64) k[1] << 48;
178 : pg_fallthrough;
179 : case 1:
180 : hs->accum |= (uint64) k[0] << 56;
181 : break;
182 : case 0:
183 : return;
184 : }
185 : #else
186 30540210 : switch (len)
187 : {
188 19696776 : case 8:
189 19696776 : memcpy(&hs->accum, k, 8);
190 19696776 : break;
191 190399 : case 7:
192 190399 : hs->accum |= (uint64) k[6] << 48;
193 : pg_fallthrough;
194 265129 : case 6:
195 265129 : hs->accum |= (uint64) k[5] << 40;
196 : pg_fallthrough;
197 282302 : case 5:
198 282302 : hs->accum |= (uint64) k[4] << 32;
199 : pg_fallthrough;
200 402066 : case 4:
201 402066 : memcpy(&lower_four, k, sizeof(lower_four));
202 402066 : hs->accum |= lower_four;
203 402066 : break;
204 264171 : case 3:
205 264171 : hs->accum |= (uint64) k[2] << 16;
206 : pg_fallthrough;
207 416648 : case 2:
208 416648 : hs->accum |= (uint64) k[1] << 8;
209 : pg_fallthrough;
210 592980 : case 1:
211 592980 : hs->accum |= (uint64) k[0];
212 592980 : break;
213 9848388 : case 0:
214 9848388 : return;
215 : }
216 : #endif
217 :
218 20691822 : fasthash_combine(hs);
219 : }
220 :
221 : /*
222 : * Set high bit in lowest byte where the input is zero, from:
223 : * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
224 : */
225 : #define haszero64(v) \
226 : (((v) - 0x0101010101010101) & ~(v) & 0x8080808080808080)
227 :
228 : /*
229 : * all-purpose workhorse for fasthash_accum_cstring
230 : */
231 : static inline size_t
232 1111937 : fasthash_accum_cstring_unaligned(fasthash_state *hs, const char *str)
233 : {
234 1111937 : const char *const start = str;
235 :
236 2106983 : while (*str)
237 : {
238 995046 : size_t chunk_len = 0;
239 :
240 4614939 : while (chunk_len < FH_SIZEOF_ACCUM && str[chunk_len] != '\0')
241 3619893 : chunk_len++;
242 :
243 995046 : fasthash_accum(hs, str, chunk_len);
244 995046 : str += chunk_len;
245 : }
246 :
247 1111937 : return str - start;
248 : }
249 :
250 : /*
251 : * specialized workhorse for fasthash_accum_cstring
252 : *
253 : * With an aligned pointer, we consume the string a word at a time.
254 : * Loading the word containing the NUL terminator cannot segfault since
255 : * allocation boundaries are suitably aligned. To keep from setting
256 : * off alarms with address sanitizers, exclude this function from
257 : * such testing.
258 : */
259 : pg_attribute_no_sanitize_address()
260 : static inline size_t
261 1111937 : fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
262 : {
263 1111937 : const char *const start = str;
264 : size_t remainder;
265 : uint64 zero_byte_low;
266 :
267 : Assert(PointerIsAligned(start, uint64));
268 :
269 : /*
270 : * For every chunk of input, check for zero bytes before mixing into the
271 : * hash. The chunk with zeros must contain the NUL terminator.
272 : */
273 : for (;;)
274 839239 : {
275 1951176 : uint64 chunk = *(const uint64 *) str;
276 :
277 1951176 : zero_byte_low = haszero64(chunk);
278 1951176 : if (zero_byte_low)
279 1111937 : break;
280 :
281 839239 : hs->accum = chunk;
282 839239 : fasthash_combine(hs);
283 839239 : str += FH_SIZEOF_ACCUM;
284 : }
285 :
286 : /* mix in remaining bytes */
287 1111937 : remainder = fasthash_accum_cstring_unaligned(hs, str);
288 1111937 : str += remainder;
289 :
290 1111937 : return str - start;
291 : }
292 :
293 : /*
294 : * Mix 'str' into the hash state and return the length of the string.
295 : */
296 : static inline size_t
297 1111937 : fasthash_accum_cstring(fasthash_state *hs, const char *str)
298 : {
299 : #if SIZEOF_VOID_P >= 8
300 :
301 : size_t len;
302 : #ifdef USE_ASSERT_CHECKING
303 : size_t len_check;
304 : fasthash_state hs_check;
305 :
306 : memcpy(&hs_check, hs, sizeof(fasthash_state));
307 : len_check = fasthash_accum_cstring_unaligned(&hs_check, str);
308 : #endif
309 1111937 : if (PointerIsAligned(str, uint64))
310 : {
311 1111937 : len = fasthash_accum_cstring_aligned(hs, str);
312 : Assert(len_check == len);
313 : Assert(hs_check.hash == hs->hash);
314 1111937 : return len;
315 : }
316 : #endif /* SIZEOF_VOID_P */
317 :
318 : /*
319 : * It's not worth it to try to make the word-at-a-time optimization work
320 : * on 32-bit platforms.
321 : */
322 0 : return fasthash_accum_cstring_unaligned(hs, str);
323 : }
324 :
325 : /*
326 : * The finalizer
327 : *
328 : * 'tweak' is intended to be the input length when the caller doesn't know
329 : * the length ahead of time, such as for NUL-terminated strings, otherwise
330 : * zero.
331 : */
332 : static inline uint64
333 10804536 : fasthash_final64(fasthash_state *hs, uint64 tweak)
334 : {
335 10804536 : return fasthash_mix(hs->hash, tweak);
336 : }
337 :
338 : /*
339 : * Reduce a 64-bit hash to a 32-bit hash.
340 : *
341 : * This optional step provides a bit more additional mixing compared to
342 : * just taking the lower 32-bits.
343 : */
344 : static inline uint32
345 10804536 : fasthash_reduce32(uint64 h)
346 : {
347 : /*
348 : * Convert the 64-bit hashcode to Fermat residue, which shall retain
349 : * information from both the higher and lower parts of hashcode.
350 : */
351 10804536 : return h - (h >> 32);
352 : }
353 :
354 : /* finalize and reduce */
355 : static inline uint32
356 956148 : fasthash_final32(fasthash_state *hs, uint64 tweak)
357 : {
358 956148 : return fasthash_reduce32(fasthash_final64(hs, tweak));
359 : }
360 :
361 :
362 : /* Standalone functions */
363 :
364 : /*
365 : * The original fasthash64 function, re-implemented using the incremental
366 : * interface. Returns the same 64-bit hashcode as the original,
367 : * at least on little-endian machines. 'len' controls not only how
368 : * many bytes to hash, but also modifies the internal seed.
369 : * 'seed' can be zero.
370 : */
371 : static inline uint64
372 9848388 : fasthash64(const char *k, size_t len, uint64 seed)
373 : {
374 : fasthash_state hs;
375 :
376 9848388 : fasthash_init(&hs, 0);
377 :
378 : /* re-initialize the seed according to input length */
379 9848388 : hs.hash = seed ^ (len * 0x880355f21e6d1965);
380 :
381 29545164 : while (len >= FH_SIZEOF_ACCUM)
382 : {
383 19696776 : fasthash_accum(&hs, k, FH_SIZEOF_ACCUM);
384 19696776 : k += FH_SIZEOF_ACCUM;
385 19696776 : len -= FH_SIZEOF_ACCUM;
386 : }
387 :
388 9848388 : fasthash_accum(&hs, k, len);
389 :
390 : /*
391 : * Since we already mixed the input length into the seed, we can just pass
392 : * zero here. This matches upstream behavior as well.
393 : */
394 9848388 : return fasthash_final64(&hs, 0);
395 : }
396 :
397 : /* like fasthash64, but returns a 32-bit hashcode */
398 : static inline uint32
399 9848388 : fasthash32(const char *k, size_t len, uint64 seed)
400 : {
401 9848388 : return fasthash_reduce32(fasthash64(k, len, seed));
402 : }
403 :
404 : /*
405 : * Convenience function for hashing NUL-terminated strings
406 : *
407 : * Note: This is faster than, and computes a different result from,
408 : * "fasthash32(s, strlen(s))"
409 : */
410 : static inline uint32
411 399863 : hash_string(const char *s)
412 : {
413 : fasthash_state hs;
414 : size_t s_len;
415 :
416 399863 : fasthash_init(&hs, 0);
417 :
418 : /*
419 : * Combine string into the hash and save the length for tweaking the final
420 : * mix.
421 : */
422 399863 : s_len = fasthash_accum_cstring(&hs, s);
423 :
424 399863 : return fasthash_final32(&hs, s_len);
425 : }
426 :
427 : #endif /* HASHFN_UNSTABLE_H */
|