Line data Source code
1 : /*
2 : * hashfn_unstable.h
3 : *
4 : * Building blocks for creating fast inlineable hash functions. The
5 : * functions in this file are not guaranteed to be stable between versions,
6 : * and may differ by hardware platform. Hence they must not be used in
7 : * indexes or other on-disk structures. See hashfn.h if you need stability.
8 : *
9 : *
10 : * Portions Copyright (c) 2024-2026, PostgreSQL Global Development Group
11 : *
12 : * src/include/common/hashfn_unstable.h
13 : */
14 : #ifndef HASHFN_UNSTABLE_H
15 : #define HASHFN_UNSTABLE_H
16 :
17 :
18 : /*
19 : * fasthash is a modification of code taken from
20 : * https://code.google.com/archive/p/fast-hash/source/default/source
21 : * under the terms of the MIT license. The original copyright
22 : * notice follows:
23 : */
24 :
25 : /* The MIT License
26 :
27 : Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com)
28 :
29 : Permission is hereby granted, free of charge, to any person
30 : obtaining a copy of this software and associated documentation
31 : files (the "Software"), to deal in the Software without
32 : restriction, including without limitation the rights to use, copy,
33 : modify, merge, publish, distribute, sublicense, and/or sell copies
34 : of the Software, and to permit persons to whom the Software is
35 : furnished to do so, subject to the following conditions:
36 :
37 : The above copyright notice and this permission notice shall be
38 : included in all copies or substantial portions of the Software.
39 :
40 : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
41 : EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
42 : MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
43 : NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
44 : BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
45 : ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
46 : CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
47 : SOFTWARE.
48 : */
49 :
50 : /*
51 : * fasthash as implemented here has two interfaces:
52 : *
53 : * 1) Standalone functions that take a single input.
54 : *
55 : * 2) Incremental interface. This can used for incorporating multiple
56 : * inputs. First, initialize the hash state (here with a zero seed):
57 : *
58 : * fasthash_state hs;
59 : * fasthash_init(&hs, 0);
60 : *
61 : * Next, accumulate input into the hash state.
62 : * If the inputs are of types that can be trivially cast to uint64, it's
63 : * sufficient to do:
64 : *
65 : * hs.accum = value1;
66 : * fasthash_combine(&hs);
67 : * hs.accum = value2;
68 : * fasthash_combine(&hs);
69 : * ...
70 : *
71 : * For longer or variable-length input, fasthash_accum() is a more
72 : * flexible, but more verbose method. The standalone functions use this
73 : * internally, so see fasthash64() for an example of this.
74 : *
75 : * After all inputs have been mixed in, finalize the hash and optionally
76 : * reduce to 32 bits. If all inputs are fixed-length, it's sufficient
77 : * to pass zero for the tweak:
78 : *
79 : * hashcode = fasthash_final32(&hs, 0);
80 : *
81 : * For variable length input, experimentation has found that SMHasher
82 : * fails unless we pass the length for the tweak. When accumulating
83 : * multiple varlen values, it's probably safest to calculate a tweak
84 : * such that the bits of all individual lengths are present, for example:
85 : *
86 : * lengths = len1 + (len2 << 10) + (len3 << 20);
87 : * hashcode = fasthash_final32(&hs, lengths);
88 : *
89 : * The incremental interface allows an optimization for NUL-terminated
90 : * C strings:
91 : *
92 : * len = fasthash_accum_cstring(&hs, str);
93 : * hashcode = fasthash_final32(&hs, len);
94 : *
95 : * By computing the length on-the-fly, we can avoid needing a strlen()
96 : * call to tell us how many bytes to hash.
97 : */
98 :
99 :
100 : typedef struct fasthash_state
101 : {
102 : /* staging area for chunks of input */
103 : uint64 accum;
104 :
105 : uint64 hash;
106 : } fasthash_state;
107 :
108 : #define FH_SIZEOF_ACCUM sizeof(uint64)
109 :
110 :
111 : /*
112 : * Initialize the hash state.
113 : *
114 : * 'seed' can be zero.
115 : */
116 : static inline void
117 16199038 : fasthash_init(fasthash_state *hs, uint64 seed)
118 : {
119 16199038 : memset(hs, 0, sizeof(fasthash_state));
120 16199038 : hs->hash = seed ^ 0x880355f21e6d1965;
121 16199038 : }
122 :
123 : /* both the finalizer and part of the combining step */
124 : static inline uint64
125 48873072 : fasthash_mix(uint64 h, uint64 tweak)
126 : {
127 48873072 : h ^= (h >> 23) + tweak;
128 48873072 : h *= 0x2127599bf4325c37;
129 48873072 : h ^= h >> 47;
130 48873072 : return h;
131 : }
132 :
133 : /* combine one chunk of input into the hash */
134 : static inline void
135 32674034 : fasthash_combine(fasthash_state *hs)
136 : {
137 32674034 : hs->hash ^= fasthash_mix(hs->accum, 0);
138 32674034 : hs->hash *= 0x880355f21e6d1965;
139 32674034 : }
140 :
141 : /* accumulate up to 8 bytes of input and combine it into the hash */
142 : static inline void
143 46712006 : fasthash_accum(fasthash_state *hs, const char *k, size_t len)
144 : {
145 : uint32 lower_four;
146 :
147 : Assert(len <= FH_SIZEOF_ACCUM);
148 46712006 : hs->accum = 0;
149 :
150 : /*
151 : * For consistency, bytewise loads must match the platform's endianness.
152 : */
153 : #ifdef WORDS_BIGENDIAN
154 : switch (len)
155 : {
156 : case 8:
157 : memcpy(&hs->accum, k, 8);
158 : break;
159 : case 7:
160 : hs->accum |= (uint64) k[6] << 8;
161 : /* FALLTHROUGH */
162 : case 6:
163 : hs->accum |= (uint64) k[5] << 16;
164 : /* FALLTHROUGH */
165 : case 5:
166 : hs->accum |= (uint64) k[4] << 24;
167 : /* FALLTHROUGH */
168 : case 4:
169 : memcpy(&lower_four, k, sizeof(lower_four));
170 : hs->accum |= (uint64) lower_four << 32;
171 : break;
172 : case 3:
173 : hs->accum |= (uint64) k[2] << 40;
174 : /* FALLTHROUGH */
175 : case 2:
176 : hs->accum |= (uint64) k[1] << 48;
177 : /* FALLTHROUGH */
178 : case 1:
179 : hs->accum |= (uint64) k[0] << 56;
180 : break;
181 : case 0:
182 : return;
183 : }
184 : #else
185 46712006 : switch (len)
186 : {
187 30533432 : case 8:
188 30533432 : memcpy(&hs->accum, k, 8);
189 30533432 : break;
190 211966 : case 7:
191 211966 : hs->accum |= (uint64) k[6] << 48;
192 : /* FALLTHROUGH */
193 272422 : case 6:
194 272422 : hs->accum |= (uint64) k[5] << 40;
195 : /* FALLTHROUGH */
196 274870 : case 5:
197 274870 : hs->accum |= (uint64) k[4] << 32;
198 : /* FALLTHROUGH */
199 461722 : case 4:
200 461722 : memcpy(&lower_four, k, sizeof(lower_four));
201 461722 : hs->accum |= lower_four;
202 461722 : break;
203 417092 : case 3:
204 417092 : hs->accum |= (uint64) k[2] << 16;
205 : /* FALLTHROUGH */
206 445958 : case 2:
207 445958 : hs->accum |= (uint64) k[1] << 8;
208 : /* FALLTHROUGH */
209 450136 : case 1:
210 450136 : hs->accum |= (uint64) k[0];
211 450136 : break;
212 15266716 : case 0:
213 15266716 : return;
214 : }
215 : #endif
216 :
217 31445290 : fasthash_combine(hs);
218 : }
219 :
220 : /*
221 : * Set high bit in lowest byte where the input is zero, from:
222 : * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
223 : */
224 : #define haszero64(v) \
225 : (((v) - 0x0101010101010101) & ~(v) & 0x8080808080808080)
226 :
227 : /*
228 : * all-purpose workhorse for fasthash_accum_cstring
229 : */
230 : static inline size_t
231 932322 : fasthash_accum_cstring_unaligned(fasthash_state *hs, const char *str)
232 : {
233 932322 : const char *const start = str;
234 :
235 1844180 : while (*str)
236 : {
237 911858 : size_t chunk_len = 0;
238 :
239 4831190 : while (chunk_len < FH_SIZEOF_ACCUM && str[chunk_len] != '\0')
240 3919332 : chunk_len++;
241 :
242 911858 : fasthash_accum(hs, str, chunk_len);
243 911858 : str += chunk_len;
244 : }
245 :
246 932322 : return str - start;
247 : }
248 :
249 : /*
250 : * specialized workhorse for fasthash_accum_cstring
251 : *
252 : * With an aligned pointer, we consume the string a word at a time.
253 : * Loading the word containing the NUL terminator cannot segfault since
254 : * allocation boundaries are suitably aligned. To keep from setting
255 : * off alarms with address sanitizers, exclude this function from
256 : * such testing.
257 : */
258 : pg_attribute_no_sanitize_address()
259 : static inline size_t
260 932322 : fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
261 : {
262 932322 : const char *const start = str;
263 : size_t remainder;
264 : uint64 zero_byte_low;
265 :
266 : Assert(PointerIsAligned(start, uint64));
267 :
268 : /*
269 : * For every chunk of input, check for zero bytes before mixing into the
270 : * hash. The chunk with zeros must contain the NUL terminator.
271 : */
272 : for (;;)
273 1057866 : {
274 1990188 : uint64 chunk = *(uint64 *) str;
275 :
276 1990188 : zero_byte_low = haszero64(chunk);
277 1990188 : if (zero_byte_low)
278 932322 : break;
279 :
280 1057866 : hs->accum = chunk;
281 1057866 : fasthash_combine(hs);
282 1057866 : str += FH_SIZEOF_ACCUM;
283 : }
284 :
285 : /* mix in remaining bytes */
286 932322 : remainder = fasthash_accum_cstring_unaligned(hs, str);
287 932322 : str += remainder;
288 :
289 932322 : return str - start;
290 : }
291 :
292 : /*
293 : * Mix 'str' into the hash state and return the length of the string.
294 : */
295 : static inline size_t
296 932322 : fasthash_accum_cstring(fasthash_state *hs, const char *str)
297 : {
298 : #if SIZEOF_VOID_P >= 8
299 :
300 : size_t len;
301 : #ifdef USE_ASSERT_CHECKING
302 : size_t len_check;
303 : fasthash_state hs_check;
304 :
305 : memcpy(&hs_check, hs, sizeof(fasthash_state));
306 : len_check = fasthash_accum_cstring_unaligned(&hs_check, str);
307 : #endif
308 932322 : if (PointerIsAligned(str, uint64))
309 : {
310 932322 : len = fasthash_accum_cstring_aligned(hs, str);
311 : Assert(len_check == len);
312 : Assert(hs_check.hash == hs->hash);
313 932322 : return len;
314 : }
315 : #endif /* SIZEOF_VOID_P */
316 :
317 : /*
318 : * It's not worth it to try to make the word-at-a-time optimization work
319 : * on 32-bit platforms.
320 : */
321 0 : return fasthash_accum_cstring_unaligned(hs, str);
322 : }
323 :
324 : /*
325 : * The finalizer
326 : *
327 : * 'tweak' is intended to be the input length when the caller doesn't know
328 : * the length ahead of time, such as for NUL-terminated strings, otherwise
329 : * zero.
330 : */
331 : static inline uint64
332 16199038 : fasthash_final64(fasthash_state *hs, uint64 tweak)
333 : {
334 16199038 : return fasthash_mix(hs->hash, tweak);
335 : }
336 :
337 : /*
338 : * Reduce a 64-bit hash to a 32-bit hash.
339 : *
340 : * This optional step provides a bit more additional mixing compared to
341 : * just taking the lower 32-bits.
342 : */
343 : static inline uint32
344 16199038 : fasthash_reduce32(uint64 h)
345 : {
346 : /*
347 : * Convert the 64-bit hashcode to Fermat residue, which shall retain
348 : * information from both the higher and lower parts of hashcode.
349 : */
350 16199038 : return h - (h >> 32);
351 : }
352 :
353 : /* finalize and reduce */
354 : static inline uint32
355 932322 : fasthash_final32(fasthash_state *hs, uint64 tweak)
356 : {
357 932322 : return fasthash_reduce32(fasthash_final64(hs, tweak));
358 : }
359 :
360 :
361 : /* Standalone functions */
362 :
363 : /*
364 : * The original fasthash64 function, re-implemented using the incremental
365 : * interface. Returns the same 64-bit hashcode as the original,
366 : * at least on little-endian machines. 'len' controls not only how
367 : * many bytes to hash, but also modifies the internal seed.
368 : * 'seed' can be zero.
369 : */
370 : static inline uint64
371 15266716 : fasthash64(const char *k, size_t len, uint64 seed)
372 : {
373 : fasthash_state hs;
374 :
375 15266716 : fasthash_init(&hs, 0);
376 :
377 : /* re-initialize the seed according to input length */
378 15266716 : hs.hash = seed ^ (len * 0x880355f21e6d1965);
379 :
380 45800148 : while (len >= FH_SIZEOF_ACCUM)
381 : {
382 30533432 : fasthash_accum(&hs, k, FH_SIZEOF_ACCUM);
383 30533432 : k += FH_SIZEOF_ACCUM;
384 30533432 : len -= FH_SIZEOF_ACCUM;
385 : }
386 :
387 15266716 : fasthash_accum(&hs, k, len);
388 :
389 : /*
390 : * Since we already mixed the input length into the seed, we can just pass
391 : * zero here. This matches upstream behavior as well.
392 : */
393 15266716 : return fasthash_final64(&hs, 0);
394 : }
395 :
396 : /* like fasthash64, but returns a 32-bit hashcode */
397 : static inline uint32
398 15266716 : fasthash32(const char *k, size_t len, uint64 seed)
399 : {
400 15266716 : return fasthash_reduce32(fasthash64(k, len, seed));
401 : }
402 :
403 : /*
404 : * Convenience function for hashing NUL-terminated strings
405 : *
406 : * Note: This is faster than, and computes a different result from,
407 : * "fasthash32(s, strlen(s))"
408 : */
409 : static inline uint32
410 761444 : hash_string(const char *s)
411 : {
412 : fasthash_state hs;
413 : size_t s_len;
414 :
415 761444 : fasthash_init(&hs, 0);
416 :
417 : /*
418 : * Combine string into the hash and save the length for tweaking the final
419 : * mix.
420 : */
421 761444 : s_len = fasthash_accum_cstring(&hs, s);
422 :
423 761444 : return fasthash_final32(&hs, s_len);
424 : }
425 :
426 : #endif /* HASHFN_UNSTABLE_H */
|