Line data Source code
1 : /*
2 : * hashfn_unstable.h
3 : *
4 : * Building blocks for creating fast inlineable hash functions. The
5 : * functions in this file are not guaranteed to be stable between versions,
6 : * and may differ by hardware platform. Hence they must not be used in
7 : * indexes or other on-disk structures. See hashfn.h if you need stability.
8 : *
9 : *
10 : * Portions Copyright (c) 2024-2025, PostgreSQL Global Development Group
11 : *
12 : * src/include/common/hashfn_unstable.h
13 : */
14 : #ifndef HASHFN_UNSTABLE_H
15 : #define HASHFN_UNSTABLE_H
16 :
17 : #include "port/pg_bitutils.h"
18 : #include "port/pg_bswap.h"
19 :
20 : /*
21 : * fasthash is a modification of code taken from
22 : * https://code.google.com/archive/p/fast-hash/source/default/source
23 : * under the terms of the MIT license. The original copyright
24 : * notice follows:
25 : */
26 :
27 : /* The MIT License
28 :
29 : Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com)
30 :
31 : Permission is hereby granted, free of charge, to any person
32 : obtaining a copy of this software and associated documentation
33 : files (the "Software"), to deal in the Software without
34 : restriction, including without limitation the rights to use, copy,
35 : modify, merge, publish, distribute, sublicense, and/or sell copies
36 : of the Software, and to permit persons to whom the Software is
37 : furnished to do so, subject to the following conditions:
38 :
39 : The above copyright notice and this permission notice shall be
40 : included in all copies or substantial portions of the Software.
41 :
42 : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
43 : EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
44 : MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
45 : NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
46 : BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
47 : ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
48 : CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
49 : SOFTWARE.
50 : */
51 :
52 : /*
53 : * fasthash as implemented here has two interfaces:
54 : *
55 : * 1) Standalone functions, e.g. fasthash32() for a single value with a
56 : * known length. These return the same hash code as the original, at
57 : * least on little-endian machines.
58 : *
 59             :  * 2) Incremental interface. This can be used for incorporating multiple
60 : * inputs. First, initialize the hash state (here with a zero seed):
61 : *
62 : * fasthash_state hs;
63 : * fasthash_init(&hs, 0);
64 : *
65 : * If the inputs are of types that can be trivially cast to uint64, it's
66 : * sufficient to do:
67 : *
68 : * hs.accum = value1;
69 : * fasthash_combine(&hs);
70 : * hs.accum = value2;
71 : * fasthash_combine(&hs);
72 : * ...
73 : *
74 : * For longer or variable-length input, fasthash_accum() is a more
75 : * flexible, but more verbose method. The standalone functions use this
76 : * internally, so see fasthash64() for an example of this.
77 : *
78 : * After all inputs have been mixed in, finalize the hash:
79 : *
80 : * hashcode = fasthash_final32(&hs, 0);
81 : *
82 : * The incremental interface allows an optimization for NUL-terminated
83 : * C strings:
84 : *
85 : * len = fasthash_accum_cstring(&hs, str);
86 : * hashcode = fasthash_final32(&hs, len);
87 : *
88 : * By handling the terminator on-the-fly, we can avoid needing a strlen()
89 : * call to tell us how many bytes to hash. Experimentation has found that
90 : * SMHasher fails unless we incorporate the length, so it is passed to
91 : * the finalizer as a tweak.
92 : */
93 :
94 :
typedef struct fasthash_state
{
	/* staging area for chunks of input; filled by callers or fasthash_accum() */
	uint64		accum;

	/* running hash value, updated as each chunk is combined in */
	uint64		hash;
} fasthash_state;

/* size in bytes of the 'accum' staging area above */
#define FH_SIZEOF_ACCUM sizeof(uint64)
104 :
105 :
106 : /*
107 : * Initialize the hash state.
108 : *
109 : * 'seed' can be zero.
110 : */
111 : static inline void
112 11600180 : fasthash_init(fasthash_state *hs, uint64 seed)
113 : {
114 11600180 : memset(hs, 0, sizeof(fasthash_state));
115 11600180 : hs->hash = seed ^ 0x880355f21e6d1965;
116 11600180 : }
117 :
118 : /* both the finalizer and part of the combining step */
119 : static inline uint64
120 35051358 : fasthash_mix(uint64 h, uint64 tweak)
121 : {
122 35051358 : h ^= (h >> 23) + tweak;
123 35051358 : h *= 0x2127599bf4325c37;
124 35051358 : h ^= h >> 47;
125 35051358 : return h;
126 : }
127 :
128 : /* combine one chunk of input into the hash */
129 : static inline void
130 23451178 : fasthash_combine(fasthash_state *hs)
131 : {
132 23451178 : hs->hash ^= fasthash_mix(hs->accum, 0);
133 23451178 : hs->hash *= 0x880355f21e6d1965;
134 23451178 : }
135 :
136 : /* accumulate up to 8 bytes of input and combine it into the hash */
137 : static inline void
138 33013412 : fasthash_accum(fasthash_state *hs, const char *k, size_t len)
139 : {
140 : uint32 lower_four;
141 :
142 : Assert(len <= FH_SIZEOF_ACCUM);
143 33013412 : hs->accum = 0;
144 :
145 : /*
146 : * For consistency, bytewise loads must match the platform's endianness.
147 : */
148 : #ifdef WORDS_BIGENDIAN
149 : switch (len)
150 : {
151 : case 8:
152 : memcpy(&hs->accum, k, 8);
153 : break;
154 : case 7:
155 : hs->accum |= (uint64) k[6] << 8;
156 : /* FALLTHROUGH */
157 : case 6:
158 : hs->accum |= (uint64) k[5] << 16;
159 : /* FALLTHROUGH */
160 : case 5:
161 : hs->accum |= (uint64) k[4] << 24;
162 : /* FALLTHROUGH */
163 : case 4:
164 : memcpy(&lower_four, k, sizeof(lower_four));
165 : hs->accum |= (uint64) lower_four << 32;
166 : break;
167 : case 3:
168 : hs->accum |= (uint64) k[2] << 40;
169 : /* FALLTHROUGH */
170 : case 2:
171 : hs->accum |= (uint64) k[1] << 48;
172 : /* FALLTHROUGH */
173 : case 1:
174 : hs->accum |= (uint64) k[0] << 56;
175 : break;
176 : case 0:
177 : return;
178 : }
179 : #else
180 33013412 : switch (len)
181 : {
182 21413232 : case 8:
183 21413232 : memcpy(&hs->accum, k, 8);
184 21413232 : break;
185 192494 : case 7:
186 192494 : hs->accum |= (uint64) k[6] << 48;
187 : /* FALLTHROUGH */
188 247796 : case 6:
189 247796 : hs->accum |= (uint64) k[5] << 40;
190 : /* FALLTHROUGH */
191 250018 : case 5:
192 250018 : hs->accum |= (uint64) k[4] << 32;
193 : /* FALLTHROUGH */
194 433584 : case 4:
195 433584 : memcpy(&lower_four, k, sizeof(lower_four));
196 433584 : hs->accum |= lower_four;
197 433584 : break;
198 401942 : case 3:
199 401942 : hs->accum |= (uint64) k[2] << 16;
200 : /* FALLTHROUGH */
201 429250 : case 2:
202 429250 : hs->accum |= (uint64) k[1] << 8;
203 : /* FALLTHROUGH */
204 433614 : case 1:
205 433614 : hs->accum |= (uint64) k[0];
206 433614 : break;
207 10732982 : case 0:
208 10732982 : return;
209 : }
210 : #endif
211 :
212 22280430 : fasthash_combine(hs);
213 : }
214 :
/*
 * Set high bit in lowest byte where the input is zero, from:
 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 *
 * NB: function-like macro -- 'v' is evaluated more than once, so callers
 * must pass only side-effect-free expressions.
 */
#define haszero64(v) \
	(((v) - 0x0101010101010101) & ~(v) & 0x8080808080808080)
221 :
222 : /*
223 : * all-purpose workhorse for fasthash_accum_cstring
224 : */
225 : static inline size_t
226 0 : fasthash_accum_cstring_unaligned(fasthash_state *hs, const char *str)
227 : {
228 0 : const char *const start = str;
229 :
230 0 : while (*str)
231 : {
232 0 : size_t chunk_len = 0;
233 :
234 0 : while (chunk_len < FH_SIZEOF_ACCUM && str[chunk_len] != '\0')
235 0 : chunk_len++;
236 :
237 0 : fasthash_accum(hs, str, chunk_len);
238 0 : str += chunk_len;
239 : }
240 :
241 0 : return str - start;
242 : }
243 :
/*
 * specialized workhorse for fasthash_accum_cstring
 *
 * With an aligned pointer, we consume the string a word at a time.
 * Loading the word containing the NUL terminator cannot segfault since
 * allocation boundaries are suitably aligned. To keep from setting
 * off alarms with address sanitizers, exclude this function from
 * such testing.
 *
 * Returns the string length, not counting the terminator.
 *
 * NOTE(review): the uint64 load below type-puns through a pointer cast;
 * this presumes the build permits it (e.g. -fno-strict-aliasing) -- confirm
 * against the project's compiler flags.
 */
pg_attribute_no_sanitize_address()
static inline size_t
fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
{
	const char *const start = str;
	size_t		remainder;
	uint64		zero_byte_low;

	Assert(PointerIsAligned(start, uint64));

	/*
	 * For every chunk of input, check for zero bytes before mixing into the
	 * hash. The chunk with zeros must contain the NUL terminator. We arrange
	 * so that zero_byte_low tells us not only that a zero exists, but also
	 * where it is, so we can hash the remainder of the string.
	 *
	 * The haszero64 calculation will set bits corresponding to the lowest
	 * byte where a zero exists, so that suffices for little-endian machines.
	 * For big-endian machines, we would need bits set for the highest zero
	 * byte in the chunk, since the trailing junk past the terminator could
	 * contain additional zeros. haszero64 does not give us that, so we
	 * byteswap the chunk first.
	 */
	for (;;)
	{
		uint64		chunk = *(uint64 *) str;

#ifdef WORDS_BIGENDIAN
		zero_byte_low = haszero64(pg_bswap64(chunk));
#else
		zero_byte_low = haszero64(chunk);
#endif
		if (zero_byte_low)
			break;

		/* no NUL in this word: mix all 8 bytes and advance */
		hs->accum = chunk;
		fasthash_combine(hs);
		str += FH_SIZEOF_ACCUM;
	}

	/*
	 * The byte corresponding to the NUL will be 0x80, so the rightmost bit
	 * position will be in the range 7, 15, ..., 63. Turn this into byte
	 * position by dividing by 8.
	 */
	remainder = pg_rightmost_one_pos64(zero_byte_low) / BITS_PER_BYTE;
	fasthash_accum(hs, str, remainder);
	str += remainder;

	return str - start;
}
304 :
/*
 * Mix 'str' into the hash state and return the length of the string
 * (not counting the NUL terminator).
 */
static inline size_t
fasthash_accum_cstring(fasthash_state *hs, const char *str)
{
#if SIZEOF_VOID_P >= 8

	size_t		len;
#ifdef USE_ASSERT_CHECKING
	size_t		len_check;
	fasthash_state hs_check;

	/*
	 * In assert-enabled builds, run the unaligned (bytewise) version on a
	 * copy of the hash state so we can verify below that the word-at-a-time
	 * path computes the identical length and hash.
	 */
	memcpy(&hs_check, hs, sizeof(fasthash_state));
	len_check = fasthash_accum_cstring_unaligned(&hs_check, str);
#endif
	if (PointerIsAligned(str, uint64))
	{
		len = fasthash_accum_cstring_aligned(hs, str);
		Assert(len_check == len);
		Assert(hs_check.hash == hs->hash);
		return len;
	}
#endif							/* SIZEOF_VOID_P */

	/*
	 * It's not worth it to try to make the word-at-a-time optimization work
	 * on 32-bit platforms.
	 */
	return fasthash_accum_cstring_unaligned(hs, str);
}
336 :
337 : /*
338 : * The finalizer
339 : *
340 : * 'tweak' is intended to be the input length when the caller doesn't know
341 : * the length ahead of time, such as for NUL-terminated strings, otherwise
342 : * zero.
343 : */
344 : static inline uint64
345 11600180 : fasthash_final64(fasthash_state *hs, uint64 tweak)
346 : {
347 11600180 : return fasthash_mix(hs->hash, tweak);
348 : }
349 :
350 : /*
351 : * Reduce a 64-bit hash to a 32-bit hash.
352 : *
353 : * This optional step provides a bit more additional mixing compared to
354 : * just taking the lower 32-bits.
355 : */
356 : static inline uint32
357 11600180 : fasthash_reduce32(uint64 h)
358 : {
359 : /*
360 : * Convert the 64-bit hashcode to Fermat residue, which shall retain
361 : * information from both the higher and lower parts of hashcode.
362 : */
363 11600180 : return h - (h >> 32);
364 : }
365 :
366 : /* finalize and reduce */
367 : static inline uint32
368 893564 : fasthash_final32(fasthash_state *hs, uint64 tweak)
369 : {
370 893564 : return fasthash_reduce32(fasthash_final64(hs, tweak));
371 : }
372 :
373 : /*
374 : * The original fasthash64 function, re-implemented using the incremental
375 : * interface. Returns a 64-bit hashcode. 'len' controls not only how
376 : * many bytes to hash, but also modifies the internal seed.
377 : * 'seed' can be zero.
378 : */
379 : static inline uint64
380 10706616 : fasthash64(const char *k, size_t len, uint64 seed)
381 : {
382 : fasthash_state hs;
383 :
384 10706616 : fasthash_init(&hs, 0);
385 :
386 : /* re-initialize the seed according to input length */
387 10706616 : hs.hash = seed ^ (len * 0x880355f21e6d1965);
388 :
389 32119848 : while (len >= FH_SIZEOF_ACCUM)
390 : {
391 21413232 : fasthash_accum(&hs, k, FH_SIZEOF_ACCUM);
392 21413232 : k += FH_SIZEOF_ACCUM;
393 21413232 : len -= FH_SIZEOF_ACCUM;
394 : }
395 :
396 10706616 : fasthash_accum(&hs, k, len);
397 10706616 : return fasthash_final64(&hs, 0);
398 : }
399 :
400 : /* like fasthash64, but returns a 32-bit hashcode */
401 : static inline uint32
402 10706616 : fasthash32(const char *k, size_t len, uint64 seed)
403 : {
404 10706616 : return fasthash_reduce32(fasthash64(k, len, seed));
405 : }
406 :
407 : /*
408 : * Convenience function for hashing NUL-terminated strings
409 : */
410 : static inline uint32
411 730342 : hash_string(const char *s)
412 : {
413 : fasthash_state hs;
414 : size_t s_len;
415 :
416 730342 : fasthash_init(&hs, 0);
417 :
418 : /*
419 : * Combine string into the hash and save the length for tweaking the final
420 : * mix.
421 : */
422 730342 : s_len = fasthash_accum_cstring(&hs, s);
423 :
424 730342 : return fasthash_final32(&hs, s_len);
425 : }
426 :
427 : #endif /* HASHFN_UNSTABLE_H */
|