Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_popcount_x86.c
4 : * Holds the x86-64 pg_popcount() implementations.
5 : *
6 : * Copyright (c) 2024-2026, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/port/pg_popcount_x86.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "c.h"
14 :
15 : #ifdef HAVE_X86_64_POPCNTQ
16 :
17 : #if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
18 : #include <cpuid.h>
19 : #endif
20 :
21 : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
22 : #include <immintrin.h>
23 : #endif
24 :
25 : #if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
26 : #include <intrin.h>
27 : #endif
28 :
29 : #include "port/pg_bitutils.h"
30 :
31 : /*
32 : * The SSE4.2 versions are built regardless of whether we are building the
33 : * AVX-512 versions.
34 : *
35 : * Technically, POPCNT is not part of SSE4.2, and isn't even a vector
36 : * operation, but in practice this is close enough, and "sse42" seems easier to
37 : * follow than "popcnt" for these names.
38 : */
39 : static inline int pg_popcount32_sse42(uint32 word);
40 : static inline int pg_popcount64_sse42(uint64 word);
41 : static uint64 pg_popcount_sse42(const char *buf, int bytes);
42 : static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask);
43 :
44 : /*
45 : * These are the AVX-512 implementations of the popcount functions.
46 : */
47 : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
48 : static uint64 pg_popcount_avx512(const char *buf, int bytes);
49 : static uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
50 : #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
51 :
52 : /*
53 : * The function pointers are initially set to "choose" functions. These
54 : * functions will first set the pointers to the right implementations (base on
55 : * what the current CPU supports) and then will call the pointer to fulfill the
56 : * caller's request.
57 : */
58 : static int pg_popcount32_choose(uint32 word);
59 : static int pg_popcount64_choose(uint64 word);
60 : static uint64 pg_popcount_choose(const char *buf, int bytes);
61 : static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
62 : int (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
63 : int (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
64 : uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
65 : uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
66 :
67 : /*
68 : * Return true if CPUID indicates that the POPCNT instruction is available.
69 : */
70 : static bool
71 12652 : pg_popcount_sse42_available(void)
72 : {
73 12652 : unsigned int exx[4] = {0, 0, 0, 0};
74 :
75 : #if defined(HAVE__GET_CPUID)
76 12652 : __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
77 : #elif defined(HAVE__CPUID)
78 : __cpuid(exx, 1);
79 : #else
80 : #error cpuid instruction not available
81 : #endif
82 :
83 12652 : return (exx[2] & (1 << 23)) != 0; /* POPCNT */
84 : }
85 :
86 : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
87 :
88 : /*
89 : * Does CPUID say there's support for XSAVE instructions?
90 : */
91 : static inline bool
92 12652 : xsave_available(void)
93 : {
94 12652 : unsigned int exx[4] = {0, 0, 0, 0};
95 :
96 : #if defined(HAVE__GET_CPUID)
97 12652 : __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
98 : #elif defined(HAVE__CPUID)
99 : __cpuid(exx, 1);
100 : #else
101 : #error cpuid instruction not available
102 : #endif
103 12652 : return (exx[2] & (1 << 27)) != 0; /* osxsave */
104 : }
105 :
106 : /*
107 : * Does XGETBV say the ZMM registers are enabled?
108 : *
109 : * NB: Caller is responsible for verifying that xsave_available() returns true
110 : * before calling this.
111 : */
112 : #ifdef HAVE_XSAVE_INTRINSICS
113 : pg_attribute_target("xsave")
114 : #endif
115 : static inline bool
116 12652 : zmm_regs_available(void)
117 : {
118 : #ifdef HAVE_XSAVE_INTRINSICS
119 12652 : return (_xgetbv(0) & 0xe6) == 0xe6;
120 : #else
121 : return false;
122 : #endif
123 : }
124 :
125 : /*
126 : * Does CPUID say there's support for AVX-512 popcount and byte-and-word
127 : * instructions?
128 : */
129 : static inline bool
130 12652 : avx512_popcnt_available(void)
131 : {
132 12652 : unsigned int exx[4] = {0, 0, 0, 0};
133 :
134 : #if defined(HAVE__GET_CPUID_COUNT)
135 12652 : __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
136 : #elif defined(HAVE__CPUIDEX)
137 : __cpuidex(exx, 7, 0);
138 : #else
139 : #error cpuid instruction not available
140 : #endif
141 12652 : return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
142 0 : (exx[1] & (1 << 30)) != 0; /* avx512-bw */
143 : }
144 :
145 : /*
146 : * Returns true if the CPU supports the instructions required for the AVX-512
147 : * pg_popcount() implementation.
148 : */
149 : static bool
150 12652 : pg_popcount_avx512_available(void)
151 : {
152 25304 : return xsave_available() &&
153 25304 : zmm_regs_available() &&
154 12652 : avx512_popcnt_available();
155 : }
156 :
157 : #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
158 :
159 : /*
160 : * These functions get called on the first call to pg_popcount32 etc.
161 : * They detect whether we can use the asm implementations, and replace
162 : * the function pointers so that subsequent calls are routed directly to
163 : * the chosen implementation.
164 : */
165 : static inline void
166 12652 : choose_popcount_functions(void)
167 : {
168 12652 : if (pg_popcount_sse42_available())
169 : {
170 12652 : pg_popcount32 = pg_popcount32_sse42;
171 12652 : pg_popcount64 = pg_popcount64_sse42;
172 12652 : pg_popcount_optimized = pg_popcount_sse42;
173 12652 : pg_popcount_masked_optimized = pg_popcount_masked_sse42;
174 : }
175 : else
176 : {
177 0 : pg_popcount32 = pg_popcount32_portable;
178 0 : pg_popcount64 = pg_popcount64_portable;
179 0 : pg_popcount_optimized = pg_popcount_portable;
180 0 : pg_popcount_masked_optimized = pg_popcount_masked_portable;
181 : }
182 :
183 : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
184 12652 : if (pg_popcount_avx512_available())
185 : {
186 0 : pg_popcount_optimized = pg_popcount_avx512;
187 0 : pg_popcount_masked_optimized = pg_popcount_masked_avx512;
188 : }
189 : #endif
190 12652 : }
191 :
192 : static int
193 0 : pg_popcount32_choose(uint32 word)
194 : {
195 0 : choose_popcount_functions();
196 0 : return pg_popcount32(word);
197 : }
198 :
199 : static int
200 10692 : pg_popcount64_choose(uint64 word)
201 : {
202 10692 : choose_popcount_functions();
203 10692 : return pg_popcount64(word);
204 : }
205 :
206 : static uint64
207 4 : pg_popcount_choose(const char *buf, int bytes)
208 : {
209 4 : choose_popcount_functions();
210 4 : return pg_popcount_optimized(buf, bytes);
211 : }
212 :
213 : static uint64
214 1956 : pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
215 : {
216 1956 : choose_popcount_functions();
217 1956 : return pg_popcount_masked(buf, bytes, mask);
218 : }
219 :
220 : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
221 :
222 : /*
223 : * pg_popcount_avx512
224 : * Returns the number of 1-bits in buf
225 : */
226 : pg_attribute_target("avx512vpopcntdq,avx512bw")
227 : static uint64
228 0 : pg_popcount_avx512(const char *buf, int bytes)
229 : {
230 : __m512i val,
231 : cnt;
232 0 : __m512i accum = _mm512_setzero_si512();
233 : const char *final;
234 : int tail_idx;
235 0 : __mmask64 mask = ~UINT64CONST(0);
236 :
237 : /*
238 : * Align buffer down to avoid double load overhead from unaligned access.
239 : * Calculate a mask to ignore preceding bytes. Find start offset of final
240 : * iteration and ensure it is not empty.
241 : */
242 0 : mask <<= ((uintptr_t) buf) % sizeof(__m512i);
243 0 : tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
244 0 : final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
245 0 : buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
246 :
247 : /*
248 : * Iterate through all but the final iteration. Starting from the second
249 : * iteration, the mask is ignored.
250 : */
251 0 : if (buf < final)
252 : {
253 0 : val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
254 0 : cnt = _mm512_popcnt_epi64(val);
255 0 : accum = _mm512_add_epi64(accum, cnt);
256 :
257 0 : buf += sizeof(__m512i);
258 0 : mask = ~UINT64CONST(0);
259 :
260 0 : for (; buf < final; buf += sizeof(__m512i))
261 : {
262 0 : val = _mm512_load_si512((const __m512i *) buf);
263 0 : cnt = _mm512_popcnt_epi64(val);
264 0 : accum = _mm512_add_epi64(accum, cnt);
265 : }
266 : }
267 :
268 : /* Final iteration needs to ignore bytes that are not within the length */
269 0 : mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
270 :
271 0 : val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
272 0 : cnt = _mm512_popcnt_epi64(val);
273 0 : accum = _mm512_add_epi64(accum, cnt);
274 :
275 0 : return _mm512_reduce_add_epi64(accum);
276 : }
277 :
278 : /*
279 : * pg_popcount_masked_avx512
280 : * Returns the number of 1-bits in buf after applying the mask to each byte
281 : */
282 : pg_attribute_target("avx512vpopcntdq,avx512bw")
283 : static uint64
284 0 : pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
285 : {
286 : __m512i val,
287 : vmasked,
288 : cnt;
289 0 : __m512i accum = _mm512_setzero_si512();
290 : const char *final;
291 : int tail_idx;
292 0 : __mmask64 bmask = ~UINT64CONST(0);
293 0 : const __m512i maskv = _mm512_set1_epi8(mask);
294 :
295 : /*
296 : * Align buffer down to avoid double load overhead from unaligned access.
297 : * Calculate a mask to ignore preceding bytes. Find start offset of final
298 : * iteration and ensure it is not empty.
299 : */
300 0 : bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
301 0 : tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
302 0 : final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
303 0 : buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
304 :
305 : /*
306 : * Iterate through all but the final iteration. Starting from the second
307 : * iteration, the mask is ignored.
308 : */
309 0 : if (buf < final)
310 : {
311 0 : val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
312 0 : vmasked = _mm512_and_si512(val, maskv);
313 0 : cnt = _mm512_popcnt_epi64(vmasked);
314 0 : accum = _mm512_add_epi64(accum, cnt);
315 :
316 0 : buf += sizeof(__m512i);
317 0 : bmask = ~UINT64CONST(0);
318 :
319 0 : for (; buf < final; buf += sizeof(__m512i))
320 : {
321 0 : val = _mm512_load_si512((const __m512i *) buf);
322 0 : vmasked = _mm512_and_si512(val, maskv);
323 0 : cnt = _mm512_popcnt_epi64(vmasked);
324 0 : accum = _mm512_add_epi64(accum, cnt);
325 : }
326 : }
327 :
328 : /* Final iteration needs to ignore bytes that are not within the length */
329 0 : bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
330 :
331 0 : val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
332 0 : vmasked = _mm512_and_si512(val, maskv);
333 0 : cnt = _mm512_popcnt_epi64(vmasked);
334 0 : accum = _mm512_add_epi64(accum, cnt);
335 :
336 0 : return _mm512_reduce_add_epi64(accum);
337 : }
338 :
339 : #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
340 :
341 : /*
342 : * pg_popcount32_sse42
343 : * Return the number of 1 bits set in word
344 : */
345 : static inline int
346 0 : pg_popcount32_sse42(uint32 word)
347 : {
348 : #ifdef _MSC_VER
349 : return __popcnt(word);
350 : #else
351 : uint32 res;
352 :
353 0 : __asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
354 0 : return (int) res;
355 : #endif
356 : }
357 :
358 : /*
359 : * pg_popcount64_sse42
360 : * Return the number of 1 bits set in word
361 : */
362 : static inline int
363 176970324 : pg_popcount64_sse42(uint64 word)
364 : {
365 : #ifdef _MSC_VER
366 : return __popcnt64(word);
367 : #else
368 : uint64 res;
369 :
370 176970324 : __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
371 176970324 : return (int) res;
372 : #endif
373 : }
374 :
375 : /*
376 : * pg_popcount_sse42
377 : * Returns the number of 1-bits in buf
378 : */
379 : static uint64
380 28 : pg_popcount_sse42(const char *buf, int bytes)
381 : {
382 28 : uint64 popcnt = 0;
383 :
384 : #if SIZEOF_VOID_P >= 8
385 : /* Process in 64-bit chunks if the buffer is aligned. */
386 28 : if (buf == (const char *) TYPEALIGN(8, buf))
387 : {
388 28 : const uint64 *words = (const uint64 *) buf;
389 :
390 524508 : while (bytes >= 8)
391 : {
392 524480 : popcnt += pg_popcount64_sse42(*words++);
393 524480 : bytes -= 8;
394 : }
395 :
396 28 : buf = (const char *) words;
397 : }
398 : #else
399 : /* Process in 32-bit chunks if the buffer is aligned. */
400 : if (buf == (const char *) TYPEALIGN(4, buf))
401 : {
402 : const uint32 *words = (const uint32 *) buf;
403 :
404 : while (bytes >= 4)
405 : {
406 : popcnt += pg_popcount32_sse42(*words++);
407 : bytes -= 4;
408 : }
409 :
410 : buf = (const char *) words;
411 : }
412 : #endif
413 :
414 : /* Process any remaining bytes */
415 148 : while (bytes--)
416 120 : popcnt += pg_number_of_ones[(unsigned char) *buf++];
417 :
418 28 : return popcnt;
419 : }
420 :
421 : /*
422 : * pg_popcount_masked_sse42
423 : * Returns the number of 1-bits in buf after applying the mask to each byte
424 : */
425 : static uint64
426 171068 : pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask)
427 : {
428 171068 : uint64 popcnt = 0;
429 :
430 : #if SIZEOF_VOID_P >= 8
431 : /* Process in 64-bit chunks if the buffer is aligned */
432 171068 : uint64 maskv = ~UINT64CONST(0) / 0xFF * mask;
433 :
434 171068 : if (buf == (const char *) TYPEALIGN(8, buf))
435 : {
436 171068 : const uint64 *words = (const uint64 *) buf;
437 :
438 174831496 : while (bytes >= 8)
439 : {
440 174660428 : popcnt += pg_popcount64_sse42(*words++ & maskv);
441 174660428 : bytes -= 8;
442 : }
443 :
444 171068 : buf = (const char *) words;
445 : }
446 : #else
447 : /* Process in 32-bit chunks if the buffer is aligned. */
448 : uint32 maskv = ~((uint32) 0) / 0xFF * mask;
449 :
450 : if (buf == (const char *) TYPEALIGN(4, buf))
451 : {
452 : const uint32 *words = (const uint32 *) buf;
453 :
454 : while (bytes >= 4)
455 : {
456 : popcnt += pg_popcount32_sse42(*words++ & maskv);
457 : bytes -= 4;
458 : }
459 :
460 : buf = (const char *) words;
461 : }
462 : #endif
463 :
464 : /* Process any remaining bytes */
465 171068 : while (bytes--)
466 0 : popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
467 :
468 171068 : return popcnt;
469 : }
470 :
471 : #endif /* HAVE_X86_64_POPCNTQ */
|