Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_popcount_x86.c
4 : * Holds the x86-64 pg_popcount() implementations.
5 : *
6 : * Copyright (c) 2024-2026, PostgreSQL Global Development Group
7 : *
8 : * IDENTIFICATION
9 : * src/port/pg_popcount_x86.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "c.h"
14 :
15 : #ifdef HAVE_X86_64_POPCNTQ
16 :
17 : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
18 : #include <immintrin.h>
19 : #endif
20 :
21 : #include "port/pg_bitutils.h"
22 : #include "port/pg_cpu.h"
23 :
24 : /*
25 : * The SSE4.2 versions are built regardless of whether we are building the
26 : * AVX-512 versions.
27 : *
28 : * Technically, POPCNT is not part of SSE4.2, and isn't even a vector
29 : * operation, but in practice this is close enough, and "sse42" seems easier to
30 : * follow than "popcnt" for these names.
31 : */
32 : static uint64 pg_popcount_sse42(const char *buf, int bytes);
33 : static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask);
34 :
35 : /*
36 : * These are the AVX-512 implementations of the popcount functions.
37 : */
38 : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
39 : static uint64 pg_popcount_avx512(const char *buf, int bytes);
40 : static uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
41 : #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
42 :
43 : /*
44 : * The function pointers are initially set to "choose" functions. These
45 : * functions will first set the pointers to the right implementations (base on
46 : * what the current CPU supports) and then will call the pointer to fulfill the
47 : * caller's request.
48 : */
49 : static uint64 pg_popcount_choose(const char *buf, int bytes);
50 : static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
51 : uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
52 : uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
53 :
54 :
55 : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
56 :
57 : /*
58 : * Returns true if the CPU supports the instructions required for the AVX-512
59 : * pg_popcount() implementation.
60 : */
61 : static bool
62 1301 : pg_popcount_avx512_available(void)
63 : {
64 2602 : return x86_feature_available(PG_AVX512_BW) &&
65 1301 : x86_feature_available(PG_AVX512_VPOPCNTDQ);
66 : }
67 :
68 : #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
69 :
70 : /*
71 : * These functions get called on the first call to pg_popcount(), etc.
72 : * They detect whether we can use the asm implementations, and replace
73 : * the function pointers so that subsequent calls are routed directly to
74 : * the chosen implementation.
75 : */
76 : static inline void
77 1301 : choose_popcount_functions(void)
78 : {
79 1301 : if (x86_feature_available(PG_POPCNT))
80 : {
81 1301 : pg_popcount_optimized = pg_popcount_sse42;
82 1301 : pg_popcount_masked_optimized = pg_popcount_masked_sse42;
83 : }
84 : else
85 : {
86 0 : pg_popcount_optimized = pg_popcount_portable;
87 0 : pg_popcount_masked_optimized = pg_popcount_masked_portable;
88 : }
89 :
90 : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
91 1301 : if (pg_popcount_avx512_available())
92 : {
93 0 : pg_popcount_optimized = pg_popcount_avx512;
94 0 : pg_popcount_masked_optimized = pg_popcount_masked_avx512;
95 : }
96 : #endif
97 1301 : }
98 :
99 : static uint64
100 6 : pg_popcount_choose(const char *buf, int bytes)
101 : {
102 6 : choose_popcount_functions();
103 6 : return pg_popcount_optimized(buf, bytes);
104 : }
105 :
106 : static uint64
107 1295 : pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
108 : {
109 1295 : choose_popcount_functions();
110 1295 : return pg_popcount_masked(buf, bytes, mask);
111 : }
112 :
113 : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
114 :
115 : /*
116 : * pg_popcount_avx512
117 : * Returns the number of 1-bits in buf
118 : */
119 : pg_attribute_target("avx512vpopcntdq,avx512bw")
120 : static uint64
121 0 : pg_popcount_avx512(const char *buf, int bytes)
122 : {
123 : __m512i val,
124 : cnt;
125 0 : __m512i accum = _mm512_setzero_si512();
126 : const char *final;
127 : int tail_idx;
128 0 : __mmask64 mask = ~UINT64CONST(0);
129 :
130 : /*
131 : * Align buffer down to avoid double load overhead from unaligned access.
132 : * Calculate a mask to ignore preceding bytes. Find start offset of final
133 : * iteration and ensure it is not empty.
134 : */
135 0 : mask <<= ((uintptr_t) buf) % sizeof(__m512i);
136 0 : tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
137 0 : final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
138 0 : buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
139 :
140 : /*
141 : * Iterate through all but the final iteration. Starting from the second
142 : * iteration, the mask is ignored.
143 : */
144 0 : if (buf < final)
145 : {
146 0 : val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
147 0 : cnt = _mm512_popcnt_epi64(val);
148 0 : accum = _mm512_add_epi64(accum, cnt);
149 :
150 0 : buf += sizeof(__m512i);
151 0 : mask = ~UINT64CONST(0);
152 :
153 0 : for (; buf < final; buf += sizeof(__m512i))
154 : {
155 0 : val = _mm512_load_si512((const __m512i *) buf);
156 0 : cnt = _mm512_popcnt_epi64(val);
157 0 : accum = _mm512_add_epi64(accum, cnt);
158 : }
159 : }
160 :
161 : /* Final iteration needs to ignore bytes that are not within the length */
162 0 : mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
163 :
164 0 : val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
165 0 : cnt = _mm512_popcnt_epi64(val);
166 0 : accum = _mm512_add_epi64(accum, cnt);
167 :
168 0 : return _mm512_reduce_add_epi64(accum);
169 : }
170 :
171 : /*
172 : * pg_popcount_masked_avx512
173 : * Returns the number of 1-bits in buf after applying the mask to each byte
174 : */
175 : pg_attribute_target("avx512vpopcntdq,avx512bw")
176 : static uint64
177 0 : pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
178 : {
179 : __m512i val,
180 : vmasked,
181 : cnt;
182 0 : __m512i accum = _mm512_setzero_si512();
183 : const char *final;
184 : int tail_idx;
185 0 : __mmask64 bmask = ~UINT64CONST(0);
186 0 : const __m512i maskv = _mm512_set1_epi8(mask);
187 :
188 : /*
189 : * Align buffer down to avoid double load overhead from unaligned access.
190 : * Calculate a mask to ignore preceding bytes. Find start offset of final
191 : * iteration and ensure it is not empty.
192 : */
193 0 : bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
194 0 : tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
195 0 : final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
196 0 : buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
197 :
198 : /*
199 : * Iterate through all but the final iteration. Starting from the second
200 : * iteration, the mask is ignored.
201 : */
202 0 : if (buf < final)
203 : {
204 0 : val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
205 0 : vmasked = _mm512_and_si512(val, maskv);
206 0 : cnt = _mm512_popcnt_epi64(vmasked);
207 0 : accum = _mm512_add_epi64(accum, cnt);
208 :
209 0 : buf += sizeof(__m512i);
210 0 : bmask = ~UINT64CONST(0);
211 :
212 0 : for (; buf < final; buf += sizeof(__m512i))
213 : {
214 0 : val = _mm512_load_si512((const __m512i *) buf);
215 0 : vmasked = _mm512_and_si512(val, maskv);
216 0 : cnt = _mm512_popcnt_epi64(vmasked);
217 0 : accum = _mm512_add_epi64(accum, cnt);
218 : }
219 : }
220 :
221 : /* Final iteration needs to ignore bytes that are not within the length */
222 0 : bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
223 :
224 0 : val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
225 0 : vmasked = _mm512_and_si512(val, maskv);
226 0 : cnt = _mm512_popcnt_epi64(vmasked);
227 0 : accum = _mm512_add_epi64(accum, cnt);
228 :
229 0 : return _mm512_reduce_add_epi64(accum);
230 : }
231 :
232 : #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
233 :
234 : /*
235 : * pg_popcount64_sse42
236 : * Return the number of 1 bits set in word
237 : */
238 : static inline int
239 94735379 : pg_popcount64_sse42(uint64 word)
240 : {
241 : #ifdef _MSC_VER
242 : return __popcnt64(word);
243 : #else
244 : uint64 res;
245 :
246 94735379 : __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
247 94735379 : return (int) res;
248 : #endif
249 : }
250 :
251 : /*
252 : * pg_popcount_sse42
253 : * Returns the number of 1-bits in buf
254 : */
255 : pg_attribute_no_sanitize_alignment()
256 : static uint64
257 18 : pg_popcount_sse42(const char *buf, int bytes)
258 : {
259 18 : uint64 popcnt = 0;
260 18 : const uint64 *words = (const uint64 *) buf;
261 :
262 262267 : while (bytes >= 8)
263 : {
264 262249 : popcnt += pg_popcount64_sse42(*words++);
265 262249 : bytes -= 8;
266 : }
267 :
268 18 : buf = (const char *) words;
269 :
270 : /* Process any remaining bytes */
271 78 : while (bytes--)
272 60 : popcnt += pg_number_of_ones[(unsigned char) *buf++];
273 :
274 18 : return popcnt;
275 : }
276 :
277 : /*
278 : * pg_popcount_masked_sse42
279 : * Returns the number of 1-bits in buf after applying the mask to each byte
280 : */
281 : pg_attribute_no_sanitize_alignment()
282 : static uint64
283 92530 : pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask)
284 : {
285 92530 : uint64 popcnt = 0;
286 92530 : uint64 maskv = ~UINT64CONST(0) / 0xFF * mask;
287 92530 : const uint64 *words = (const uint64 *) buf;
288 :
289 94565660 : while (bytes >= 8)
290 : {
291 94473130 : popcnt += pg_popcount64_sse42(*words++ & maskv);
292 94473130 : bytes -= 8;
293 : }
294 :
295 92530 : buf = (const char *) words;
296 :
297 : /* Process any remaining bytes */
298 92530 : while (bytes--)
299 0 : popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
300 :
301 92530 : return popcnt;
302 : }
303 :
304 : #endif /* HAVE_X86_64_POPCNTQ */
|