/*-------------------------------------------------------------------------
 *
 * pg_popcount_avx512.c
 *	  Holds the AVX-512 pg_popcount() implementation.
 *
 * Copyright (c) 2024, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/port/pg_popcount_avx512.c
 *
 *-------------------------------------------------------------------------
 */
#include "c.h"

#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK

#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
#include <cpuid.h>
#endif

#include <immintrin.h>

#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
#include <intrin.h>
#endif

#include "port/pg_bitutils.h"

/*
 * It's unlikely that TRY_POPCNT_FAST won't be set when we are able to use
 * AVX-512 intrinsics, but we check it anyway to be sure.  We piggy-back on
 * the function pointers that are only used when TRY_POPCNT_FAST is set.
 */
#ifdef TRY_POPCNT_FAST
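/*
 * For orientation, the dispatch we piggy-back on looks roughly like the
 * sketch below (an illustrative rendering of the pg_bitutils.c scheme, not
 * a verbatim copy): a function pointer starts out aimed at a "choose"
 * routine that probes the CPU once, repoints the pointer at the best
 * implementation, and forwards the first call.
 *
 *		uint64 (*pg_popcount_optimized) (const char *buf, int bytes) =
 *			pg_popcount_choose;
 *
 *		static uint64
 *		pg_popcount_choose(const char *buf, int bytes)
 *		{
 *			if (pg_popcount_avx512_available())
 *				pg_popcount_optimized = pg_popcount_avx512;
 *			else
 *				pg_popcount_optimized = pg_popcount_fast;
 *			return pg_popcount_optimized(buf, bytes);
 *		}
 */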

/*
 * Does CPUID say there's support for XSAVE instructions?
 */
static inline bool
xsave_available(void)
{
	unsigned int exx[4] = {0, 0, 0, 0};

#if defined(HAVE__GET_CPUID)
	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
#elif defined(HAVE__CPUID)
	__cpuid(exx, 1);
#else
#error cpuid instruction not available
#endif
	return (exx[2] & (1 << 27)) != 0;	/* osxsave */
}
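
/*
 * Note on the bit tested above: CPUID.1:ECX bit 27 (OSXSAVE) is set only
 * when the processor implements XSAVE *and* the operating system has
 * enabled it via CR4.OSXSAVE, which is what makes the XGETBV read below
 * legal to execute.
 */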

/*
 * Does XGETBV say the ZMM registers are enabled?
 *
 * NB: Caller is responsible for verifying that xsave_available() returns
 * true before calling this.
 */
#ifdef HAVE_XSAVE_INTRINSICS
pg_attribute_target("xsave")
#endif
static inline bool
zmm_regs_available(void)
{
#ifdef HAVE_XSAVE_INTRINSICS
	return (_xgetbv(0) & 0xe6) == 0xe6;
#else
	return false;
#endif
}
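
/*
 * The 0xe6 mask above selects the XCR0 state bits AVX-512 depends on:
 * bit 1 (SSE/XMM state), bit 2 (AVX/YMM state), bit 5 (opmask registers
 * k0-k7), bit 6 (upper 256 bits of ZMM0-ZMM15), and bit 7 (ZMM16-ZMM31).
 * All five must be set, or the OS will not save and restore full ZMM state
 * across context switches.
 */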

/*
 * Does CPUID say there's support for AVX-512 popcount and byte-and-word
 * instructions?
 */
static inline bool
avx512_popcnt_available(void)
{
	unsigned int exx[4] = {0, 0, 0, 0};

#if defined(HAVE__GET_CPUID_COUNT)
	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
#elif defined(HAVE__CPUIDEX)
	__cpuidex(exx, 7, 0);
#else
#error cpuid instruction not available
#endif
	return (exx[2] & (1 << 14)) != 0 &&	/* avx512-vpopcntdq */
		(exx[1] & (1 << 30)) != 0;	/* avx512-bw */
}
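
/*
 * These are CPUID leaf 7, subleaf 0 feature flags: ECX bit 14 advertises
 * AVX512VPOPCNTDQ, which provides _mm512_popcnt_epi64(), and EBX bit 30
 * advertises AVX512BW, which the byte-granular masked loads
 * (_mm512_maskz_loadu_epi8) below require.
 */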

/*
 * Returns true if the CPU supports the instructions required for the AVX-512
 * pg_popcount() implementation.
 */
bool
pg_popcount_avx512_available(void)
{
	return xsave_available() &&
		zmm_regs_available() &&
		avx512_popcnt_available();
}
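
/*
 * The pg_attribute_target() annotations below let the compiler emit AVX-512
 * instructions inside just the annotated functions, so this file can be
 * compiled without enabling those ISA extensions globally.  The flip side
 * is that every call must be gated by pg_popcount_avx512_available(), or
 * older CPUs would fault with SIGILL.
 */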

/*
 * pg_popcount_avx512
 *		Returns the number of 1-bits in buf
 */
pg_attribute_target("avx512vpopcntdq,avx512bw")
uint64
pg_popcount_avx512(const char *buf, int bytes)
{
	__m512i		val,
				cnt;
	__m512i		accum = _mm512_setzero_si512();
	const char *final;
	int			tail_idx;
	__mmask64	mask = ~UINT64CONST(0);

	/*
	 * Align buffer down to avoid double load overhead from unaligned access.
	 * Calculate a mask to ignore preceding bytes.  Find start offset of final
	 * iteration and ensure it is not empty.
	 */
	mask <<= ((uintptr_t) buf) % sizeof(__m512i);
	tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
	final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
	buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
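
	/*
	 * Worked example with hypothetical addresses (sizeof(__m512i) == 64):
	 * for buf = 0x1013 and bytes = 70, the lines above clear the low 19
	 * bits of mask, set tail_idx = 25, and align final down to 0x1040 and
	 * buf down to 0x1000.  The first masked load then counts bytes
	 * 0x1013..0x103f (45 bytes) and the final masked load counts bytes
	 * 0x1040..0x1058 (25 bytes), covering exactly the original 70.
	 */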

	/*
	 * Iterate through all but the final iteration.  Starting from the
	 * second iteration, the mask is ignored.
	 */
	if (buf < final)
	{
		val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
		cnt = _mm512_popcnt_epi64(val);
		accum = _mm512_add_epi64(accum, cnt);

		buf += sizeof(__m512i);
		mask = ~UINT64CONST(0);

		for (; buf < final; buf += sizeof(__m512i))
		{
			val = _mm512_load_si512((const __m512i *) buf);
			cnt = _mm512_popcnt_epi64(val);
			accum = _mm512_add_epi64(accum, cnt);
		}
	}

	/*
	 * The final iteration needs to ignore bytes that are not within the
	 * length.  If the buffer fit in a single 64-byte block, the loop above
	 * was skipped entirely and mask still excludes the leading bytes too.
	 */
	mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));

	val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
	cnt = _mm512_popcnt_epi64(val);
	accum = _mm512_add_epi64(accum, cnt);

	return _mm512_reduce_add_epi64(accum);
}
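
/*
 * For reference, pg_popcount_avx512() computes the same result as this
 * portable scalar sketch (illustrative only; the real non-AVX-512 fallbacks
 * live in pg_bitutils.c):
 *
 *		uint64		result = 0;
 *
 *		for (int i = 0; i < bytes; i++)
 *			result += pg_number_of_ones[(unsigned char) buf[i]];
 *		return result;
 */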

/*
 * pg_popcount_masked_avx512
 *		Returns the number of 1-bits in buf after applying the mask to each byte
 */
pg_attribute_target("avx512vpopcntdq,avx512bw")
uint64
pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
{
	__m512i		val,
				vmasked,
				cnt;
	__m512i		accum = _mm512_setzero_si512();
	const char *final;
	int			tail_idx;
	__mmask64	bmask = ~UINT64CONST(0);
	const __m512i maskv = _mm512_set1_epi8(mask);

	/*
	 * Align buffer down to avoid double load overhead from unaligned access.
	 * Calculate a mask to ignore preceding bytes.  Find start offset of final
	 * iteration and ensure it is not empty.
	 */
	bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
	tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
	final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
	buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);

	/*
	 * Iterate through all but the final iteration.  Starting from the
	 * second iteration, the mask is ignored.
	 */
	if (buf < final)
	{
		val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
		vmasked = _mm512_and_si512(val, maskv);
		cnt = _mm512_popcnt_epi64(vmasked);
		accum = _mm512_add_epi64(accum, cnt);

		buf += sizeof(__m512i);
		bmask = ~UINT64CONST(0);

		for (; buf < final; buf += sizeof(__m512i))
		{
			val = _mm512_load_si512((const __m512i *) buf);
			vmasked = _mm512_and_si512(val, maskv);
			cnt = _mm512_popcnt_epi64(vmasked);
			accum = _mm512_add_epi64(accum, cnt);
		}
	}

	/*
	 * The final iteration needs to ignore bytes that are not within the
	 * length.  As above, if the loop was skipped, bmask still excludes the
	 * leading bytes too.
	 */
	bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));

	val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
	vmasked = _mm512_and_si512(val, maskv);
	cnt = _mm512_popcnt_epi64(vmasked);
	accum = _mm512_add_epi64(accum, cnt);

	return _mm512_reduce_add_epi64(accum);
}
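
/*
 * Likewise, the masked variant is equivalent to this scalar sketch
 * (illustrative only), which ANDs each byte with the caller's mask before
 * counting:
 *
 *		uint64		result = 0;
 *
 *		for (int i = 0; i < bytes; i++)
 *			result += pg_number_of_ones[(unsigned char) buf[i] & mask];
 *		return result;
 */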

#endif							/* TRY_POPCNT_FAST */
#endif							/* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */