Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_crc32c_sse42.c
4 : * Compute CRC-32C checksum using Intel SSE 4.2 or AVX-512 instructions.
5 : *
6 : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/port/pg_crc32c_sse42.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "c.h"
16 :
17 : #include <nmmintrin.h>
18 : #ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
19 : #include <immintrin.h>
20 : #endif
21 :
22 : #include "port/pg_cpu.h"
23 : #include "port/pg_crc32c.h"
24 :
25 : static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len);
26 :
27 : pg_attribute_no_sanitize_alignment()
28 : pg_attribute_target("sse4.2")
29 : pg_crc32c
30 85436494 : pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
31 : {
32 85436494 : const unsigned char *p = data;
33 85436494 : const unsigned char *pend = p + len;
34 :
35 : /*
36 : * Process eight bytes of data at a time.
37 : *
38 : * NB: We do unaligned accesses here. The Intel architecture allows that,
39 : * and performance testing didn't show any performance gain from aligning
40 : * the begin address.
41 : */
42 : #ifdef __x86_64__
43 4065241101 : while (p + 8 <= pend)
44 : {
45 3979804607 : crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
46 3979804607 : p += 8;
47 : }
48 :
49 : /* Process remaining full four bytes if any */
50 85436494 : if (p + 4 <= pend)
51 : {
52 50919080 : crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
53 50919080 : p += 4;
54 : }
55 : #else
56 :
57 : /*
58 : * Process four bytes at a time. (The eight byte instruction is not
59 : * available on the 32-bit x86 architecture).
60 : */
61 : while (p + 4 <= pend)
62 : {
63 : crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
64 : p += 4;
65 : }
66 : #endif /* __x86_64__ */
67 :
68 : /* Process any remaining bytes one at a time. */
69 173651946 : while (p < pend)
70 : {
71 88215452 : crc = _mm_crc32_u8(crc, *p);
72 88215452 : p++;
73 : }
74 :
75 85436494 : return crc;
76 : }
77 :
78 : #ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
79 :
80 : /*
81 : * Note: There is no copyright notice in the following generated code.
82 : *
83 : * We have modified the output to
84 : * - match our function declaration
85 : * - match whitespace to our project style
86 : * - add a threshold for the alignment stanza
87 : */
88 :
89 : /* Generated by https://github.com/corsix/fast-crc32/ using: */
90 : /* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */
91 : /* MIT licensed */
92 :
93 : #define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0))
94 : #define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17))
95 :
96 : pg_attribute_target("vpclmulqdq,avx512vl")
97 : pg_crc32c
98 0 : pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len)
99 : {
100 : /* adjust names to match generated code */
101 0 : pg_crc32c crc0 = crc;
102 0 : const char *buf = data;
103 :
104 : /* Align on cacheline boundary. The threshold is somewhat arbitrary. */
105 0 : if (unlikely(len > 256))
106 : {
107 0 : for (; len && ((uintptr_t) buf & 7); --len)
108 0 : crc0 = _mm_crc32_u8(crc0, *buf++);
109 0 : while (((uintptr_t) buf & 56) && len >= 8)
110 : {
111 0 : crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
112 0 : buf += 8;
113 0 : len -= 8;
114 : }
115 : }
116 :
117 0 : if (len >= 64)
118 : {
119 0 : const char *end = buf + len;
120 0 : const char *limit = buf + len - 64;
121 : __m128i z0;
122 :
123 : /* First vector chunk. */
124 0 : __m512i x0 = _mm512_loadu_si512((const void *) buf),
125 : y0;
126 : __m512i k;
127 :
128 0 : k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0));
129 0 : x0 = _mm512_xor_si512(_mm512_zextsi128_si512(_mm_cvtsi32_si128(crc0)), x0);
130 0 : buf += 64;
131 :
132 : /* Main loop. */
133 0 : while (buf <= limit)
134 : {
135 0 : y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
136 0 : x0 = _mm512_ternarylogic_epi64(x0, y0,
137 : _mm512_loadu_si512((const void *) buf),
138 : 0x96);
139 0 : buf += 64;
140 : }
141 :
142 : /* Reduce 512 bits to 128 bits. */
143 0 : k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0,
144 : 0x3da6d0cb, 0, 0xba4fc28e, 0,
145 : 0xf20c0dfe, 0, 0x493c7d27, 0,
146 : 0, 0, 0, 0);
147 0 : y0 = clmul_lo(x0, k), k = clmul_hi(x0, k);
148 0 : y0 = _mm512_xor_si512(y0, k);
149 0 : z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0),
150 : _mm512_extracti32x4_epi32(y0, 1),
151 : _mm512_extracti32x4_epi32(y0, 2),
152 : 0x96);
153 0 : z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3));
154 :
155 : /* Reduce 128 bits to 32 bits, and multiply by x^32. */
156 0 : crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0));
157 0 : crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1));
158 0 : len = end - buf;
159 : }
160 :
161 0 : return pg_comp_crc32c_sse42(crc0, buf, len);
162 : }
163 :
164 : #endif /* USE_AVX512_CRC32C_WITH_RUNTIME_CHECK */
165 :
166 : /*
167 : * This gets called on the first call. It replaces the function pointer
168 : * so that subsequent calls are routed directly to the chosen implementation.
169 : */
170 : static pg_crc32c
171 1501 : pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
172 : {
173 : /*
174 : * Set fallback. We must guard since slicing-by-8 is not visible
175 : * everywhere.
176 : */
177 : #ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
178 1501 : pg_comp_crc32c = pg_comp_crc32c_sb8;
179 : #endif
180 :
181 1501 : if (x86_feature_available(PG_SSE4_2))
182 1501 : pg_comp_crc32c = pg_comp_crc32c_sse42;
183 :
184 : #ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
185 3002 : if (x86_feature_available(PG_AVX512_VL) &&
186 1501 : x86_feature_available(PG_AVX512_VPCLMULQDQ))
187 0 : pg_comp_crc32c = pg_comp_crc32c_avx512;
188 : #endif
189 :
190 1501 : return pg_comp_crc32c(crc, data, len);
191 : }
192 :
193 : pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
|