Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_crc32c_sse42.c
4 : * Compute CRC-32C checksum using Intel SSE 4.2 or AVX-512 instructions.
5 : *
6 : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/port/pg_crc32c_sse42.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "c.h"
16 :
17 : #include <nmmintrin.h>
18 : #ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
19 : #include <immintrin.h>
20 : #endif
21 :
22 : #include "port/pg_crc32c.h"
23 :
24 : pg_attribute_no_sanitize_alignment()
25 : pg_attribute_target("sse4.2")
26 : pg_crc32c
27 162572306 : pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
28 : {
29 162572306 : const unsigned char *p = data;
30 162572306 : const unsigned char *pend = p + len;
31 :
32 : /*
33 : * Process eight bytes of data at a time.
34 : *
35 : * NB: We do unaligned accesses here. The Intel architecture allows that,
36 : * and performance testing didn't show any performance gain from aligning
37 : * the begin address.
38 : */
39 : #ifdef __x86_64__
40 7830540952 : while (p + 8 <= pend)
41 : {
42 7667968646 : crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
43 7667968646 : p += 8;
44 : }
45 :
46 : /* Process remaining full four bytes if any */
47 162572306 : if (p + 4 <= pend)
48 : {
49 96960440 : crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
50 96960440 : p += 4;
51 : }
52 : #else
53 :
54 : /*
55 : * Process four bytes at a time. (The eight byte instruction is not
56 : * available on the 32-bit x86 architecture).
57 : */
58 : while (p + 4 <= pend)
59 : {
60 : crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
61 : p += 4;
62 : }
63 : #endif /* __x86_64__ */
64 :
65 : /* Process any remaining bytes one at a time. */
66 330000018 : while (p < pend)
67 : {
68 167427712 : crc = _mm_crc32_u8(crc, *p);
69 167427712 : p++;
70 : }
71 :
72 162572306 : return crc;
73 : }
74 :
75 : #ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
76 :
77 : /*
78 : * Note: There is no copyright notice in the following generated code.
79 : *
80 : * We have modified the output to
81 : * - match our function declaration
82 : * - match whitespace to our project style
83 : * - add a threshold for the alignment stanza
84 : */
85 :
86 : /* Generated by https://github.com/corsix/fast-crc32/ using: */
87 : /* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */
88 : /* MIT licensed */
89 :
/*
 * Carry-less multiply helpers used by the AVX-512 folding code.  In the
 * immediate operand, bit 0 picks the 64-bit half of 'a' and bit 4 picks the
 * 64-bit half of 'b' within each 128-bit lane: 0x00 multiplies the two low
 * halves, 0x11 multiplies the two high halves.
 */
#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0x00))
#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 0x11))
92 :
93 : pg_attribute_target("vpclmulqdq,avx512vl")
94 : pg_crc32c
95 0 : pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len)
96 : {
97 : /* adjust names to match generated code */
98 0 : pg_crc32c crc0 = crc;
99 0 : const char *buf = data;
100 :
101 : /* Align on cacheline boundary. The threshold is somewhat arbitrary. */
102 0 : if (unlikely(len > 256))
103 : {
104 0 : for (; len && ((uintptr_t) buf & 7); --len)
105 0 : crc0 = _mm_crc32_u8(crc0, *buf++);
106 0 : while (((uintptr_t) buf & 56) && len >= 8)
107 : {
108 0 : crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
109 0 : buf += 8;
110 0 : len -= 8;
111 : }
112 : }
113 :
114 0 : if (len >= 64)
115 : {
116 0 : const char *end = buf + len;
117 0 : const char *limit = buf + len - 64;
118 : __m128i z0;
119 :
120 : /* First vector chunk. */
121 0 : __m512i x0 = _mm512_loadu_si512((const void *) buf),
122 : y0;
123 : __m512i k;
124 :
125 0 : k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0));
126 0 : x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0);
127 0 : buf += 64;
128 :
129 : /* Main loop. */
130 0 : while (buf <= limit)
131 : {
132 0 : y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
133 0 : x0 = _mm512_ternarylogic_epi64(x0, y0,
134 : _mm512_loadu_si512((const void *) buf),
135 : 0x96);
136 0 : buf += 64;
137 : }
138 :
139 : /* Reduce 512 bits to 128 bits. */
140 0 : k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0,
141 : 0x3da6d0cb, 0, 0xba4fc28e, 0,
142 : 0xf20c0dfe, 0, 0x493c7d27, 0,
143 : 0, 0, 0, 0);
144 0 : y0 = clmul_lo(x0, k), k = clmul_hi(x0, k);
145 0 : y0 = _mm512_xor_si512(y0, k);
146 0 : z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0),
147 : _mm512_extracti32x4_epi32(y0, 1),
148 : _mm512_extracti32x4_epi32(y0, 2),
149 : 0x96);
150 0 : z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3));
151 :
152 : /* Reduce 128 bits to 32 bits, and multiply by x^32. */
153 0 : crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0));
154 0 : crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1));
155 0 : len = end - buf;
156 : }
157 :
158 0 : return pg_comp_crc32c_sse42(crc0, buf, len);
159 : }
160 :
161 : #endif
|