LCOV - code coverage report
Current view: top level - src/port - pg_crc32c_sse42.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 39.2 % 51 20
Test Date: 2026-03-03 07:15:00 Functions: 66.7 % 3 2
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * pg_crc32c_sse42.c
       4              :  *    Compute CRC-32C checksum using Intel SSE 4.2 or AVX-512 instructions.
       5              :  *
       6              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
       7              :  * Portions Copyright (c) 1994, Regents of the University of California
       8              :  *
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/port/pg_crc32c_sse42.c
      12              :  *
      13              :  *-------------------------------------------------------------------------
      14              :  */
      15              : #include "c.h"
      16              : 
      17              : #include <nmmintrin.h>
      18              : #ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
      19              : #include <immintrin.h>
      20              : #endif
      21              : 
      22              : #include "port/pg_cpu.h"
      23              : #include "port/pg_crc32c.h"
      24              : 
      25              : static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len);    /* forward declaration; the definition is further down in this file */
      26              : 
      27              : pg_attribute_no_sanitize_alignment()
      28              : pg_attribute_target("sse4.2")
      29              : pg_crc32c
      30     85436494 : pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
      31              : {
      32     85436494 :     const unsigned char *p = data;
      33     85436494 :     const unsigned char *pend = p + len;    /* one past the last input byte */
      34              : 
      35              :     /*
                      :      * Overview: fold the len bytes at data into the running CRC value crc
                      :      * with the SSE 4.2 CRC32 instructions and return the updated value.
                      :      * No initial or final inversion of the CRC is applied in this
                      :      * function. A zero len runs none of the steps below and returns crc
                      :      * unchanged.
                      :      *
      36              :      * Process eight bytes of data at a time.
                      :      * The (uint32) cast narrows the result of the 64-bit intrinsic back
                      :      * to the 32-bit CRC.
      37              :      *
      38              :      * NB: We do unaligned accesses here. The Intel architecture allows that,
      39              :      * and performance testing didn't show any performance gain from aligning
      40              :      * the begin address.
                      :      * NOTE(review): pg_attribute_no_sanitize_alignment() on this function
                      :      * presumably exists to keep alignment sanitizers quiet about these
                      :      * loads; the macro is defined outside this file -- confirm there.
      41              :      */
      42              : #ifdef __x86_64__
      43   4065241101 :     while (p + 8 <= pend)
      44              :     {
      45   3979804607 :         crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
      46   3979804607 :         p += 8;
      47              :     }
      48              : 
      49              :     /* Process remaining full four bytes if any (at most one such word is left after the loop above) */
      50     85436494 :     if (p + 4 <= pend)
      51              :     {
      52     50919080 :         crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
      53     50919080 :         p += 4;
      54              :     }
      55              : #else
      56              : 
      57              :     /*
      58              :      * Process four bytes at a time. (The eight byte instruction is not
      59              :      * available on the 32-bit x86 architecture).
      60              :      */
      61              :     while (p + 4 <= pend)
      62              :     {
      63              :         crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
      64              :         p += 4;
      65              :     }
      66              : #endif                          /* __x86_64__ */
      67              : 
      68              :     /* Process any remaining bytes one at a time (fewer than four are left on either path). */
      69    173651946 :     while (p < pend)
      70              :     {
      71     88215452 :         crc = _mm_crc32_u8(crc, *p);
      72     88215452 :         p++;
      73              :     }
      74              : 
      75     85436494 :     return crc;
      76              : }
      77              : 
      78              : #ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
      79              : 
      80              : /*
                      :  * pg_comp_crc32c_avx512
                      :  *
                      :  * Fold the len bytes at data into the running CRC value crc and return
                      :  * the result. When at least 64 bytes are available they are processed in
                      :  * 64-byte chunks with 512-bit carry-less multiplication; whatever is
                      :  * left over (fewer than 64 bytes) is finished by pg_comp_crc32c_sse42().
                      :  * Inputs shorter than 64 bytes are handed to pg_comp_crc32c_sse42() in
                      :  * full.
                      :  *
      81              :  * Note: There is no copyright notice in the following generated code.
      82              :  *
      83              :  * We have modified the output to
      84              :  *   - match our function declaration
      85              :  *   - match whitespace to our project style
      86              :  *   - add a threshold for the alignment stanza
      87              :  */
      88              : 
      89              : /* Generated by https://github.com/corsix/fast-crc32/ using: */
      90              : /* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */
      91              : /* MIT licensed */
      92              : 
      93              : #define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0))  /* selector 0: low qword of a times low qword of b, in each 128-bit lane */
      94              : #define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17)) /* selector 17 (0x11): high qword of a times high qword of b, in each 128-bit lane */
      95              : 
      96              : pg_attribute_target("vpclmulqdq,avx512vl")
      97              : pg_crc32c
      98            0 : pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len)
      99              : {
     100              :     /* adjust names to match generated code */
     101            0 :     pg_crc32c   crc0 = crc;
     102            0 :     const char *buf = data;
     103              : 
     104              :     /*
                      :      * Align on cacheline boundary. The threshold is somewhat arbitrary.
                      :      *
                      :      * Only inputs longer than 256 bytes are aligned. Single bytes are
                      :      * consumed until buf is 8-byte aligned, then eight-byte words until
                      :      * the address bits selected by the mask 56 (bits 3..5) are clear as
                      :      * well, i.e. until buf is 64-byte aligned. Both loops are also bounded
                      :      * by the remaining len.
                      :      */
     105            0 :     if (unlikely(len > 256))
     106              :     {
     107            0 :         for (; len && ((uintptr_t) buf & 7); --len)
     108            0 :             crc0 = _mm_crc32_u8(crc0, *buf++);
     109            0 :         while (((uintptr_t) buf & 56) && len >= 8)
     110              :         {
     111            0 :             crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
     112            0 :             buf += 8;
     113            0 :             len -= 8;
     114              :         }
     115              :     }
     116              : 
     117            0 :     if (len >= 64)
     118              :     {
     119            0 :         const char *end = buf + len;    /* one past the last input byte */
     120            0 :         const char *limit = buf + len - 64; /* highest address at which a full 64-byte load still fits */
     121              :         __m128i     z0;
     122              : 
     123              :         /*
                      :          * First vector chunk.
                      :          *
                      :          * Load the first 64 bytes into x0 and XOR the incoming CRC into
                      :          * its lowest 32 bits (crc0 is zero-extended to 512 bits first).
                      :          * k is set to the same pair of 64-bit constants in every 128-bit
                      :          * lane; the main loop multiplies by them.
                      :          */
     124            0 :         __m512i     x0 = _mm512_loadu_si512((const void *) buf),
     125              :                     y0;
     126              :         __m512i     k;
     127              : 
     128            0 :         k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0));
     129            0 :         x0 = _mm512_xor_si512(_mm512_zextsi128_si512(_mm_cvtsi32_si128(crc0)), x0);
     130            0 :         buf += 64;
     131              : 
     132              :         /*
                      :          * Main loop.
                      :          *
                      :          * Runs once per remaining full 64-byte chunk. Each pass multiplies
                      :          * the low and the high qword of every lane of x0 by the matching
                      :          * constant in k and XORs both products with the next 64 input
                      :          * bytes; 0x96 is the truth table of a three-way XOR.
                      :          */
     133            0 :         while (buf <= limit)
     134              :         {
     135            0 :             y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
     136            0 :             x0 = _mm512_ternarylogic_epi64(x0, y0,
     137              :                                            _mm512_loadu_si512((const void *) buf),
     138              :                                            0x96);
     139            0 :             buf += 64;
     140              :         }
     141              : 
     142              :         /*
                      :          * Reduce 512 bits to 128 bits.
                      :          *
                      :          * Lanes 0..2 of x0 are multiplied by per-lane constants (the top
                      :          * lane of k is zero), and the three products are XORed together
                      :          * with lane 3 of x0, which is taken unmultiplied. Note that k is
                      :          * reused as a temporary for the high products.
                      :          */
     143            0 :         k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0,
     144              :                               0x3da6d0cb, 0, 0xba4fc28e, 0,
     145              :                               0xf20c0dfe, 0, 0x493c7d27, 0,
     146              :                               0, 0, 0, 0);
     147            0 :         y0 = clmul_lo(x0, k), k = clmul_hi(x0, k);
     148            0 :         y0 = _mm512_xor_si512(y0, k);
     149            0 :         z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0),
     150              :                                     _mm512_extracti32x4_epi32(y0, 1),
     151              :                                     _mm512_extracti32x4_epi32(y0, 2),
     152              :                                     0x96);
     153            0 :         z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3));
     154              : 
     155              :         /*
                      :          * Reduce 128 bits to 32 bits, and multiply by x^32.
                      :          *
                      :          * This is done by running the two 64-bit halves of z0 through the
                      :          * CRC32 instruction, starting from zero. Afterwards len is reset
                      :          * to the number of unprocessed tail bytes (fewer than 64).
                      :          */
     156            0 :         crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0));
     157            0 :         crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1));
     158            0 :         len = end - buf;
     159              :     }
     160              : 
     161            0 :     return pg_comp_crc32c_sse42(crc0, buf, len);    /* finish the tail, or the whole input if it was under 64 bytes */
     162              : }
     163              : 
     164              : #endif                          /* USE_AVX512_CRC32C_WITH_RUNTIME_CHECK */
     165              : 
     166              : /*
     167              :  * This gets called on the first call. It replaces the function pointer
     168              :  * so that subsequent calls are routed directly to the chosen implementation.
                      :  *
                      :  * The pointer pg_comp_crc32c is assigned in increasing order of
                      :  * preference, each later assignment overriding the earlier one:
                      :  *   1. pg_comp_crc32c_sb8, only when USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
                      :  *      is defined;
                      :  *   2. pg_comp_crc32c_sse42, when PG_SSE4_2 is reported available;
                      :  *   3. pg_comp_crc32c_avx512, only when
                      :  *      USE_AVX512_CRC32C_WITH_RUNTIME_CHECK is defined and both
                      :  *      PG_AVX512_VL and PG_AVX512_VPCLMULQDQ are reported available.
                      :  * The current request is then computed by calling through the updated
                      :  * pointer, and that result is returned.
                      :  *
                      :  * NOTE(review): if USE_SSE42_CRC32C_WITH_RUNTIME_CHECK is not defined
                      :  * and PG_SSE4_2 were reported unavailable, the pointer would still
                      :  * refer to this function and the final call would recurse. Presumably
                      :  * such builds are only produced when SSE 4.2 is guaranteed -- confirm
                      :  * against the build configuration.
     169              :  */
     170              : static pg_crc32c
     171         1501 : pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
     172              : {
     173              :     /*
     174              :      * Set fallback. We must guard since slicing-by-8 is not visible
     175              :      * everywhere.
     176              :      */
     177              : #ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
     178         1501 :     pg_comp_crc32c = pg_comp_crc32c_sb8;
     179              : #endif
     180              : 
     181         1501 :     if (x86_feature_available(PG_SSE4_2))
     182         1501 :         pg_comp_crc32c = pg_comp_crc32c_sse42;
     183              : 
     184              : #ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
     185         3002 :     if (x86_feature_available(PG_AVX512_VL) &&
     186         1501 :         x86_feature_available(PG_AVX512_VPCLMULQDQ))
     187            0 :         pg_comp_crc32c = pg_comp_crc32c_avx512;
     188              : #endif
     189              : 
     190         1501 :     return pg_comp_crc32c(crc, data, len);  /* dispatch this first request through the pointer just set */
     191              : }
     192              : 
     193              : pg_crc32c   (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;    /* starts out at the chooser, which repoints it during the first call */
        

Generated by: LCOV version 2.0-1