LCOV - code coverage report
Current view: top level - src/port - pg_crc32c_sse42.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18devel Lines: 13 43 30.2 %
Date: 2025-04-28 15:15:40 Functions: 1 2 50.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * pg_crc32c_sse42.c
       4             :  *    Compute CRC-32C checksum using Intel SSE 4.2 or AVX-512 instructions.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/port/pg_crc32c_sse42.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : #include "c.h"
      16             : 
      17             : #include <nmmintrin.h>
      18             : #ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
      19             : #include <immintrin.h>
      20             : #endif
      21             : 
      22             : #include "port/pg_crc32c.h"
      23             : 
      24             : pg_attribute_no_sanitize_alignment()
      25             : pg_attribute_target("sse4.2")
      26             : pg_crc32c
      27   162572306 : pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
      28             : {
      29   162572306 :     const unsigned char *p = data;
      30   162572306 :     const unsigned char *pend = p + len;
      31             : 
      32             :     /*
      33             :      * Process eight bytes of data at a time.
      34             :      *
      35             :      * NB: We do unaligned accesses here. The Intel architecture allows that,
      36             :      * and performance testing didn't show any performance gain from aligning
      37             :      * the begin address.
      38             :      */
      39             : #ifdef __x86_64__
      40  7830540952 :     while (p + 8 <= pend)
      41             :     {
      42  7667968646 :         crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
      43  7667968646 :         p += 8;
      44             :     }
      45             : 
      46             :     /* Process remaining full four bytes if any */
      47   162572306 :     if (p + 4 <= pend)
      48             :     {
      49    96960440 :         crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
      50    96960440 :         p += 4;
      51             :     }
      52             : #else
      53             : 
      54             :     /*
      55             :      * Process four bytes at a time. (The eight byte instruction is not
      56             :      * available on the 32-bit x86 architecture).
      57             :      */
      58             :     while (p + 4 <= pend)
      59             :     {
      60             :         crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
      61             :         p += 4;
      62             :     }
      63             : #endif                          /* __x86_64__ */
      64             : 
      65             :     /* Process any remaining bytes one at a time. */
      66   330000018 :     while (p < pend)
      67             :     {
      68   167427712 :         crc = _mm_crc32_u8(crc, *p);
      69   167427712 :         p++;
      70             :     }
      71             : 
      72   162572306 :     return crc;
      73             : }
      74             : 
      75             : #ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
      76             : 
      77             : /*
      78             :  * Note: There is no copyright notice in the following generated code.
      79             :  *
      80             :  * We have modified the output to
      81             :  *   - match our function declaration
      82             :  *   - match whitespace to our project style
      83             :  *   - add a threshold for the alignment stanza
      84             :  */
      85             : 
      86             : /* Generated by https://github.com/corsix/fast-crc32/ using: */
      87             : /* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */
      88             : /* MIT licensed */
      89             : 
      90             : #define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0))
      91             : #define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17))
      92             : 
      93             : pg_attribute_target("vpclmulqdq,avx512vl")
      94             : pg_crc32c
      95           0 : pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len)
      96             : {
      97             :     /* adjust names to match generated code */
      98           0 :     pg_crc32c   crc0 = crc;
      99           0 :     const char *buf = data;
     100             : 
     101             :     /* Align on cacheline boundary. The threshold is somewhat arbitrary. */
     102           0 :     if (unlikely(len > 256))
     103             :     {
     104           0 :         for (; len && ((uintptr_t) buf & 7); --len)
     105           0 :             crc0 = _mm_crc32_u8(crc0, *buf++);
     106           0 :         while (((uintptr_t) buf & 56) && len >= 8)
     107             :         {
     108           0 :             crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
     109           0 :             buf += 8;
     110           0 :             len -= 8;
     111             :         }
     112             :     }
     113             : 
     114           0 :     if (len >= 64)
     115             :     {
     116           0 :         const char *end = buf + len;
     117           0 :         const char *limit = buf + len - 64;
     118             :         __m128i     z0;
     119             : 
     120             :         /* First vector chunk. */
     121           0 :         __m512i     x0 = _mm512_loadu_si512((const void *) buf),
     122             :                     y0;
     123             :         __m512i     k;
     124             : 
     125           0 :         k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0));
     126           0 :         x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0);
     127           0 :         buf += 64;
     128             : 
     129             :         /* Main loop. */
     130           0 :         while (buf <= limit)
     131             :         {
     132           0 :             y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
     133           0 :             x0 = _mm512_ternarylogic_epi64(x0, y0,
     134             :                                            _mm512_loadu_si512((const void *) buf),
     135             :                                            0x96);
     136           0 :             buf += 64;
     137             :         }
     138             : 
     139             :         /* Reduce 512 bits to 128 bits. */
     140           0 :         k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0,
     141             :                               0x3da6d0cb, 0, 0xba4fc28e, 0,
     142             :                               0xf20c0dfe, 0, 0x493c7d27, 0,
     143             :                               0, 0, 0, 0);
     144           0 :         y0 = clmul_lo(x0, k), k = clmul_hi(x0, k);
     145           0 :         y0 = _mm512_xor_si512(y0, k);
     146           0 :         z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0),
     147             :                                     _mm512_extracti32x4_epi32(y0, 1),
     148             :                                     _mm512_extracti32x4_epi32(y0, 2),
     149             :                                     0x96);
     150           0 :         z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3));
     151             : 
     152             :         /* Reduce 128 bits to 32 bits, and multiply by x^32. */
     153           0 :         crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0));
     154           0 :         crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1));
     155           0 :         len = end - buf;
     156             :     }
     157             : 
     158           0 :     return pg_comp_crc32c_sse42(crc0, buf, len);
     159             : }
     160             : 
     161             : #endif

Generated by: LCOV version 1.14