LCOV - code coverage report
Current view: top level - src/port - pg_popcount_avx512.c
Test:         PostgreSQL 18devel
Date:         2024-11-21 08:14:44

Coverage summary:        Hit    Total    Coverage
  Lines:                  14       63      22.2 %
  Functions:               4        6      66.7 %

Legend: Lines: hit / not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * pg_popcount_avx512.c
       4             :  *    Holds the AVX-512 pg_popcount() implementation.
       5             :  *
       6             :  * Copyright (c) 2024, PostgreSQL Global Development Group
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    src/port/pg_popcount_avx512.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : #include "c.h"
      14             : 
      15             : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
      16             : 
      17             : #if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
      18             : #include <cpuid.h>
      19             : #endif
      20             : 
      21             : #include <immintrin.h>
      22             : 
      23             : #if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
      24             : #include <intrin.h>
      25             : #endif
      26             : 
      27             : #include "port/pg_bitutils.h"
      28             : 
      29             : /*
      30             :  * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
      31             :  * use AVX-512 intrinsics, but we check it anyway to be sure.  We piggy-back on
      32             :  * the function pointers that are only used when TRY_POPCNT_FAST is set.
      33             :  */
      34             : #ifdef TRY_POPCNT_FAST
      35             : 
      36             : /*
      37             :  * Does CPUID say there's support for XSAVE instructions?
      38             :  */
      39             : static inline bool
      40       12034 : xsave_available(void)
      41             : {
      42       12034 :     unsigned int exx[4] = {0, 0, 0, 0};
      43             : 
      44             : #if defined(HAVE__GET_CPUID)
      45       12034 :     __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
      46             : #elif defined(HAVE__CPUID)
      47             :     __cpuid(exx, 1);
      48             : #else
      49             : #error cpuid instruction not available
      50             : #endif
      51       12034 :     return (exx[2] & (1 << 27)) != 0; /* osxsave */
      52             : }
      53             : 
      54             : /*
      55             :  * Does XGETBV say the ZMM registers are enabled?
      56             :  *
      57             :  * NB: Caller is responsible for verifying that xsave_available() returns true
      58             :  * before calling this.
      59             :  */
      60             : #ifdef HAVE_XSAVE_INTRINSICS
      61             : pg_attribute_target("xsave")
      62             : #endif
      63             : static inline bool
      64       12034 : zmm_regs_available(void)
      65             : {
      66             : #ifdef HAVE_XSAVE_INTRINSICS
      67       12034 :     return (_xgetbv(0) & 0xe6) == 0xe6;
      68             : #else
      69             :     return false;
      70             : #endif
      71             : }
      72             : 
      73             : /*
      74             :  * Does CPUID say there's support for AVX-512 popcount and byte-and-word
      75             :  * instructions?
      76             :  */
      77             : static inline bool
      78       12034 : avx512_popcnt_available(void)
      79             : {
      80       12034 :     unsigned int exx[4] = {0, 0, 0, 0};
      81             : 
      82             : #if defined(HAVE__GET_CPUID_COUNT)
      83       12034 :     __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
      84             : #elif defined(HAVE__CPUIDEX)
      85             :     __cpuidex(exx, 7, 0);
      86             : #else
      87             : #error cpuid instruction not available
      88             : #endif
      89       12034 :     return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
      90           0 :         (exx[1] & (1 << 30)) != 0;    /* avx512-bw */
      91             : }
      92             : 
      93             : /*
      94             :  * Returns true if the CPU supports the instructions required for the AVX-512
      95             :  * pg_popcount() implementation.
      96             :  */
      97             : bool
      98       12034 : pg_popcount_avx512_available(void)
      99             : {
     100       24068 :     return xsave_available() &&
     101       24068 :         zmm_regs_available() &&
     102       12034 :         avx512_popcnt_available();
     103             : }
     104             : 
     105             : /*
     106             :  * pg_popcount_avx512
     107             :  *      Returns the number of 1-bits in buf
     108             :  */
     109             : pg_attribute_target("avx512vpopcntdq,avx512bw")
     110             : uint64
     111           0 : pg_popcount_avx512(const char *buf, int bytes)
     112             : {
     113             :     __m512i     val,
     114             :                 cnt;
     115           0 :     __m512i     accum = _mm512_setzero_si512();
     116             :     const char *final;
     117             :     int         tail_idx;
     118           0 :     __mmask64   mask = ~UINT64CONST(0);
     119             : 
     120             :     /*
     121             :      * Align buffer down to avoid double load overhead from unaligned access.
     122             :      * Calculate a mask to ignore preceding bytes.  Find start offset of final
     123             :      * iteration and ensure it is not empty.
     124             :      */
     125           0 :     mask <<= ((uintptr_t) buf) % sizeof(__m512i);
     126           0 :     tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
     127           0 :     final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
     128           0 :     buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
     129             : 
     130             :     /*
     131             :      * Iterate through all but the final iteration.  Starting from the second
     132             :      * iteration, the mask is ignored.
     133             :      */
     134           0 :     if (buf < final)
     135             :     {
     136           0 :         val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
     137           0 :         cnt = _mm512_popcnt_epi64(val);
     138           0 :         accum = _mm512_add_epi64(accum, cnt);
     139             : 
     140           0 :         buf += sizeof(__m512i);
     141           0 :         mask = ~UINT64CONST(0);
     142             : 
     143           0 :         for (; buf < final; buf += sizeof(__m512i))
     144             :         {
     145           0 :             val = _mm512_load_si512((const __m512i *) buf);
     146           0 :             cnt = _mm512_popcnt_epi64(val);
     147           0 :             accum = _mm512_add_epi64(accum, cnt);
     148             :         }
     149             :     }
     150             : 
     151             :     /* Final iteration needs to ignore bytes that are not within the length */
     152           0 :     mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
     153             : 
     154           0 :     val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
     155           0 :     cnt = _mm512_popcnt_epi64(val);
     156           0 :     accum = _mm512_add_epi64(accum, cnt);
     157             : 
     158           0 :     return _mm512_reduce_add_epi64(accum);
     159             : }
     160             : 
     161             : /*
     162             :  * pg_popcount_masked_avx512
     163             :  *      Returns the number of 1-bits in buf after applying the mask to each byte
     164             :  */
     165             : pg_attribute_target("avx512vpopcntdq,avx512bw")
     166             : uint64
     167           0 : pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
     168             : {
     169             :     __m512i     val,
     170             :                 vmasked,
     171             :                 cnt;
     172           0 :     __m512i     accum = _mm512_setzero_si512();
     173             :     const char *final;
     174             :     int         tail_idx;
     175           0 :     __mmask64   bmask = ~UINT64CONST(0);
     176           0 :     const __m512i maskv = _mm512_set1_epi8(mask);
     177             : 
     178             :     /*
     179             :      * Align buffer down to avoid double load overhead from unaligned access.
     180             :      * Calculate a mask to ignore preceding bytes.  Find start offset of final
     181             :      * iteration and ensure it is not empty.
     182             :      */
     183           0 :     bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
     184           0 :     tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
     185           0 :     final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
     186           0 :     buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
     187             : 
     188             :     /*
     189             :      * Iterate through all but the final iteration.  Starting from the second
     190             :      * iteration, the mask is ignored.
     191             :      */
     192           0 :     if (buf < final)
     193             :     {
     194           0 :         val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
     195           0 :         vmasked = _mm512_and_si512(val, maskv);
     196           0 :         cnt = _mm512_popcnt_epi64(vmasked);
     197           0 :         accum = _mm512_add_epi64(accum, cnt);
     198             : 
     199           0 :         buf += sizeof(__m512i);
     200           0 :         bmask = ~UINT64CONST(0);
     201             : 
     202           0 :         for (; buf < final; buf += sizeof(__m512i))
     203             :         {
     204           0 :             val = _mm512_load_si512((const __m512i *) buf);
     205           0 :             vmasked = _mm512_and_si512(val, maskv);
     206           0 :             cnt = _mm512_popcnt_epi64(vmasked);
     207           0 :             accum = _mm512_add_epi64(accum, cnt);
     208             :         }
     209             :     }
     210             : 
     211             :     /* Final iteration needs to ignore bytes that are not within the length */
     212           0 :     bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
     213             : 
     214           0 :     val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
     215           0 :     vmasked = _mm512_and_si512(val, maskv);
     216           0 :     cnt = _mm512_popcnt_epi64(vmasked);
     217           0 :     accum = _mm512_add_epi64(accum, cnt);
     218             : 
     219           0 :     return _mm512_reduce_add_epi64(accum);
     220             : }
     221             : 
     222             : #endif                          /* TRY_POPCNT_FAST */
     223             : #endif                          /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
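
Note on the detection path above: pg_popcount_avx512_available() chains three checks -- CPUID leaf 1 for OSXSAVE, XGETBV(0) for enabled ZMM state, and CPUID leaf 7 subleaf 0 for VPOPCNTDQ and AVX512BW. The following is a minimal standalone sketch, not part of the PostgreSQL sources, that performs the same three checks in isolation so the leaves and bits involved are easy to inspect. It assumes GCC or Clang on x86-64, compiled with -mxsave so that _xgetbv() is usable without a per-function target attribute.

/*
 * Standalone sketch: same CPUID/XGETBV checks as in the listing above,
 * outside PostgreSQL.  Assumes GCC or Clang on x86-64, built with -mxsave.
 */
#include <cpuid.h>
#include <immintrin.h>
#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;
	bool		osxsave;
	bool		zmm_enabled = false;
	bool		vpopcntdq_bw = false;

	/* CPUID leaf 1, ECX bit 27: OS has enabled XSAVE/XGETBV (osxsave) */
	__get_cpuid(1, &eax, &ebx, &ecx, &edx);
	osxsave = (ecx & (1u << 27)) != 0;

	if (osxsave)
	{
		/* XCR0 bits 1,2,5,6,7 (0xe6): XMM, YMM and all ZMM state enabled */
		zmm_enabled = (_xgetbv(0) & 0xe6) == 0xe6;

		/* CPUID leaf 7 subleaf 0: ECX bit 14 = VPOPCNTDQ, EBX bit 30 = AVX512BW */
		__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
		vpopcntdq_bw = (ecx & (1u << 14)) != 0 && (ebx & (1u << 30)) != 0;
	}

	printf("AVX-512 popcount usable: %s\n",
		   (osxsave && zmm_enabled && vpopcntdq_bw) ? "yes" : "no");
	return 0;
}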
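
Note on the head/tail mask arithmetic in pg_popcount_avx512() and pg_popcount_masked_avx512(): the buffer is aligned down to a 64-byte boundary, the first masked load drops the bytes that precede buf, and the final masked load drops the bytes past buf + bytes. The sketch below is an illustration only, with a hypothetical helper named selected_bytes(); it assumes a 64-byte vector width (sizeof(__m512i)) and GCC/Clang for __builtin_popcountll(). It re-derives the same masks with plain integer arithmetic and asserts that they select exactly `bytes` bytes for every alignment offset and length it tries.

/*
 * Illustration only (hypothetical helper, not in the PostgreSQL sources):
 * re-derives the head/tail byte masks with plain integer arithmetic and
 * checks that exactly `bytes` bytes are selected.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define VECWIDTH 64				/* sizeof(__m512i) */

static uint64_t
selected_bytes(uintptr_t addr, int bytes)
{
	uint64_t	head_mask = ~UINT64_C(0);
	uint64_t	tail_mask;
	int			tail_idx;
	uintptr_t	first = addr - (addr % VECWIDTH);	/* TYPEALIGN_DOWN(64, buf) */
	uintptr_t	final = (addr + bytes - 1) - ((addr + bytes - 1) % VECWIDTH);
	uint64_t	total = 0;

	head_mask <<= addr % VECWIDTH;	/* drop bytes before buf */
	tail_idx = (int) ((addr + bytes - 1) % VECWIDTH) + 1;	/* bytes used in final chunk */
	tail_mask = ~UINT64_C(0) >> (VECWIDTH - tail_idx);

	for (uintptr_t chunk = first; chunk <= final; chunk += VECWIDTH)
	{
		uint64_t	mask = ~UINT64_C(0);

		if (chunk == first)
			mask &= head_mask;	/* mirrors "mask <<= ..." before the first load */
		if (chunk == final)
			mask &= tail_mask;	/* mirrors "mask &= ..." before the final load */
		total += (uint64_t) __builtin_popcountll(mask); /* one mask bit per byte */
	}
	return total;
}

int
main(void)
{
	for (uintptr_t off = 0; off < VECWIDTH; off++)
		for (int bytes = 1; bytes <= 3 * VECWIDTH; bytes++)
			assert(selected_bytes((uintptr_t) 0x10000 + off, bytes) == (uint64_t) bytes);
	printf("head/tail masks select exactly the requested bytes\n");
	return 0;
}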
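
For reference, the masked variant computes the same quantity as the straightforward scalar loop below (a hypothetical reference function, assuming a GCC/Clang builtin): every byte is ANDed with the same 8-bit mask before its bits are counted. The AVX-512 version broadcasts the mask into a ZMM register with _mm512_set1_epi8() and applies _mm512_and_si512() per 64-byte chunk, while the masked loads (_mm512_maskz_loadu_epi8) zero the out-of-range bytes so they contribute nothing to the count.

/*
 * Hypothetical scalar reference for pg_popcount_masked_avx512(): AND every
 * byte with the same 8-bit mask, then count the remaining 1-bits.
 */
#include <stdint.h>

static uint64_t
popcount_masked_ref(const char *buf, int bytes, uint8_t mask)
{
	uint64_t	cnt = 0;

	for (int i = 0; i < bytes; i++)
		cnt += (uint64_t) __builtin_popcount((uint8_t) buf[i] & mask);
	return cnt;
}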

Generated by: LCOV version 1.14