LCOV - code coverage report
Current view: top level - src/port - pg_popcount_x86.c (source / functions) Hit Total Coverage
Test: PostgreSQL 19devel Lines: 60 122 49.2 %
Date: 2026-02-07 08:18:04 Functions: 12 16 75.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * pg_popcount_x86.c
       4             :  *    Holds the x86-64 pg_popcount() implementations.
       5             :  *
       6             :  * Copyright (c) 2024-2026, PostgreSQL Global Development Group
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    src/port/pg_popcount_x86.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : #include "c.h"
      14             : 
      15             : #ifdef HAVE_X86_64_POPCNTQ
      16             : 
      17             : #if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
      18             : #include <cpuid.h>
      19             : #endif
      20             : 
      21             : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
      22             : #include <immintrin.h>
      23             : #endif
      24             : 
      25             : #if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
      26             : #include <intrin.h>
      27             : #endif
      28             : 
      29             : #include "port/pg_bitutils.h"
      30             : 
      31             : /*
      32             :  * The SSE4.2 versions are built regardless of whether we are building the
      33             :  * AVX-512 versions.
      34             :  *
      35             :  * Technically, POPCNT is not part of SSE4.2, and isn't even a vector
      36             :  * operation, but in practice this is close enough, and "sse42" seems easier to
      37             :  * follow than "popcnt" for these names.
      38             :  */
      39             : static inline int pg_popcount32_sse42(uint32 word);
      40             : static inline int pg_popcount64_sse42(uint64 word);
      41             : static uint64 pg_popcount_sse42(const char *buf, int bytes);
      42             : static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask);
      43             : 
      44             : /*
      45             :  * These are the AVX-512 implementations of the popcount functions.
      46             :  */
      47             : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
      48             : static uint64 pg_popcount_avx512(const char *buf, int bytes);
      49             : static uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
      50             : #endif                          /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
      51             : 
      52             : /*
      53             :  * The function pointers are initially set to "choose" functions.  These
      54             :  * functions will first set the pointers to the right implementations (base on
      55             :  * what the current CPU supports) and then will call the pointer to fulfill the
      56             :  * caller's request.
      57             :  */
      58             : static int  pg_popcount32_choose(uint32 word);
      59             : static int  pg_popcount64_choose(uint64 word);
      60             : static uint64 pg_popcount_choose(const char *buf, int bytes);
      61             : static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
      62             : int         (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
      63             : int         (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
      64             : uint64      (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
      65             : uint64      (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
      66             : 
      67             : /*
      68             :  * Return true if CPUID indicates that the POPCNT instruction is available.
      69             :  */
      70             : static bool
      71       12652 : pg_popcount_sse42_available(void)
      72             : {
      73       12652 :     unsigned int exx[4] = {0, 0, 0, 0};
      74             : 
      75             : #if defined(HAVE__GET_CPUID)
      76       12652 :     __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
      77             : #elif defined(HAVE__CPUID)
      78             :     __cpuid(exx, 1);
      79             : #else
      80             : #error cpuid instruction not available
      81             : #endif
      82             : 
      83       12652 :     return (exx[2] & (1 << 23)) != 0; /* POPCNT */
      84             : }
      85             : 
      86             : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
      87             : 
      88             : /*
      89             :  * Does CPUID say there's support for XSAVE instructions?
      90             :  */
      91             : static inline bool
      92       12652 : xsave_available(void)
      93             : {
      94       12652 :     unsigned int exx[4] = {0, 0, 0, 0};
      95             : 
      96             : #if defined(HAVE__GET_CPUID)
      97       12652 :     __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
      98             : #elif defined(HAVE__CPUID)
      99             :     __cpuid(exx, 1);
     100             : #else
     101             : #error cpuid instruction not available
     102             : #endif
     103       12652 :     return (exx[2] & (1 << 27)) != 0; /* osxsave */
     104             : }
     105             : 
     106             : /*
     107             :  * Does XGETBV say the ZMM registers are enabled?
     108             :  *
     109             :  * NB: Caller is responsible for verifying that xsave_available() returns true
     110             :  * before calling this.
     111             :  */
     112             : #ifdef HAVE_XSAVE_INTRINSICS
     113             : pg_attribute_target("xsave")
     114             : #endif
     115             : static inline bool
     116       12652 : zmm_regs_available(void)
     117             : {
     118             : #ifdef HAVE_XSAVE_INTRINSICS
     119       12652 :     return (_xgetbv(0) & 0xe6) == 0xe6;
     120             : #else
     121             :     return false;
     122             : #endif
     123             : }
     124             : 
     125             : /*
     126             :  * Does CPUID say there's support for AVX-512 popcount and byte-and-word
     127             :  * instructions?
     128             :  */
     129             : static inline bool
     130       12652 : avx512_popcnt_available(void)
     131             : {
     132       12652 :     unsigned int exx[4] = {0, 0, 0, 0};
     133             : 
     134             : #if defined(HAVE__GET_CPUID_COUNT)
     135       12652 :     __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
     136             : #elif defined(HAVE__CPUIDEX)
     137             :     __cpuidex(exx, 7, 0);
     138             : #else
     139             : #error cpuid instruction not available
     140             : #endif
     141       12652 :     return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
     142           0 :         (exx[1] & (1 << 30)) != 0;    /* avx512-bw */
     143             : }
     144             : 
     145             : /*
     146             :  * Returns true if the CPU supports the instructions required for the AVX-512
     147             :  * pg_popcount() implementation.
     148             :  */
     149             : static bool
     150       12652 : pg_popcount_avx512_available(void)
     151             : {
     152       25304 :     return xsave_available() &&
     153       25304 :         zmm_regs_available() &&
     154       12652 :         avx512_popcnt_available();
     155             : }
     156             : 
     157             : #endif                          /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
     158             : 
     159             : /*
     160             :  * These functions get called on the first call to pg_popcount32 etc.
     161             :  * They detect whether we can use the asm implementations, and replace
     162             :  * the function pointers so that subsequent calls are routed directly to
     163             :  * the chosen implementation.
     164             :  */
     165             : static inline void
     166       12652 : choose_popcount_functions(void)
     167             : {
     168       12652 :     if (pg_popcount_sse42_available())
     169             :     {
     170       12652 :         pg_popcount32 = pg_popcount32_sse42;
     171       12652 :         pg_popcount64 = pg_popcount64_sse42;
     172       12652 :         pg_popcount_optimized = pg_popcount_sse42;
     173       12652 :         pg_popcount_masked_optimized = pg_popcount_masked_sse42;
     174             :     }
     175             :     else
     176             :     {
     177           0 :         pg_popcount32 = pg_popcount32_portable;
     178           0 :         pg_popcount64 = pg_popcount64_portable;
     179           0 :         pg_popcount_optimized = pg_popcount_portable;
     180           0 :         pg_popcount_masked_optimized = pg_popcount_masked_portable;
     181             :     }
     182             : 
     183             : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
     184       12652 :     if (pg_popcount_avx512_available())
     185             :     {
     186           0 :         pg_popcount_optimized = pg_popcount_avx512;
     187           0 :         pg_popcount_masked_optimized = pg_popcount_masked_avx512;
     188             :     }
     189             : #endif
     190       12652 : }
     191             : 
     192             : static int
     193           0 : pg_popcount32_choose(uint32 word)
     194             : {
     195           0 :     choose_popcount_functions();
     196           0 :     return pg_popcount32(word);
     197             : }
     198             : 
     199             : static int
     200       10692 : pg_popcount64_choose(uint64 word)
     201             : {
     202       10692 :     choose_popcount_functions();
     203       10692 :     return pg_popcount64(word);
     204             : }
     205             : 
     206             : static uint64
     207           4 : pg_popcount_choose(const char *buf, int bytes)
     208             : {
     209           4 :     choose_popcount_functions();
     210           4 :     return pg_popcount_optimized(buf, bytes);
     211             : }
     212             : 
     213             : static uint64
     214        1956 : pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
     215             : {
     216        1956 :     choose_popcount_functions();
     217        1956 :     return pg_popcount_masked(buf, bytes, mask);
     218             : }
     219             : 
     220             : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
     221             : 
     222             : /*
     223             :  * pg_popcount_avx512
     224             :  *      Returns the number of 1-bits in buf
     225             :  */
     226             : pg_attribute_target("avx512vpopcntdq,avx512bw")
     227             : static uint64
     228           0 : pg_popcount_avx512(const char *buf, int bytes)
     229             : {
     230             :     __m512i     val,
     231             :                 cnt;
     232           0 :     __m512i     accum = _mm512_setzero_si512();
     233             :     const char *final;
     234             :     int         tail_idx;
     235           0 :     __mmask64   mask = ~UINT64CONST(0);
     236             : 
     237             :     /*
     238             :      * Align buffer down to avoid double load overhead from unaligned access.
     239             :      * Calculate a mask to ignore preceding bytes.  Find start offset of final
     240             :      * iteration and ensure it is not empty.
     241             :      */
     242           0 :     mask <<= ((uintptr_t) buf) % sizeof(__m512i);
     243           0 :     tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
     244           0 :     final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
     245           0 :     buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
     246             : 
     247             :     /*
     248             :      * Iterate through all but the final iteration.  Starting from the second
     249             :      * iteration, the mask is ignored.
     250             :      */
     251           0 :     if (buf < final)
     252             :     {
     253           0 :         val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
     254           0 :         cnt = _mm512_popcnt_epi64(val);
     255           0 :         accum = _mm512_add_epi64(accum, cnt);
     256             : 
     257           0 :         buf += sizeof(__m512i);
     258           0 :         mask = ~UINT64CONST(0);
     259             : 
     260           0 :         for (; buf < final; buf += sizeof(__m512i))
     261             :         {
     262           0 :             val = _mm512_load_si512((const __m512i *) buf);
     263           0 :             cnt = _mm512_popcnt_epi64(val);
     264           0 :             accum = _mm512_add_epi64(accum, cnt);
     265             :         }
     266             :     }
     267             : 
     268             :     /* Final iteration needs to ignore bytes that are not within the length */
     269           0 :     mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
     270             : 
     271           0 :     val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
     272           0 :     cnt = _mm512_popcnt_epi64(val);
     273           0 :     accum = _mm512_add_epi64(accum, cnt);
     274             : 
     275           0 :     return _mm512_reduce_add_epi64(accum);
     276             : }
     277             : 
     278             : /*
     279             :  * pg_popcount_masked_avx512
     280             :  *      Returns the number of 1-bits in buf after applying the mask to each byte
     281             :  */
     282             : pg_attribute_target("avx512vpopcntdq,avx512bw")
     283             : static uint64
     284           0 : pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
     285             : {
     286             :     __m512i     val,
     287             :                 vmasked,
     288             :                 cnt;
     289           0 :     __m512i     accum = _mm512_setzero_si512();
     290             :     const char *final;
     291             :     int         tail_idx;
     292           0 :     __mmask64   bmask = ~UINT64CONST(0);
     293           0 :     const __m512i maskv = _mm512_set1_epi8(mask);
     294             : 
     295             :     /*
     296             :      * Align buffer down to avoid double load overhead from unaligned access.
     297             :      * Calculate a mask to ignore preceding bytes.  Find start offset of final
     298             :      * iteration and ensure it is not empty.
     299             :      */
     300           0 :     bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
     301           0 :     tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
     302           0 :     final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
     303           0 :     buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
     304             : 
     305             :     /*
     306             :      * Iterate through all but the final iteration.  Starting from the second
     307             :      * iteration, the mask is ignored.
     308             :      */
     309           0 :     if (buf < final)
     310             :     {
     311           0 :         val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
     312           0 :         vmasked = _mm512_and_si512(val, maskv);
     313           0 :         cnt = _mm512_popcnt_epi64(vmasked);
     314           0 :         accum = _mm512_add_epi64(accum, cnt);
     315             : 
     316           0 :         buf += sizeof(__m512i);
     317           0 :         bmask = ~UINT64CONST(0);
     318             : 
     319           0 :         for (; buf < final; buf += sizeof(__m512i))
     320             :         {
     321           0 :             val = _mm512_load_si512((const __m512i *) buf);
     322           0 :             vmasked = _mm512_and_si512(val, maskv);
     323           0 :             cnt = _mm512_popcnt_epi64(vmasked);
     324           0 :             accum = _mm512_add_epi64(accum, cnt);
     325             :         }
     326             :     }
     327             : 
     328             :     /* Final iteration needs to ignore bytes that are not within the length */
     329           0 :     bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
     330             : 
     331           0 :     val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
     332           0 :     vmasked = _mm512_and_si512(val, maskv);
     333           0 :     cnt = _mm512_popcnt_epi64(vmasked);
     334           0 :     accum = _mm512_add_epi64(accum, cnt);
     335             : 
     336           0 :     return _mm512_reduce_add_epi64(accum);
     337             : }
     338             : 
     339             : #endif                          /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
     340             : 
     341             : /*
     342             :  * pg_popcount32_sse42
     343             :  *      Return the number of 1 bits set in word
     344             :  */
     345             : static inline int
     346           0 : pg_popcount32_sse42(uint32 word)
     347             : {
     348             : #ifdef _MSC_VER
     349             :     return __popcnt(word);
     350             : #else
     351             :     uint32      res;
     352             : 
     353           0 : __asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
     354           0 :     return (int) res;
     355             : #endif
     356             : }
     357             : 
     358             : /*
     359             :  * pg_popcount64_sse42
     360             :  *      Return the number of 1 bits set in word
     361             :  */
     362             : static inline int
     363   176970324 : pg_popcount64_sse42(uint64 word)
     364             : {
     365             : #ifdef _MSC_VER
     366             :     return __popcnt64(word);
     367             : #else
     368             :     uint64      res;
     369             : 
     370   176970324 : __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
     371   176970324 :     return (int) res;
     372             : #endif
     373             : }
     374             : 
     375             : /*
     376             :  * pg_popcount_sse42
     377             :  *      Returns the number of 1-bits in buf
     378             :  */
     379             : static uint64
     380          28 : pg_popcount_sse42(const char *buf, int bytes)
     381             : {
     382          28 :     uint64      popcnt = 0;
     383             : 
     384             : #if SIZEOF_VOID_P >= 8
     385             :     /* Process in 64-bit chunks if the buffer is aligned. */
     386          28 :     if (buf == (const char *) TYPEALIGN(8, buf))
     387             :     {
     388          28 :         const uint64 *words = (const uint64 *) buf;
     389             : 
     390      524508 :         while (bytes >= 8)
     391             :         {
     392      524480 :             popcnt += pg_popcount64_sse42(*words++);
     393      524480 :             bytes -= 8;
     394             :         }
     395             : 
     396          28 :         buf = (const char *) words;
     397             :     }
     398             : #else
     399             :     /* Process in 32-bit chunks if the buffer is aligned. */
     400             :     if (buf == (const char *) TYPEALIGN(4, buf))
     401             :     {
     402             :         const uint32 *words = (const uint32 *) buf;
     403             : 
     404             :         while (bytes >= 4)
     405             :         {
     406             :             popcnt += pg_popcount32_sse42(*words++);
     407             :             bytes -= 4;
     408             :         }
     409             : 
     410             :         buf = (const char *) words;
     411             :     }
     412             : #endif
     413             : 
     414             :     /* Process any remaining bytes */
     415         148 :     while (bytes--)
     416         120 :         popcnt += pg_number_of_ones[(unsigned char) *buf++];
     417             : 
     418          28 :     return popcnt;
     419             : }
     420             : 
     421             : /*
     422             :  * pg_popcount_masked_sse42
     423             :  *      Returns the number of 1-bits in buf after applying the mask to each byte
     424             :  */
     425             : static uint64
     426      171068 : pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask)
     427             : {
     428      171068 :     uint64      popcnt = 0;
     429             : 
     430             : #if SIZEOF_VOID_P >= 8
     431             :     /* Process in 64-bit chunks if the buffer is aligned */
     432      171068 :     uint64      maskv = ~UINT64CONST(0) / 0xFF * mask;
     433             : 
     434      171068 :     if (buf == (const char *) TYPEALIGN(8, buf))
     435             :     {
     436      171068 :         const uint64 *words = (const uint64 *) buf;
     437             : 
     438   174831496 :         while (bytes >= 8)
     439             :         {
     440   174660428 :             popcnt += pg_popcount64_sse42(*words++ & maskv);
     441   174660428 :             bytes -= 8;
     442             :         }
     443             : 
     444      171068 :         buf = (const char *) words;
     445             :     }
     446             : #else
     447             :     /* Process in 32-bit chunks if the buffer is aligned. */
     448             :     uint32      maskv = ~((uint32) 0) / 0xFF * mask;
     449             : 
     450             :     if (buf == (const char *) TYPEALIGN(4, buf))
     451             :     {
     452             :         const uint32 *words = (const uint32 *) buf;
     453             : 
     454             :         while (bytes >= 4)
     455             :         {
     456             :             popcnt += pg_popcount32_sse42(*words++ & maskv);
     457             :             bytes -= 4;
     458             :         }
     459             : 
     460             :         buf = (const char *) words;
     461             :     }
     462             : #endif
     463             : 
     464             :     /* Process any remaining bytes */
     465      171068 :     while (bytes--)
     466           0 :         popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
     467             : 
     468      171068 :     return popcnt;
     469             : }
     470             : 
     471             : #endif                          /* HAVE_X86_64_POPCNTQ */

Generated by: LCOV version 1.16