LCOV - code coverage report
Current view: top level - src/port - pg_popcount_x86.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 41.8 % 91 38
Test Date: 2026-03-03 10:15:07 Functions: 77.8 % 9 7
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * pg_popcount_x86.c
       4              :  *    Holds the x86-64 pg_popcount() implementations.
       5              :  *
       6              :  * Copyright (c) 2024-2026, PostgreSQL Global Development Group
       7              :  *
       8              :  * IDENTIFICATION
       9              :  *    src/port/pg_popcount_x86.c
      10              :  *
      11              :  *-------------------------------------------------------------------------
      12              :  */
      13              : #include "c.h"
      14              : 
      15              : #ifdef HAVE_X86_64_POPCNTQ
      16              : 
      17              : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
      18              : #include <immintrin.h>
      19              : #endif
      20              : 
      21              : #include "port/pg_bitutils.h"
      22              : #include "port/pg_cpu.h"
      23              : 
      24              : /*
      25              :  * The SSE4.2 versions are built regardless of whether we are building the
      26              :  * AVX-512 versions.
      27              :  *
      28              :  * Technically, POPCNT is not part of SSE4.2, and isn't even a vector
      29              :  * operation, but in practice this is close enough, and "sse42" seems easier to
      30              :  * follow than "popcnt" for these names.
      31              :  */
      32              : static uint64 pg_popcount_sse42(const char *buf, int bytes);
      33              : static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask);
      34              : 
      35              : /*
      36              :  * These are the AVX-512 implementations of the popcount functions.
      37              :  */
      38              : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
      39              : static uint64 pg_popcount_avx512(const char *buf, int bytes);
      40              : static uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
      41              : #endif                          /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
      42              : 
      43              : /*
      44              :  * The function pointers are initially set to "choose" functions.  These
      45              :  * functions will first set the pointers to the right implementations (base on
      46              :  * what the current CPU supports) and then will call the pointer to fulfill the
      47              :  * caller's request.
      48              :  */
      49              : static uint64 pg_popcount_choose(const char *buf, int bytes);
      50              : static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
      51              : uint64      (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
      52              : uint64      (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
      53              : 
      54              : 
      55              : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
      56              : 
      57              : /*
      58              :  * Returns true if the CPU supports the instructions required for the AVX-512
      59              :  * pg_popcount() implementation.
      60              :  */
      61              : static bool
      62         1301 : pg_popcount_avx512_available(void)
      63              : {
      64         2602 :     return x86_feature_available(PG_AVX512_BW) &&
      65         1301 :         x86_feature_available(PG_AVX512_VPOPCNTDQ);
      66              : }
      67              : 
      68              : #endif                          /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
      69              : 
      70              : /*
      71              :  * These functions get called on the first call to pg_popcount(), etc.
      72              :  * They detect whether we can use the asm implementations, and replace
      73              :  * the function pointers so that subsequent calls are routed directly to
      74              :  * the chosen implementation.
      75              :  */
      76              : static inline void
      77         1301 : choose_popcount_functions(void)
      78              : {
      79         1301 :     if (x86_feature_available(PG_POPCNT))
      80              :     {
      81         1301 :         pg_popcount_optimized = pg_popcount_sse42;
      82         1301 :         pg_popcount_masked_optimized = pg_popcount_masked_sse42;
      83              :     }
      84              :     else
      85              :     {
      86            0 :         pg_popcount_optimized = pg_popcount_portable;
      87            0 :         pg_popcount_masked_optimized = pg_popcount_masked_portable;
      88              :     }
      89              : 
      90              : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
      91         1301 :     if (pg_popcount_avx512_available())
      92              :     {
      93            0 :         pg_popcount_optimized = pg_popcount_avx512;
      94            0 :         pg_popcount_masked_optimized = pg_popcount_masked_avx512;
      95              :     }
      96              : #endif
      97         1301 : }
      98              : 
      99              : static uint64
     100            6 : pg_popcount_choose(const char *buf, int bytes)
     101              : {
     102            6 :     choose_popcount_functions();
     103            6 :     return pg_popcount_optimized(buf, bytes);
     104              : }
     105              : 
     106              : static uint64
     107         1295 : pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
     108              : {
     109         1295 :     choose_popcount_functions();
     110         1295 :     return pg_popcount_masked(buf, bytes, mask);
     111              : }
     112              : 
     113              : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
     114              : 
     115              : /*
     116              :  * pg_popcount_avx512
     117              :  *      Returns the number of 1-bits in buf
     118              :  */
     119              : pg_attribute_target("avx512vpopcntdq,avx512bw")
     120              : static uint64
     121            0 : pg_popcount_avx512(const char *buf, int bytes)
     122              : {
     123              :     __m512i     val,
     124              :                 cnt;
     125            0 :     __m512i     accum = _mm512_setzero_si512();
     126              :     const char *final;
     127              :     int         tail_idx;
     128            0 :     __mmask64   mask = ~UINT64CONST(0);
     129              : 
     130              :     /*
     131              :      * Align buffer down to avoid double load overhead from unaligned access.
     132              :      * Calculate a mask to ignore preceding bytes.  Find start offset of final
     133              :      * iteration and ensure it is not empty.
     134              :      */
     135            0 :     mask <<= ((uintptr_t) buf) % sizeof(__m512i);
     136            0 :     tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
     137            0 :     final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
     138            0 :     buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
     139              : 
     140              :     /*
     141              :      * Iterate through all but the final iteration.  Starting from the second
     142              :      * iteration, the mask is ignored.
     143              :      */
     144            0 :     if (buf < final)
     145              :     {
     146            0 :         val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
     147            0 :         cnt = _mm512_popcnt_epi64(val);
     148            0 :         accum = _mm512_add_epi64(accum, cnt);
     149              : 
     150            0 :         buf += sizeof(__m512i);
     151            0 :         mask = ~UINT64CONST(0);
     152              : 
     153            0 :         for (; buf < final; buf += sizeof(__m512i))
     154              :         {
     155            0 :             val = _mm512_load_si512((const __m512i *) buf);
     156            0 :             cnt = _mm512_popcnt_epi64(val);
     157            0 :             accum = _mm512_add_epi64(accum, cnt);
     158              :         }
     159              :     }
     160              : 
     161              :     /* Final iteration needs to ignore bytes that are not within the length */
     162            0 :     mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
     163              : 
     164            0 :     val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
     165            0 :     cnt = _mm512_popcnt_epi64(val);
     166            0 :     accum = _mm512_add_epi64(accum, cnt);
     167              : 
     168            0 :     return _mm512_reduce_add_epi64(accum);
     169              : }
     170              : 
     171              : /*
     172              :  * pg_popcount_masked_avx512
     173              :  *      Returns the number of 1-bits in buf after applying the mask to each byte
     174              :  */
     175              : pg_attribute_target("avx512vpopcntdq,avx512bw")
     176              : static uint64
     177            0 : pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
     178              : {
     179              :     __m512i     val,
     180              :                 vmasked,
     181              :                 cnt;
     182            0 :     __m512i     accum = _mm512_setzero_si512();
     183              :     const char *final;
     184              :     int         tail_idx;
     185            0 :     __mmask64   bmask = ~UINT64CONST(0);
     186            0 :     const __m512i maskv = _mm512_set1_epi8(mask);
     187              : 
     188              :     /*
     189              :      * Align buffer down to avoid double load overhead from unaligned access.
     190              :      * Calculate a mask to ignore preceding bytes.  Find start offset of final
     191              :      * iteration and ensure it is not empty.
     192              :      */
     193            0 :     bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
     194            0 :     tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
     195            0 :     final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
     196            0 :     buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
     197              : 
     198              :     /*
     199              :      * Iterate through all but the final iteration.  Starting from the second
     200              :      * iteration, the mask is ignored.
     201              :      */
     202            0 :     if (buf < final)
     203              :     {
     204            0 :         val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
     205            0 :         vmasked = _mm512_and_si512(val, maskv);
     206            0 :         cnt = _mm512_popcnt_epi64(vmasked);
     207            0 :         accum = _mm512_add_epi64(accum, cnt);
     208              : 
     209            0 :         buf += sizeof(__m512i);
     210            0 :         bmask = ~UINT64CONST(0);
     211              : 
     212            0 :         for (; buf < final; buf += sizeof(__m512i))
     213              :         {
     214            0 :             val = _mm512_load_si512((const __m512i *) buf);
     215            0 :             vmasked = _mm512_and_si512(val, maskv);
     216            0 :             cnt = _mm512_popcnt_epi64(vmasked);
     217            0 :             accum = _mm512_add_epi64(accum, cnt);
     218              :         }
     219              :     }
     220              : 
     221              :     /* Final iteration needs to ignore bytes that are not within the length */
     222            0 :     bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
     223              : 
     224            0 :     val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
     225            0 :     vmasked = _mm512_and_si512(val, maskv);
     226            0 :     cnt = _mm512_popcnt_epi64(vmasked);
     227            0 :     accum = _mm512_add_epi64(accum, cnt);
     228              : 
     229            0 :     return _mm512_reduce_add_epi64(accum);
     230              : }
     231              : 
     232              : #endif                          /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
     233              : 
     234              : /*
     235              :  * pg_popcount64_sse42
     236              :  *      Return the number of 1 bits set in word
     237              :  */
     238              : static inline int
     239     94735379 : pg_popcount64_sse42(uint64 word)
     240              : {
     241              : #ifdef _MSC_VER
     242              :     return __popcnt64(word);
     243              : #else
     244              :     uint64      res;
     245              : 
     246     94735379 : __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
     247     94735379 :     return (int) res;
     248              : #endif
     249              : }
     250              : 
     251              : /*
     252              :  * pg_popcount_sse42
     253              :  *      Returns the number of 1-bits in buf
     254              :  */
     255              : pg_attribute_no_sanitize_alignment()
     256              : static uint64
     257           18 : pg_popcount_sse42(const char *buf, int bytes)
     258              : {
     259           18 :     uint64      popcnt = 0;
     260           18 :     const uint64 *words = (const uint64 *) buf;
     261              : 
     262       262267 :     while (bytes >= 8)
     263              :     {
     264       262249 :         popcnt += pg_popcount64_sse42(*words++);
     265       262249 :         bytes -= 8;
     266              :     }
     267              : 
     268           18 :     buf = (const char *) words;
     269              : 
     270              :     /* Process any remaining bytes */
     271           78 :     while (bytes--)
     272           60 :         popcnt += pg_number_of_ones[(unsigned char) *buf++];
     273              : 
     274           18 :     return popcnt;
     275              : }
     276              : 
     277              : /*
     278              :  * pg_popcount_masked_sse42
     279              :  *      Returns the number of 1-bits in buf after applying the mask to each byte
     280              :  */
     281              : pg_attribute_no_sanitize_alignment()
     282              : static uint64
     283        92530 : pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask)
     284              : {
     285        92530 :     uint64      popcnt = 0;
     286        92530 :     uint64      maskv = ~UINT64CONST(0) / 0xFF * mask;
     287        92530 :     const uint64 *words = (const uint64 *) buf;
     288              : 
     289     94565660 :     while (bytes >= 8)
     290              :     {
     291     94473130 :         popcnt += pg_popcount64_sse42(*words++ & maskv);
     292     94473130 :         bytes -= 8;
     293              :     }
     294              : 
     295        92530 :     buf = (const char *) words;
     296              : 
     297              :     /* Process any remaining bytes */
     298        92530 :     while (bytes--)
     299            0 :         popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
     300              : 
     301        92530 :     return popcnt;
     302              : }
     303              : 
     304              : #endif                          /* HAVE_X86_64_POPCNTQ */
        

Generated by: LCOV version 2.0-1